Ported back some AVX256 optimizations to SSE2/SSE4.1/AVX128.
authorErik Lindahl <erik@kth.se>
Mon, 26 Nov 2012 17:56:49 +0000 (18:56 +0100)
committerGerrit Code Review <gerrit@gerrit.gromacs.org>
Wed, 28 Nov 2012 21:30:50 +0000 (22:30 +0100)
We can gain a couple of percent in the group kernels for some cases
by only storing j-particle forces once in water kernels, and
I have updated the last few routines that used constructs that
some compliers considered fragile.

Change-Id: Ic69307c735c57dd92bf0b35e88550252447e4f7f

126 files changed:
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/kernelutil_x86_avx_128_fma_single.h
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecCSTab_VdwNone_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecCSTab_VdwNone_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecCoul_VdwLJ_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecCoul_VdwLJ_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecCoul_VdwNone_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecCoul_VdwNone_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEwSh_VdwNone_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEwSh_VdwNone_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEwSw_VdwNone_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEwSw_VdwNone_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEw_VdwCSTab_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEw_VdwCSTab_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEw_VdwLJ_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEw_VdwLJ_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEw_VdwNone_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEw_VdwNone_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecRFCut_VdwNone_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecRFCut_VdwNone_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecRF_VdwCSTab_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecRF_VdwCSTab_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecRF_VdwLJ_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecRF_VdwLJ_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecRF_VdwNone_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecRF_VdwNone_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_template_avx_128_fma_single.pre
src/gmxlib/nonbonded/nb_kernel_sse2_single/kernelutil_x86_sse2_single.h
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwNone_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwNone_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwLJ_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwLJ_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwNone_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwNone_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwNone_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwNone_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwNone_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwNone_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwCSTab_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwCSTab_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwLJ_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwLJ_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwNone_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwNone_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwNone_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwNone_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwCSTab_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwCSTab_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwLJ_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwLJ_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwNone_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwNone_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_template_sse2_single.pre
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/kernelutil_x86_sse4_1_single.h
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecCSTab_VdwNone_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecCSTab_VdwNone_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecCoul_VdwLJ_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecCoul_VdwLJ_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecCoul_VdwNone_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecCoul_VdwNone_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSh_VdwNone_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSh_VdwNone_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSw_VdwNone_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSw_VdwNone_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEw_VdwCSTab_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEw_VdwCSTab_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEw_VdwLJ_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEw_VdwLJ_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEw_VdwNone_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEw_VdwNone_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecRFCut_VdwNone_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecRFCut_VdwNone_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecRF_VdwCSTab_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecRF_VdwCSTab_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecRF_VdwLJ_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecRF_VdwLJ_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecRF_VdwNone_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecRF_VdwNone_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_template_sse4_1_single.pre

index 083a3f37e6cb99866deb3057263fb3888f15db7a..8fe321d85cb9078763daf769b05c822a24afc382 100644 (file)
@@ -105,7 +105,6 @@ gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
                              __m128 * gmx_restrict c6, __m128 * gmx_restrict c12)
 {
     __m128 t1,t2,t3,t4;
-    /* NOTE: This might be faster with _mm_loadu_ps() */
     t1   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p1);
     t2   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p2);
     t3   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p3);
index ab0d6af379f2be22b94a05aa5cb189b5f8c28e78..ef8ce2f9dbbf0cf04ac83ba90b3281c8453a1b27 100644 (file)
@@ -214,6 +214,10 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -290,14 +294,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -341,14 +340,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -392,14 +386,16 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 168 flops */
         }
@@ -459,6 +455,10 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -540,14 +540,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -595,14 +590,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -650,14 +640,16 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 171 flops */
         }
@@ -862,6 +854,10 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -927,14 +923,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -973,14 +964,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1019,14 +1005,16 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 148 flops */
         }
@@ -1086,6 +1074,10 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1154,14 +1146,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1203,14 +1190,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1252,14 +1234,16 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 151 flops */
         }
index 0d67362bfb330d32eac49fad081981244f87a877..c08561fb575ffce840dfcea17e71463fa80cadd4 100644 (file)
@@ -225,6 +225,10 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -287,14 +291,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -338,14 +337,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -389,14 +383,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -440,14 +429,16 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 197 flops */
         }
@@ -512,6 +503,10 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -578,14 +573,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -633,14 +623,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -688,14 +673,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -743,14 +723,16 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 201 flops */
         }
@@ -966,6 +948,10 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1020,14 +1006,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1066,14 +1047,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1112,14 +1088,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1158,14 +1129,16 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 177 flops */
         }
@@ -1230,6 +1203,10 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1287,14 +1264,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1336,14 +1308,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1385,14 +1352,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1434,14 +1396,16 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 181 flops */
         }
index a9061d8c0bab45517e6f71c3b895567922878a88..9d5348832a4274ec241e31a240ed0928ec169e4d 100644 (file)
@@ -216,6 +216,10 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -272,14 +276,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -323,14 +322,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -374,14 +368,16 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 151 flops */
         }
@@ -443,6 +439,10 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -504,14 +504,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -559,14 +554,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -614,14 +604,16 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 154 flops */
         }
@@ -828,6 +820,10 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -875,14 +871,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -921,14 +912,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -967,14 +953,16 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 134 flops */
         }
@@ -1036,6 +1024,10 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1086,14 +1078,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1135,14 +1122,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1184,14 +1166,16 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 137 flops */
         }
index a02d2794328a24d60aa576d4cf330eba8bec29ec..06797b62da776678684ef8437d6bdb1da48ee14d 100644 (file)
@@ -226,6 +226,10 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -255,14 +259,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -306,14 +305,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -357,14 +351,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -408,14 +397,16 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 173 flops */
         }
@@ -481,6 +472,10 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -513,14 +508,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -568,14 +558,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -623,14 +608,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -678,14 +658,16 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 176 flops */
         }
@@ -902,6 +884,10 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -925,14 +911,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -971,14 +952,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1017,14 +993,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1063,14 +1034,16 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 156 flops */
         }
@@ -1136,6 +1109,10 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1161,14 +1138,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1210,14 +1182,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1259,14 +1226,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1308,14 +1270,16 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 159 flops */
         }
index 10342092f9a76082052d261c1918a34d1856bb69..5fe6ef95d97ebd3911c6b7b2325bcd38de496d5d 100644 (file)
@@ -199,6 +199,10 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -241,14 +245,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -292,14 +291,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -343,14 +337,16 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 138 flops */
         }
@@ -406,6 +402,10 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -452,14 +452,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -507,14 +502,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -562,14 +552,16 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 141 flops */
         }
@@ -759,6 +751,10 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -796,14 +792,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -842,14 +833,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -888,14 +874,16 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 126 flops */
         }
@@ -951,6 +939,10 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -991,14 +983,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1040,14 +1027,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1089,14 +1071,16 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 129 flops */
         }
index 5880520b6eb7346b91a93507541886393ce3e8b4..96dc15a0c74b8e09a8d97fe8600521f60b0b5fc7 100644 (file)
@@ -199,6 +199,10 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -241,14 +245,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -292,14 +291,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -343,14 +337,16 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 138 flops */
         }
@@ -406,6 +402,10 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -452,14 +452,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -507,14 +502,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -562,14 +552,16 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 141 flops */
         }
@@ -759,6 +751,10 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -796,14 +792,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -842,14 +833,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -888,14 +874,16 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 126 flops */
         }
@@ -951,6 +939,10 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -991,14 +983,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1040,14 +1027,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1089,14 +1071,16 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 129 flops */
         }
index 19946d1d0992e2cbd4cee7bf39dff98c0df3489e..a5fc5393810f03a0966e449254ae1cf67af84594 100644 (file)
@@ -218,6 +218,10 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -285,14 +289,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -315,14 +314,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -345,14 +339,16 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 128 flops */
         }
@@ -416,6 +412,10 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -488,14 +488,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -521,14 +516,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -554,14 +544,16 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 129 flops */
         }
@@ -770,6 +762,10 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -828,14 +824,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -855,14 +846,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -882,14 +868,16 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 117 flops */
         }
@@ -953,6 +941,10 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1014,14 +1006,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1043,14 +1030,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1072,14 +1054,16 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 118 flops */
         }
index 5d62c2b395a5b385603c0473ca2b2e6918dc0e10..b36881288d3eeec3c0138536d5522636cfa1088e 100644 (file)
@@ -229,6 +229,10 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -290,14 +294,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -320,14 +319,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -350,14 +344,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -380,14 +369,16 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 152 flops */
         }
@@ -456,6 +447,10 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -521,14 +516,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -554,14 +544,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -587,14 +572,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -620,14 +600,16 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 153 flops */
         }
@@ -847,6 +829,10 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -900,14 +886,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -927,14 +908,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -954,14 +930,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -981,14 +952,16 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 141 flops */
         }
@@ -1057,6 +1030,10 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1113,14 +1090,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1142,14 +1114,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1171,14 +1138,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1200,14 +1162,16 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 142 flops */
         }
index ca90af63c8af683635c2cb83326827af75a40a21..17f8a6367e2bfc55f2f1e51461cccbef4777de27 100644 (file)
@@ -211,6 +211,10 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -246,14 +250,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -276,14 +275,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -306,14 +300,16 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 105 flops */
         }
@@ -377,6 +373,10 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -416,14 +416,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -449,14 +444,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -482,14 +472,16 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 105 flops */
         }
@@ -691,6 +683,10 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -719,14 +715,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -746,14 +737,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -773,14 +759,16 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 97 flops */
         }
@@ -844,6 +832,10 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -874,14 +866,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -903,14 +890,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -932,14 +914,16 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 97 flops */
         }
index 829e664ae3d703bfd953ca34a51e1cd8a5cf4d70..050a9e775ca23e9bd611c2dd1b3fd98c5ff293b5 100644 (file)
@@ -222,6 +222,10 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -251,14 +255,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -281,14 +280,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -311,14 +305,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -341,14 +330,16 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 128 flops */
         }
@@ -417,6 +408,10 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -449,14 +444,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -482,14 +472,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -515,14 +500,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -548,14 +528,16 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 128 flops */
         }
@@ -768,6 +750,10 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -791,14 +777,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -818,14 +799,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -845,14 +821,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -872,14 +843,16 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 120 flops */
         }
@@ -948,6 +921,10 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -973,14 +950,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1002,14 +974,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1031,14 +998,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1060,14 +1022,16 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 120 flops */
         }
index 0815eb49abe35b678a29aeafc701f562cc4f5df2..9752bf6f869cd0c7c401b43a347cb50f964af31f 100644 (file)
@@ -196,6 +196,10 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -217,14 +221,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -247,14 +246,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -277,14 +271,16 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 93 flops */
         }
@@ -344,6 +340,10 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -368,14 +368,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -401,14 +396,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -434,14 +424,16 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 93 flops */
         }
@@ -628,6 +620,10 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -646,14 +642,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -673,14 +664,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -700,14 +686,16 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 90 flops */
         }
@@ -767,6 +755,10 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -787,14 +779,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -816,14 +803,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -845,14 +827,16 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 90 flops */
         }
index 83e3ebde56d57549fa1790e471c77db4a9487c17..672473b0503f23872018c2c28dce66a237b13239 100644 (file)
@@ -196,6 +196,10 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -217,14 +221,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -247,14 +246,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -277,14 +271,16 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 93 flops */
         }
@@ -344,6 +340,10 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -368,14 +368,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -401,14 +396,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -434,14 +424,16 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 93 flops */
         }
@@ -628,6 +620,10 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -646,14 +642,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -673,14 +664,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -700,14 +686,16 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 90 flops */
         }
@@ -767,6 +755,10 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -787,14 +779,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -816,14 +803,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -845,14 +827,16 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 90 flops */
         }
index a093338afdd293fcf5b6bb43faca95d46630bf4b..72e6ddf8b20cf842f713bd10dd6eeeaa5afae67d 100644 (file)
@@ -231,6 +231,10 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -286,14 +290,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -336,14 +335,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -386,16 +380,18 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 117 flops */
         }
@@ -459,6 +455,10 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -519,14 +519,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -573,14 +568,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -627,16 +617,18 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 120 flops */
         }
@@ -858,6 +850,10 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -900,14 +896,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -943,14 +934,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -986,16 +972,18 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 100 flops */
         }
@@ -1059,6 +1047,10 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1104,14 +1096,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -1150,14 +1137,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -1196,16 +1178,18 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 103 flops */
         }
index 2d32c57688344385a7d0acf54c6135e4de41a581..8fc1f35da3967be2b52e9524b234ede9d8c212dd 100644 (file)
@@ -242,6 +242,10 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -280,14 +284,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -330,14 +329,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -380,14 +374,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             }
 
@@ -430,16 +419,18 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 143 flops */
         }
@@ -508,6 +499,10 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -549,14 +544,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -603,14 +593,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -657,14 +642,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             }
 
@@ -711,16 +691,18 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 146 flops */
         }
@@ -953,6 +935,10 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -983,14 +969,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -1026,14 +1007,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -1069,14 +1045,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             }
 
@@ -1112,16 +1083,18 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 126 flops */
         }
@@ -1190,6 +1163,10 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1222,14 +1199,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -1268,14 +1240,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -1314,14 +1281,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             }
 
@@ -1360,16 +1322,18 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 129 flops */
         }
index 755f8c8b7c4352d20af5833af67bdc30858770f4..adefdb716a8d25d993ca4588e4979e31f54459c4 100644 (file)
@@ -213,6 +213,10 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -252,14 +256,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -302,14 +301,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -352,16 +346,18 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 99 flops */
         }
@@ -421,6 +417,10 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -464,14 +464,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -518,14 +513,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -572,16 +562,18 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 102 flops */
         }
@@ -785,6 +777,10 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -817,14 +813,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -860,14 +851,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -903,16 +889,18 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 93 flops */
         }
@@ -972,6 +960,10 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1007,14 +999,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -1053,14 +1040,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -1099,16 +1081,18 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 96 flops */
         }
index 45126af5172a7356b75ac34fc6aa2f721ac33947..8fb3cc604841dc68225443ad67f4baccdba809b7 100644 (file)
@@ -213,6 +213,10 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -252,14 +256,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -302,14 +301,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             }
 
@@ -352,16 +346,18 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 99 flops */
         }
@@ -421,6 +417,10 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -464,14 +464,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -518,14 +513,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             }
 
@@ -572,16 +562,18 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 102 flops */
         }
@@ -785,6 +777,10 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -817,14 +813,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -860,14 +851,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             }
 
@@ -903,16 +889,18 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 93 flops */
         }
@@ -972,6 +960,10 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1007,14 +999,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -1053,14 +1040,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             }
 
@@ -1099,16 +1081,18 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 96 flops */
         }
index 3989bc64b6f4764daaf20eef49dc016ff5b15371..6b7240e93a46104495ed3dc8b76c6c29f3a0369a 100644 (file)
@@ -242,6 +242,10 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -309,14 +313,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -370,14 +369,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -431,16 +425,18 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 177 flops */
         }
@@ -504,6 +500,10 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -576,14 +576,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -641,14 +636,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -706,16 +696,18 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 180 flops */
         }
@@ -948,6 +940,10 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1007,14 +1003,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -1063,14 +1054,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -1119,16 +1105,18 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 165 flops */
         }
@@ -1192,6 +1180,10 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1254,14 +1246,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -1313,14 +1300,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -1372,16 +1354,18 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 168 flops */
         }
index 1a58d0fe825690c2f615f555d32266a36c6f82de..bb098c6bddd03f0efb021f228b5276a02553c8f4 100644 (file)
@@ -254,6 +254,10 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -304,14 +308,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -365,14 +364,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -426,14 +420,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             }
 
@@ -487,16 +476,18 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 221 flops */
         }
@@ -566,6 +557,10 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -620,14 +615,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -685,14 +675,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -750,14 +735,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             }
 
@@ -815,16 +795,18 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 225 flops */
         }
@@ -1069,6 +1051,10 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1114,14 +1100,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -1170,14 +1151,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -1226,14 +1202,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             }
 
@@ -1282,16 +1253,18 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 209 flops */
         }
@@ -1361,6 +1334,10 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1409,14 +1386,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -1468,14 +1440,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -1527,14 +1494,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             }
 
@@ -1586,16 +1548,18 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 213 flops */
         }
index 6f8f2605d4fe1883274b67e128effd7e9d8ba974..ce9ce9bdc6bd5bad28b0bb60f9f44460dc044fd4 100644 (file)
@@ -227,6 +227,10 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -277,14 +281,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -338,14 +337,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -399,16 +393,18 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 159 flops */
         }
@@ -468,6 +464,10 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -522,14 +522,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -587,14 +582,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -652,16 +642,18 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 162 flops */
         }
@@ -879,6 +871,10 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -924,14 +920,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -980,14 +971,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -1036,16 +1022,18 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 150 flops */
         }
@@ -1105,6 +1093,10 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1153,14 +1145,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -1212,14 +1199,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -1271,16 +1253,18 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 153 flops */
         }
index a5ab176464adc91fb35d7fc8f2482c5f873952cd..2f319cb1356579adcca17cf91646ccfee4325fdd 100644 (file)
@@ -227,6 +227,10 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -277,14 +281,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -338,14 +337,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             }
 
@@ -399,16 +393,18 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 159 flops */
         }
@@ -468,6 +464,10 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -522,14 +522,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -587,14 +582,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             }
 
@@ -652,16 +642,18 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 162 flops */
         }
@@ -879,6 +871,10 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -924,14 +920,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -980,14 +971,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             }
 
@@ -1036,16 +1022,18 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 150 flops */
         }
@@ -1105,6 +1093,10 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1153,14 +1145,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -1212,14 +1199,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             }
 
@@ -1271,16 +1253,18 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 153 flops */
         }
index 0de54cb05fc28d65e87dd785a862b28545686848..b4e1e065ff02194a02a79b85cb2908f651acb2ba 100644 (file)
@@ -230,6 +230,10 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -305,14 +309,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -345,14 +344,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -385,14 +379,16 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 121 flops */
         }
@@ -456,6 +452,10 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -536,14 +536,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -580,14 +575,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -624,14 +614,16 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 124 flops */
         }
@@ -852,6 +844,10 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -915,14 +911,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -949,14 +940,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -983,14 +969,16 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 110 flops */
         }
@@ -1054,6 +1042,10 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1120,14 +1112,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1157,14 +1144,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1194,14 +1176,16 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 113 flops */
         }
index e05b19a3c9a02197fbfe5356675c0e4957106453..583f1298bd85c5f9ecbebc4487dfcfb7d476edda 100644 (file)
@@ -241,6 +241,10 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -302,14 +306,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -342,14 +341,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -382,14 +376,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -422,14 +411,16 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 146 flops */
         }
@@ -498,6 +489,10 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -563,14 +558,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -607,14 +597,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -651,14 +636,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -695,14 +675,16 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 150 flops */
         }
@@ -934,6 +916,10 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -987,14 +973,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1021,14 +1002,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1055,14 +1031,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1089,14 +1060,16 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 135 flops */
         }
@@ -1165,6 +1138,10 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1221,14 +1198,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1258,14 +1230,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1295,14 +1262,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1332,14 +1294,16 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 139 flops */
         }
index 9fc4cc3609b15b9e302007ccd2590b8a0b623c49..9fbcad1c3c916a064dbe2e4cdb6b82700ff59831 100644 (file)
@@ -223,6 +223,10 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -268,14 +272,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -308,14 +307,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -348,14 +342,16 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 99 flops */
         }
@@ -419,6 +415,10 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -469,14 +469,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -513,14 +508,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -557,14 +547,16 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 102 flops */
         }
@@ -778,6 +770,10 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -813,14 +809,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -847,14 +838,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -881,14 +867,16 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 91 flops */
         }
@@ -952,6 +940,10 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -990,14 +982,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1027,14 +1014,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1064,14 +1046,16 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 94 flops */
         }
index 8889bf18101798c90c1cabe49fd494889436c31e..fad2c651810ef1a40e9a5361f6d19676e887c7c0 100644 (file)
@@ -234,6 +234,10 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -263,14 +267,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -303,14 +302,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -343,14 +337,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -383,14 +372,16 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 122 flops */
         }
@@ -459,6 +450,10 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -491,14 +486,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -535,14 +525,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -579,14 +564,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -623,14 +603,16 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 125 flops */
         }
@@ -855,6 +837,10 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -878,14 +864,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -912,14 +893,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -946,14 +922,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -980,14 +951,16 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 114 flops */
         }
@@ -1056,6 +1029,10 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1081,14 +1058,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1118,14 +1090,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1155,14 +1122,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1192,14 +1154,16 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 117 flops */
         }
index b78ced94dffe1739acb8f3e4b2e294ae3bd8d448..263b26b7909eb22dfbf7d3c4b0631e5c23162261 100644 (file)
@@ -208,6 +208,10 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -239,14 +243,9 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -279,14 +278,9 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -319,14 +313,16 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 87 flops */
         }
@@ -386,6 +382,10 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -421,14 +421,9 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -465,14 +460,9 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -509,14 +499,16 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 90 flops */
         }
@@ -715,6 +707,10 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_F_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -740,14 +736,9 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -774,14 +765,9 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -808,14 +794,16 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 84 flops */
         }
@@ -875,6 +863,10 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_F_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -903,14 +895,9 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -940,14 +927,9 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -977,14 +959,16 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 87 flops */
         }
index 500b5deab4e98f4afe086b0f255981f891c8a4a0..20e28a476380162260df5b9153d3fb394b8aaee4 100644 (file)
@@ -208,6 +208,10 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -239,14 +243,9 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -279,14 +278,9 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -319,14 +313,16 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 87 flops */
         }
@@ -386,6 +382,10 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -421,14 +421,9 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -465,14 +460,9 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -509,14 +499,16 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 90 flops */
         }
@@ -715,6 +707,10 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_F_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -740,14 +736,9 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -774,14 +765,9 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -808,14 +794,16 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 84 flops */
         }
@@ -875,6 +863,10 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_F_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -903,14 +895,9 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -940,14 +927,9 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -977,14 +959,16 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 87 flops */
         }
index 8859134f1dcb06e0e53afee83b8c808c05c2711e..da83d09f844faab246cd88d5a7f75e87a8bc9151 100644 (file)
@@ -226,6 +226,10 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -302,14 +306,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -342,14 +341,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -382,16 +376,18 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 153 flops */
         }
@@ -455,6 +451,10 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -536,14 +536,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -579,14 +574,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -622,16 +612,18 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 154 flops */
         }
@@ -848,6 +840,10 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -912,14 +908,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -947,14 +938,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -982,16 +968,18 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 126 flops */
         }
@@ -1055,6 +1043,10 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1122,14 +1114,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -1159,14 +1146,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -1196,16 +1178,18 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 127 flops */
         }
index 5efa7cb38730f4a2d020ece0f5784ecaacc52655..7efd9dc900c536c2534017a288f88dac356c3b9a 100644 (file)
@@ -237,6 +237,10 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -298,14 +302,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -336,14 +335,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -376,14 +370,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             }
 
@@ -416,16 +405,18 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 176 flops */
         }
@@ -494,6 +485,10 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -559,14 +554,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -600,14 +590,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -643,14 +628,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             }
 
@@ -686,16 +666,18 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 177 flops */
         }
@@ -923,6 +905,10 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -976,14 +962,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1009,14 +990,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -1044,14 +1020,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             }
 
@@ -1079,16 +1050,18 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 150 flops */
         }
@@ -1157,6 +1130,10 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1213,14 +1190,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1248,14 +1220,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -1285,14 +1252,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             }
 
@@ -1322,16 +1284,18 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 151 flops */
         }
index 4104954838dc7a14cb9ee964c2dd986eba9f4ece..9bfa0bbfcecdc608d3558ffa31fe9e402afb9914 100644 (file)
@@ -222,6 +222,10 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -267,14 +271,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -307,14 +306,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -347,16 +341,18 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 135 flops */
         }
@@ -420,6 +416,10 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -469,14 +469,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -512,14 +507,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -555,16 +545,18 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 135 flops */
         }
@@ -777,6 +769,10 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -811,14 +807,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -846,14 +837,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -881,16 +867,18 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 106 flops */
         }
@@ -954,6 +942,10 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -990,14 +982,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -1027,14 +1014,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -1064,16 +1046,18 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 106 flops */
         }
index c36c38ae22ef66cfcd8966da76cd3c6a4d6151bd..a72c812afab2f70254bacd5eba700bc1df0e2cf6 100644 (file)
@@ -233,6 +233,10 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -271,14 +275,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -311,14 +310,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -351,14 +345,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             }
 
@@ -391,16 +380,18 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 161 flops */
         }
@@ -469,6 +460,10 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -510,14 +505,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -553,14 +543,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -596,14 +581,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             }
 
@@ -639,16 +619,18 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 161 flops */
         }
@@ -872,6 +854,10 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -902,14 +888,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -937,14 +918,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -972,14 +948,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             }
 
@@ -1007,16 +978,18 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 132 flops */
         }
@@ -1085,6 +1058,10 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1117,14 +1094,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -1154,14 +1126,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -1191,14 +1158,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             }
 
@@ -1228,16 +1190,18 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 132 flops */
         }
index 4b7df7bf1f1940cabcdd412bc1fb8a3a345e1456..2b821a2d1bef57e08c144fb0c564c0a1d7cac3a1 100644 (file)
@@ -233,6 +233,10 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -290,14 +294,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -330,14 +329,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -370,16 +364,18 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 151 flops */
         }
@@ -443,6 +439,10 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -505,14 +505,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -548,14 +543,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -591,16 +581,18 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 152 flops */
         }
@@ -824,6 +816,10 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -873,14 +869,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -908,14 +899,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -943,16 +929,18 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 130 flops */
         }
@@ -1016,6 +1004,10 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1068,14 +1060,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -1105,14 +1092,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -1142,16 +1124,18 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 131 flops */
         }
index 696411fbc0b198dd922d238ae9a16f36cee84946..086836a33d0edc91e37d6733e6c8b47466ea50a9 100644 (file)
@@ -245,6 +245,10 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -295,14 +299,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -335,14 +334,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -375,14 +369,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             }
 
@@ -415,16 +404,18 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 179 flops */
         }
@@ -494,6 +485,10 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -548,14 +543,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -591,14 +581,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -634,14 +619,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             }
 
@@ -677,16 +657,18 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 180 flops */
         }
@@ -922,6 +904,10 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -967,14 +953,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -1002,14 +983,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -1037,14 +1013,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             }
 
@@ -1072,16 +1043,18 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 158 flops */
         }
@@ -1151,6 +1124,10 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1199,14 +1176,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -1236,14 +1208,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -1273,14 +1240,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             }
 
@@ -1310,16 +1272,18 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 159 flops */
         }
index 2f93158f4c0b7f27f94e8174485749d733c96ef6..f24ce00422753df4f9f97691690e38683c623201 100644 (file)
@@ -204,6 +204,10 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -233,14 +237,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -273,14 +272,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -313,16 +307,18 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 117 flops */
         }
@@ -382,6 +378,10 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -414,14 +414,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -457,14 +452,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -500,16 +490,18 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 117 flops */
         }
@@ -704,6 +696,10 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -728,14 +724,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -763,14 +754,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -798,16 +784,18 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 99 flops */
         }
@@ -867,6 +855,10 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -893,14 +885,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             }
 
@@ -930,14 +917,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -967,16 +949,18 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 99 flops */
         }
index 486c9b41663a8b94059fa1dc0945c2ba0595898a..784ebcf8223d2ed35b8c35a79d76a942ee34eb9c 100644 (file)
@@ -204,6 +204,10 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -233,14 +237,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -273,14 +272,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             }
 
@@ -313,16 +307,18 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 117 flops */
         }
@@ -382,6 +378,10 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -414,14 +414,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -457,14 +452,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             }
 
@@ -500,16 +490,18 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 117 flops */
         }
@@ -704,6 +696,10 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -728,14 +724,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -763,14 +754,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             }
 
@@ -798,16 +784,18 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 99 flops */
         }
@@ -867,6 +855,10 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -893,14 +885,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             }
 
@@ -930,14 +917,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             }
 
@@ -967,16 +949,18 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
 
-            }
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 99 flops */
         }
index 77423a5762b9379a9dcbef8461925b758ea34c47..61285be3d56c821062848f27738d3b6ca913f4ef 100644 (file)
@@ -221,6 +221,10 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -288,14 +292,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -318,14 +317,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -348,14 +342,16 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 140 flops */
         }
@@ -419,6 +415,10 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -491,14 +491,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -524,14 +519,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -557,14 +547,16 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 141 flops */
         }
@@ -776,6 +768,10 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -833,14 +829,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -859,14 +850,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -885,14 +871,16 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 117 flops */
         }
@@ -956,6 +944,10 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1016,14 +1008,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1044,14 +1031,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1072,14 +1054,16 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 118 flops */
         }
index e0dea49a293a84b197517dac2d2ceed339a0e7e2..faa8834790157660569403bbf24eb8e255134272 100644 (file)
@@ -232,6 +232,10 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -293,14 +297,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -323,14 +322,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -353,14 +347,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -383,14 +372,16 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 164 flops */
         }
@@ -459,6 +450,10 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -524,14 +519,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -557,14 +547,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -590,14 +575,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -623,14 +603,16 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 165 flops */
         }
@@ -853,6 +835,10 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -906,14 +892,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -932,14 +913,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -958,14 +934,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -984,14 +955,16 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 141 flops */
         }
@@ -1060,6 +1033,10 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1116,14 +1093,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1144,14 +1116,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1172,14 +1139,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1200,14 +1162,16 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 142 flops */
         }
index b030474b175bca49f7eab44ba03bb579f6b313e4..524d12845a016d2b2926c51aa7776a7972ab564f 100644 (file)
@@ -214,6 +214,10 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -249,14 +253,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -279,14 +278,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -309,14 +303,16 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 117 flops */
         }
@@ -380,6 +376,10 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -419,14 +419,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -452,14 +447,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -485,14 +475,16 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 117 flops */
         }
@@ -697,6 +689,10 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -724,14 +720,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -750,14 +741,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -776,14 +762,16 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 97 flops */
         }
@@ -847,6 +835,10 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -876,14 +868,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -904,14 +891,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -932,14 +914,16 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 97 flops */
         }
index 5f68a17ab179961dbee9ee8f5c85d2f9675092f2..3eb635aa9a3ff2fb2894b7969c70f23fafaefff0 100644 (file)
@@ -225,6 +225,10 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -254,14 +258,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -284,14 +283,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -314,14 +308,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -344,14 +333,16 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 140 flops */
         }
@@ -420,6 +411,10 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -452,14 +447,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -485,14 +475,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -518,14 +503,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -551,14 +531,16 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 140 flops */
         }
@@ -774,6 +756,10 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -797,14 +783,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -823,14 +804,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -849,14 +825,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -875,14 +846,16 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 120 flops */
         }
@@ -951,6 +924,10 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_avx_128_fma_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -976,14 +953,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1004,14 +976,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1032,14 +999,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1060,14 +1022,16 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 120 flops */
         }
index 79b4e6f7ac7f72e5e9c259c389c2c0e693955440..2add1292f2a19bc03043e9a96fff8736e176945a 100644 (file)
@@ -199,6 +199,10 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -220,14 +224,9 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -250,14 +249,9 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -280,14 +274,16 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 105 flops */
         }
@@ -347,6 +343,10 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -371,14 +371,9 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -404,14 +399,9 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -437,14 +427,16 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 105 flops */
         }
@@ -634,6 +626,10 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_F_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -651,14 +647,9 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -677,14 +668,9 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -703,14 +689,16 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 90 flops */
         }
@@ -770,6 +758,10 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_F_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -789,14 +781,9 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx00,fscal),
-                                                   _mm_mul_ps(dy00,fscal),
-                                                   _mm_mul_ps(dz00,fscal));
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -817,14 +804,9 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -845,14 +827,16 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 90 flops */
         }
index 82e41f4809388b3970e98b1d368c71e2ba249f8b..74874ec66dc5c637ffa13c03ed6ba982f0bf4ff4 100644 (file)
@@ -199,6 +199,10 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -220,14 +224,9 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -250,14 +249,9 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -280,14 +274,16 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 105 flops */
         }
@@ -347,6 +343,10 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -371,14 +371,9 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -404,14 +399,9 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -437,14 +427,16 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 105 flops */
         }
@@ -634,6 +626,10 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_F_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -651,14 +647,9 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -677,14 +668,9 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -703,14 +689,16 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 90 flops */
         }
@@ -770,6 +758,10 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_F_avx_128_fma_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -789,14 +781,9 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx10,fscal),
-                                                   _mm_mul_ps(dy10,fscal),
-                                                   _mm_mul_ps(dz10,fscal));
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -817,14 +804,9 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx20,fscal),
-                                                   _mm_mul_ps(dy20,fscal),
-                                                   _mm_mul_ps(dz20,fscal));
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -845,14 +827,16 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_F_avx_128_fma_single
             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
 
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
-                                                   _mm_mul_ps(dx30,fscal),
-                                                   _mm_mul_ps(dy30,fscal),
-                                                   _mm_mul_ps(dz30,fscal));
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 90 flops */
         }
index 8a98b302000ca5aef5c1daeb784d9d907d48f555..24d7dcc3d5ec046917bd2bc2dbc049f6ce3fbd38 100644 (file)
@@ -463,7 +463,7 @@ void
             /*     #endfor */
             /* #endif */
 
-            /* #if 'Force' in KERNEL_VF and not 'Particle' in GEOMETRY_J */
+            /* #if 'Force' in KERNEL_VF and not 'Particle' in GEOMETRY_I */
             /*     #for J in PARTICLES_J */
             fjx{J}             = _mm_setzero_ps();
             fjy{J}             = _mm_setzero_ps();
@@ -861,7 +861,7 @@ void
             fiz{I}             = _mm_macc_ps(dz{I}{J},fscal,fiz{I});
             /*             #define INNERFLOPS INNERFLOPS+6 */
 
-            /* #if GEOMETRY_J == 'Particle'             */
+            /* #if GEOMETRY_I == 'Particle'             */
             /*     #if ROUND == 'Loop' */
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
@@ -899,7 +899,7 @@ void
             /* #endfor */
             /* ## End of loop over i-j interaction pairs */
 
-            /* #if GEOMETRY_J != 'Particle' */
+            /* #if GEOMETRY_I != 'Particle' */
             /*     #if ROUND == 'Loop' */
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
@@ -913,7 +913,9 @@ void
             /*     #endif */
             /* #endif */
 
-            /* #if GEOMETRY_J == 'Water3'               */
+            /* #if 'Water' in GEOMETRY_I and GEOMETRY_J == 'Particle' */
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+            /* #elif GEOMETRY_J == 'Water3'               */
             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
             /*     #define INNERFLOPS INNERFLOPS+9      */
index 656538c11339ac641b438eed6c3b01194f3aad19..e0b324f582de5960fbdd5e4d624954808e9edde2 100644 (file)
@@ -109,10 +109,10 @@ gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
 {
     __m128 t1,t2,t3,t4;
 
-    t1   = _mm_castpd_ps(_mm_load_sd((const double *)p1));
-    t2   = _mm_castpd_ps(_mm_load_sd((const double *)p2));
-    t3   = _mm_castpd_ps(_mm_load_sd((const double *)p3));
-    t4   = _mm_castpd_ps(_mm_load_sd((const double *)p4));
+    t1   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p1);   /* - - c12a  c6a */
+    t2   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p2);   /* - - c12b  c6b */
+    t3   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p3);   /* - - c12c  c6c */
+    t4   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p4);   /* - - c12d  c6d */
     t1   = _mm_unpacklo_ps(t1,t2);
     t2   = _mm_unpacklo_ps(t3,t4);
     *c6  = _mm_movelh_ps(t1,t2);
@@ -135,8 +135,8 @@ gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
 {
     __m128 t1,t2,t3,t4;
     
-    t1   = _mm_castpd_ps(_mm_load_sd((const double *)xyz_shift));
-    t2   = _mm_castpd_ps(_mm_load_sd((const double *)xyz));
+    t1   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
+    t2   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz);
     t3   = _mm_load_ss(xyz_shift+2);
     t4   = _mm_load_ss(xyz+2);
     t1   = _mm_add_ps(t1,t2);
@@ -158,7 +158,7 @@ gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
     __m128 tA,tB;
     __m128 t1,t2,t3,t4,t5,t6;
     
-    tA   = _mm_castpd_ps(_mm_load_sd((const double *)xyz_shift));
+    tA   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
     tB   = _mm_load_ss(xyz_shift+2);
     
     t1   = _mm_loadu_ps(xyz);
@@ -543,44 +543,6 @@ gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
     _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
 }
 
-static gmx_inline void
-gmx_mm_update_iforce_2atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
-                                      __m128 fix2, __m128 fiy2, __m128 fiz2,
-                                      float * gmx_restrict fptr,
-                                      float * gmx_restrict fshiftptr)
-{
-    __m128 t1,t2,t4;
-
-    /* transpose data */
-    _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);
-    t1 = _mm_unpacklo_ps(fiy2,fiz2);
-    t2 = _mm_unpackhi_ps(fiy2,fiz2);
-
-    fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));
-    t1   = _mm_add_ps(t1,t2);
-    t2   = _mm_movehl_ps(t2,t1);
-    fiy2 = _mm_add_ps(t1,t2);
-
-    _mm_storeu_ps(fptr,   _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
-    t1 = _mm_loadl_pi(t1,(__m64 *)(fptr+4));
-    _mm_storel_pi((__m64 *)(fptr+4), _mm_add_ps(fiy2,t1));
-
-    t4 = _mm_load_ss(fshiftptr+2);
-    t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));
-
-    t1 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,2));   /* fiy2  -   fix2 fiz1 */
-    t1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,1,0,0));       /* fiy2 fix2  -   fiz1 */
-    t2 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(1,0,0,1));   /* fiy1 fix1  -   fiz2 */
-
-    t1 = _mm_add_ps(t1,t2);
-    t1 = _mm_add_ps(t1,t4); /* y x - z */
-
-    _mm_store_ss(fshiftptr+2,t1);
-    _mm_storeh_pi((__m64 *)(fshiftptr),t1);
-}
-
-
-
 static gmx_inline void
 gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                       __m128 fix2, __m128 fiy2, __m128 fiz2,
index 142b7d33badef5db417faba82ae13cb1d867ce3b..778ea8f19916c1d41ee81681086dac2e0314ee41 100644 (file)
@@ -214,6 +214,10 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -293,11 +297,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -342,11 +344,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -391,12 +391,17 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 159 flops */
         }
 
@@ -455,6 +460,10 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -539,11 +548,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -592,11 +599,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -645,12 +650,17 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 162 flops */
         }
 
@@ -854,6 +864,10 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -922,11 +936,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -966,11 +978,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1010,12 +1020,17 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 139 flops */
         }
 
@@ -1074,6 +1089,10 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1145,11 +1164,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1192,11 +1209,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1239,12 +1254,17 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 142 flops */
         }
 
index 1dc5dda844a764fcaf1275d9381b8c5395956723..69011075120a69159704bac179d762bda55861db 100644 (file)
@@ -225,6 +225,10 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -289,11 +293,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -338,11 +340,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -387,11 +387,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -436,12 +434,17 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 185 flops */
         }
 
@@ -505,6 +508,10 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -573,11 +580,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -626,11 +631,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -679,11 +682,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -732,12 +733,17 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 189 flops */
         }
 
@@ -952,6 +958,10 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1008,11 +1018,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1052,11 +1060,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1096,11 +1102,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1140,12 +1144,17 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 165 flops */
         }
 
@@ -1209,6 +1218,10 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1268,11 +1281,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1315,11 +1326,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1362,11 +1371,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1409,12 +1416,17 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 169 flops */
         }
 
index a898fab9e199cba7ecc0c6bc8060857fbfb37f92..33ead0527907b752e9c5de783dc2f45c381d21e7 100644 (file)
@@ -216,6 +216,10 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -273,11 +277,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -322,11 +324,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -371,12 +371,17 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 142 flops */
         }
 
@@ -437,6 +442,10 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -499,11 +508,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -552,11 +559,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -605,12 +610,17 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 145 flops */
         }
 
@@ -816,6 +826,10 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -864,11 +878,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -908,11 +920,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -952,12 +962,17 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 125 flops */
         }
 
@@ -1018,6 +1033,10 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1069,11 +1088,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1116,11 +1133,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1163,12 +1178,17 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 128 flops */
         }
 
index faaa7234c1394742ba4d23e65cb12c40d3c816ac..4b688b915c4ac9040c7bf8ae9159bc3d0f06827a 100644 (file)
@@ -226,6 +226,10 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -260,11 +264,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -309,11 +311,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -358,11 +358,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -407,12 +405,17 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 161 flops */
         }
 
@@ -477,6 +480,10 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -514,11 +521,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -567,11 +572,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -620,11 +623,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -673,12 +674,17 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 164 flops */
         }
 
@@ -894,6 +900,10 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -922,11 +932,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -966,11 +974,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1010,11 +1016,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1054,12 +1058,17 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 144 flops */
         }
 
@@ -1124,6 +1133,10 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1154,11 +1167,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1201,11 +1212,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1248,11 +1257,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1295,12 +1302,17 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 147 flops */
         }
 
index 0d9ee8ef87e6f00bbaa60315bdb27a352642a52b..541592778a69431e65282973a44dc9c6c0f2f21f 100644 (file)
@@ -199,6 +199,10 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -242,11 +246,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -291,11 +293,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -340,12 +340,17 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 129 flops */
         }
 
@@ -400,6 +405,10 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -447,11 +456,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -500,11 +507,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -553,12 +558,17 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 132 flops */
         }
 
@@ -747,6 +757,10 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -785,11 +799,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -829,11 +841,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -873,12 +883,17 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 117 flops */
         }
 
@@ -933,6 +948,10 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -974,11 +993,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1021,11 +1038,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1068,12 +1083,17 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 120 flops */
         }
 
index fcc0f20db0f53e32e9ad5edbf3d10959c758f347..01d4d6a004ef09f8e2de16e8dd25a46e45695b5f 100644 (file)
@@ -199,6 +199,10 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -242,11 +246,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -291,11 +293,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -340,12 +340,17 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 129 flops */
         }
 
@@ -400,6 +405,10 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -447,11 +456,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -500,11 +507,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -553,12 +558,17 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 132 flops */
         }
 
@@ -747,6 +757,10 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -785,11 +799,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -829,11 +841,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -873,12 +883,17 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 117 flops */
         }
 
@@ -933,6 +948,10 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -974,11 +993,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1021,11 +1038,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1068,12 +1083,17 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 120 flops */
         }
 
index 0bd290a7d570eb09b3394e83d64c202f43b42690..9c98ce728ecf8b04e87700dc3d9df98f413e4fdf 100644 (file)
@@ -218,6 +218,10 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -287,11 +291,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -319,11 +321,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -351,12 +351,17 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 119 flops */
         }
 
@@ -419,6 +424,10 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -493,11 +502,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -528,11 +535,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -563,12 +568,17 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 120 flops */
         }
 
@@ -776,6 +786,10 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -836,11 +850,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -865,11 +877,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -894,12 +904,17 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 108 flops */
         }
 
@@ -962,6 +977,10 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1025,11 +1044,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1056,11 +1073,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1087,12 +1102,17 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 109 flops */
         }
 
index e3c2f0180a9ed0959953c424049501fc1a9a5665..4977b64aecbc8572b1def11c52c3bcc2cbce58e2 100644 (file)
@@ -229,6 +229,10 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -292,11 +296,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -324,11 +326,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -356,11 +356,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -388,12 +386,17 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 140 flops */
         }
 
@@ -461,6 +464,10 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -528,11 +535,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -563,11 +568,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -598,11 +601,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -633,12 +634,17 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 141 flops */
         }
 
@@ -857,6 +863,10 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -912,11 +922,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -941,11 +949,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -970,11 +976,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -999,12 +1003,17 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 129 flops */
         }
 
@@ -1072,6 +1081,10 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1130,11 +1143,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1161,11 +1172,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1192,11 +1201,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1223,12 +1230,17 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 130 flops */
         }
 
index 45d6e375243cddd85203616507f1084ea8a8d7bc..c8d647153c0c79241443e8bf09c42b6f0645b3c4 100644 (file)
@@ -211,6 +211,10 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -251,11 +255,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -283,11 +285,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -315,12 +315,17 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 96 flops */
         }
 
@@ -383,6 +388,10 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -427,11 +436,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -462,11 +469,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -497,12 +502,17 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 96 flops */
         }
 
@@ -703,6 +713,10 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -736,11 +750,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -765,11 +777,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -794,12 +804,17 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 88 flops */
         }
 
@@ -862,6 +877,10 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -897,11 +916,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -928,11 +945,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -959,12 +974,17 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 88 flops */
         }
 
index 97fd246981478ed66bcb77d23978e8ebeb2a2786..5eddc72b284fbc5d3f2aaba7e507ca8a45d28bc4 100644 (file)
@@ -222,6 +222,10 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -256,11 +260,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -288,11 +290,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -320,11 +320,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -352,12 +350,17 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 116 flops */
         }
 
@@ -425,6 +428,10 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -462,11 +469,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -497,11 +502,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -532,11 +535,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -567,12 +568,17 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 116 flops */
         }
 
@@ -784,6 +790,10 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -812,11 +822,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -841,11 +849,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -870,11 +876,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -899,12 +903,17 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 108 flops */
         }
 
@@ -972,6 +981,10 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1002,11 +1015,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1033,11 +1044,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1064,11 +1073,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1095,12 +1102,17 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 108 flops */
         }
 
index 47108873330278f64d28918f8538d8b00ec80b09..d1499f64cdf31b200e4da8b4667b7cd9257e4b38 100644 (file)
@@ -196,6 +196,10 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -222,11 +226,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -254,11 +256,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -286,12 +286,17 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 84 flops */
         }
 
@@ -350,6 +355,10 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -379,11 +388,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -414,11 +421,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -449,12 +454,17 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 84 flops */
         }
 
@@ -640,6 +650,10 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -663,11 +677,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -692,11 +704,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -721,12 +731,17 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 81 flops */
         }
 
@@ -785,6 +800,10 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -810,11 +829,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -841,11 +858,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -872,12 +887,17 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 81 flops */
         }
 
index 19908b8e34375d840c0a77c9a68193158880c357..aec1f421a293f11e08671159a26689a73270d0a3 100644 (file)
@@ -196,6 +196,10 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -222,11 +226,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -254,11 +256,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -286,12 +286,17 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 84 flops */
         }
 
@@ -350,6 +355,10 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -379,11 +388,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -414,11 +421,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -449,12 +454,17 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 84 flops */
         }
 
@@ -640,6 +650,10 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -663,11 +677,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -692,11 +704,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -721,12 +731,17 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 81 flops */
         }
 
@@ -785,6 +800,10 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -810,11 +829,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -841,11 +858,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -872,12 +887,17 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 81 flops */
         }
 
index 2bc1eb9e5e8ede21a630b4d73e7c0a52200f28e2..19ba8e55ceecab0766558286d34e9a885494daef 100644 (file)
@@ -227,6 +227,10 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -292,11 +296,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -349,11 +351,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -406,13 +406,18 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 156 flops */
         }
@@ -476,6 +481,10 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -546,11 +555,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -607,11 +614,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -668,13 +673,18 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 159 flops */
         }
@@ -892,6 +902,10 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -942,11 +956,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -990,11 +1002,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1038,13 +1048,18 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 124 flops */
         }
@@ -1108,6 +1123,10 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1161,11 +1180,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1212,11 +1229,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1263,13 +1278,18 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 127 flops */
         }
index 55c1352cfb87e0d8766ab985c24212fc911ea39c..fe49ea2b61de264edbafc1212ec00510db30c502 100644 (file)
@@ -238,6 +238,10 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -281,11 +285,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -338,11 +340,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -395,11 +395,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -452,13 +450,18 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 179 flops */
         }
@@ -527,6 +530,10 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -573,11 +580,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -634,11 +639,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -695,11 +698,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -756,13 +757,18 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 182 flops */
         }
@@ -991,6 +997,10 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1026,11 +1036,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1074,11 +1082,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1122,11 +1128,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1170,13 +1174,18 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 147 flops */
         }
@@ -1245,6 +1254,10 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1282,11 +1295,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1333,11 +1344,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1384,11 +1393,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1435,13 +1442,18 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 150 flops */
         }
index c0376e264c90fde25647066279b2159f29208c9b..2d12809a0e7add9cf60937dd413ef264e84d2e8f 100644 (file)
@@ -209,6 +209,10 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -258,11 +262,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -315,11 +317,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -372,13 +372,18 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 138 flops */
         }
@@ -438,6 +443,10 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -491,11 +500,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -552,11 +559,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -613,13 +618,18 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 141 flops */
         }
@@ -819,6 +829,10 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -859,11 +873,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -907,11 +919,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -955,13 +965,18 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 117 flops */
         }
@@ -1021,6 +1036,10 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1064,11 +1083,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1115,11 +1132,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1166,13 +1181,18 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 120 flops */
         }
index 9f44e07cb35ad421b02fc7a4628467f04a938be4..d4850b9c641b4568343737b083e0e8d85789afab 100644 (file)
@@ -209,6 +209,10 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -258,11 +262,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -315,11 +317,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -372,13 +372,18 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 138 flops */
         }
@@ -438,6 +443,10 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -491,11 +500,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -552,11 +559,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -613,13 +618,18 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 141 flops */
         }
@@ -819,6 +829,10 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -859,11 +873,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -907,11 +919,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -955,13 +965,18 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 117 flops */
         }
@@ -1021,6 +1036,10 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1064,11 +1083,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1115,11 +1132,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1166,13 +1181,18 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 120 flops */
         }
index be212cfcb84e5c7516cbde52b264fda34eea65b3..124984f5b70e288998e079cdbc1da985b2268a0c 100644 (file)
@@ -238,6 +238,10 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -315,11 +319,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -383,11 +385,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -451,13 +451,18 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 213 flops */
         }
@@ -521,6 +526,10 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -603,11 +612,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -675,11 +682,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -747,13 +752,18 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 216 flops */
         }
@@ -982,6 +992,10 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1051,11 +1065,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1114,11 +1126,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1177,13 +1187,18 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 201 flops */
         }
@@ -1247,6 +1262,10 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1319,11 +1338,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1385,11 +1402,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1451,13 +1466,18 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 204 flops */
         }
index a292702a6758eb79794a6e6b97da75dfb4a5860e..ac4056d61aa211bf74584dc5a9d258ab86161c35 100644 (file)
@@ -250,6 +250,10 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -305,11 +309,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -373,11 +375,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -441,11 +441,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -509,13 +507,18 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 254 flops */
         }
@@ -585,6 +588,10 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -644,11 +651,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -716,11 +721,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -788,11 +791,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -860,13 +861,18 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 258 flops */
         }
@@ -1107,6 +1113,10 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1157,11 +1167,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1220,11 +1228,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1283,11 +1289,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1346,13 +1350,18 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 242 flops */
         }
@@ -1422,6 +1431,10 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1475,11 +1488,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1541,11 +1552,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1607,11 +1616,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1673,13 +1680,18 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 246 flops */
         }
index 95297cadb48b1bd2a21927a747b8d24cb2efdb0d..a0800c7bb1d72e93ad4fd4b36d5887c75a5b8bf0 100644 (file)
@@ -223,6 +223,10 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -283,11 +287,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -351,11 +353,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -419,13 +419,18 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 195 flops */
         }
@@ -485,6 +490,10 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -549,11 +558,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -621,11 +628,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -693,13 +698,18 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 198 flops */
         }
@@ -913,6 +923,10 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -968,11 +982,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1031,11 +1043,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1094,13 +1104,18 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 186 flops */
         }
@@ -1160,6 +1175,10 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1218,11 +1237,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1284,11 +1301,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1350,13 +1365,18 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 189 flops */
         }
index a011d13557c66464953b7f61fc3a66a15b4a7fb1..2440e40e80221af3ce632f57ae267fec8dac5539 100644 (file)
@@ -223,6 +223,10 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -283,11 +287,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -351,11 +353,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -419,13 +419,18 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 195 flops */
         }
@@ -485,6 +490,10 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -549,11 +558,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -621,11 +628,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -693,13 +698,18 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 198 flops */
         }
@@ -913,6 +923,10 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -968,11 +982,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1031,11 +1043,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1094,13 +1104,18 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 186 flops */
         }
@@ -1160,6 +1175,10 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1218,11 +1237,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1284,11 +1301,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1350,13 +1365,18 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 189 flops */
         }
index a295dadb83113393c99943c89ad1d3effaa8ff1c..ed609108936a97548bcc9f5e17abbaaca67b6c47 100644 (file)
@@ -226,6 +226,10 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -308,11 +312,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -355,11 +357,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -402,12 +402,17 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 157 flops */
         }
 
@@ -470,6 +475,10 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -557,11 +566,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -608,11 +615,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -659,12 +664,17 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 160 flops */
         }
 
@@ -880,6 +890,10 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -948,11 +962,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -987,11 +999,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1026,12 +1036,17 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 134 flops */
         }
 
@@ -1094,6 +1109,10 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1165,11 +1184,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1207,11 +1224,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1249,12 +1264,17 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 137 flops */
         }
 
index 5d8809b4d1b66000ad938e4eaa2d26bb115b8ddb..8de0251a0ceeb4fb362854d17e30a46ddfeb9720 100644 (file)
@@ -237,6 +237,10 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -300,11 +304,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -347,11 +349,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -394,11 +394,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -441,12 +439,17 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 179 flops */
         }
 
@@ -514,6 +517,10 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -581,11 +588,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -632,11 +637,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -683,11 +686,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -734,12 +735,17 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 183 flops */
         }
 
@@ -966,6 +972,10 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1021,11 +1031,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1060,11 +1068,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1099,11 +1105,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1138,12 +1142,17 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 156 flops */
         }
 
@@ -1211,6 +1220,10 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1269,11 +1282,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1311,11 +1322,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1353,11 +1362,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1395,12 +1402,17 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 160 flops */
         }
 
index ae3481909124348bbd651a4b2a4169192161bdab..e4c2fb57d85ce9082667498a76a47c7d11c739ba 100644 (file)
@@ -219,6 +219,10 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -274,11 +278,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -321,11 +323,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -368,12 +368,17 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 135 flops */
         }
 
@@ -436,6 +441,10 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -496,11 +505,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -547,11 +554,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -598,12 +603,17 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 138 flops */
         }
 
@@ -812,6 +822,10 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -855,11 +869,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -894,11 +906,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -933,12 +943,17 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 115 flops */
         }
 
@@ -1001,6 +1016,10 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1047,11 +1066,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1089,11 +1106,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1131,12 +1146,17 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 118 flops */
         }
 
index 450f3602e9fbcacec391313d6cc313c8f6b9a434..9d5ff02156eac44452009f90c69ba22953a6e89e 100644 (file)
@@ -230,6 +230,10 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -264,11 +268,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -311,11 +313,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -358,11 +358,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -405,12 +403,17 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 155 flops */
         }
 
@@ -478,6 +481,10 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -515,11 +522,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -566,11 +571,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -617,11 +620,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -668,12 +669,17 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 158 flops */
         }
 
@@ -893,6 +899,10 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -921,11 +931,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -960,11 +968,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -999,11 +1005,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1038,12 +1042,17 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 135 flops */
         }
 
@@ -1111,6 +1120,10 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1141,11 +1154,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1183,11 +1194,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1225,11 +1234,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1267,12 +1274,17 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 138 flops */
         }
 
index 69b01cda48ac0b341f8160e85ea10a08e9c3ff09..ff46929175e8b563530a0beb05c2295630550f6b 100644 (file)
@@ -204,6 +204,10 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -245,11 +249,9 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -292,11 +294,9 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -339,12 +339,17 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 123 flops */
         }
 
@@ -403,6 +408,10 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -448,11 +457,9 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -499,11 +506,9 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -550,12 +555,17 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 126 flops */
         }
 
@@ -749,6 +759,10 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -782,11 +796,9 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -821,11 +833,9 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -860,12 +870,17 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 108 flops */
         }
 
@@ -924,6 +939,10 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -960,11 +979,9 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1002,11 +1019,9 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1044,12 +1059,17 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 111 flops */
         }
 
index 738eea937c2413dd0e90548ba2b1b2b4fb6fdfdc..9829f75d0363660f598e56bdcfa904b9653e84d3 100644 (file)
@@ -204,6 +204,10 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -245,11 +249,9 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -292,11 +294,9 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -339,12 +339,17 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 123 flops */
         }
 
@@ -403,6 +408,10 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -448,11 +457,9 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -499,11 +506,9 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -550,12 +555,17 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 126 flops */
         }
 
@@ -749,6 +759,10 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -782,11 +796,9 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -821,11 +833,9 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -860,12 +870,17 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 108 flops */
         }
 
@@ -924,6 +939,10 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -960,11 +979,9 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1002,11 +1019,9 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1044,12 +1059,17 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 111 flops */
         }
 
index dbb5209b134896d4e6cbcf5f071c73eb0a7cdac6..ba86d93aa2f9b5c026b60884dfc9ccff074ff16a 100644 (file)
@@ -226,6 +226,10 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -304,11 +308,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -346,11 +348,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -388,13 +388,18 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 144 flops */
         }
@@ -458,6 +463,10 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -541,11 +550,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -586,11 +593,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -631,13 +636,18 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 145 flops */
         }
@@ -854,6 +864,10 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -920,11 +934,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -957,11 +969,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -994,13 +1004,18 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 117 flops */
         }
@@ -1064,6 +1079,10 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1133,11 +1152,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1172,11 +1189,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1211,13 +1226,18 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 118 flops */
         }
index f5ac06080a08695c3543e73a86887f8fe755537d..235177ae40e8bd01cc581efd01ed49b39a19deeb 100644 (file)
@@ -237,6 +237,10 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -300,11 +304,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -340,11 +342,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -382,11 +382,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -424,13 +422,18 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 164 flops */
         }
@@ -499,6 +502,10 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -566,11 +573,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -609,11 +614,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -654,11 +657,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -699,13 +700,18 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 165 flops */
         }
@@ -933,6 +939,10 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -988,11 +998,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1023,11 +1031,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1060,11 +1066,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1097,13 +1101,18 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 138 flops */
         }
@@ -1172,6 +1181,10 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1230,11 +1243,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1267,11 +1278,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1306,11 +1315,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1345,13 +1352,18 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 139 flops */
         }
index 2b376782b743e1b716a8dd521f47ea0fe8cc9e48..7eeab8da423baf33bb873d06fe2db95e87819024 100644 (file)
@@ -222,6 +222,10 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -272,11 +276,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -314,11 +316,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -356,13 +356,18 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 126 flops */
         }
@@ -426,6 +431,10 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -480,11 +489,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -525,11 +532,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -570,13 +575,18 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 126 flops */
         }
@@ -789,6 +799,10 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -828,11 +842,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -865,11 +877,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -902,13 +912,18 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 97 flops */
         }
@@ -972,6 +987,10 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1013,11 +1032,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1052,11 +1069,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1091,13 +1106,18 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 97 flops */
         }
index 8cd0d44077227a7c705485af88cfcd26f5509e07..9978c9542369f526fa2bdf0f49ccbc9562415509 100644 (file)
@@ -233,6 +233,10 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -276,11 +280,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -318,11 +320,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -360,11 +360,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -402,13 +400,18 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 149 flops */
         }
@@ -477,6 +480,10 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -523,11 +530,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -568,11 +573,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -613,11 +616,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -658,13 +659,18 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 149 flops */
         }
@@ -888,6 +894,10 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -923,11 +933,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -960,11 +968,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -997,11 +1003,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1034,13 +1038,18 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 120 flops */
         }
@@ -1109,6 +1118,10 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1146,11 +1159,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1185,11 +1196,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1224,11 +1233,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1263,13 +1270,18 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 120 flops */
         }
index 455b3e8471f205388528f089f5815c4d3327ccf3..c42c54d902ca43596e95741db739b1732943a95d 100644 (file)
@@ -233,6 +233,10 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -295,11 +299,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -337,11 +339,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -379,13 +379,18 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 142 flops */
         }
@@ -449,6 +454,10 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -516,11 +525,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -561,11 +568,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -606,13 +611,18 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 143 flops */
         }
@@ -836,6 +846,10 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -890,11 +904,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -927,11 +939,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -964,13 +974,18 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 121 flops */
         }
@@ -1034,6 +1049,10 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1091,11 +1110,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1130,11 +1147,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1169,13 +1184,18 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 122 flops */
         }
index 55c0fb3c3201193abcb377347c3d6c66d89a0102..c8e877953458b98a54ae9301b74b16b58a0fff5a 100644 (file)
@@ -245,6 +245,10 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -300,11 +304,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -342,11 +344,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -384,11 +384,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -426,13 +424,18 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 167 flops */
         }
@@ -502,6 +505,10 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -561,11 +568,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -606,11 +611,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -651,11 +654,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -696,13 +697,18 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 168 flops */
         }
@@ -938,6 +944,10 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -988,11 +998,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1025,11 +1033,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1062,11 +1068,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1099,13 +1103,18 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 146 flops */
         }
@@ -1175,6 +1184,10 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1228,11 +1241,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1267,11 +1278,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1306,11 +1315,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -1345,13 +1352,18 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 147 flops */
         }
index 36661d0ff5a96d497d2db5e4244d90f05a8cdca1..cafd4d6ec8223fef226d274dfe0f66bb5068f968 100644 (file)
@@ -204,6 +204,10 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -238,11 +242,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -280,11 +282,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -322,13 +322,18 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 108 flops */
         }
@@ -388,6 +393,10 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -425,11 +434,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -470,11 +477,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -515,13 +520,18 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 108 flops */
         }
@@ -716,6 +726,10 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -745,11 +759,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -782,11 +794,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -819,13 +829,18 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 90 flops */
         }
@@ -885,6 +900,10 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -916,11 +935,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -955,11 +972,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -994,13 +1009,18 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 90 flops */
         }
index fff9022015bb0abfdfe526ba465f0d3a78172a86..5f9d4cede25732ee38db6c9e1726a71036c55101 100644 (file)
@@ -204,6 +204,10 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -238,11 +242,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -280,11 +282,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -322,13 +322,18 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 108 flops */
         }
@@ -388,6 +393,10 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -425,11 +434,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -470,11 +477,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -515,13 +520,18 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 108 flops */
         }
@@ -716,6 +726,10 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -745,11 +759,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -782,11 +794,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -819,13 +829,18 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 90 flops */
         }
@@ -885,6 +900,10 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -916,11 +935,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -955,11 +972,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             }
 
@@ -994,13 +1009,18 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
-            }
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 90 flops */
         }
index 73e1e954c6397e55178996403a6250e56bdd2657..fb1b69f1c92a066adb0726783418c96e6e9d16f6 100644 (file)
@@ -221,6 +221,10 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -290,11 +294,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -322,11 +324,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -354,12 +354,17 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 131 flops */
         }
 
@@ -422,6 +427,10 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -496,11 +505,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -531,11 +538,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -566,12 +571,17 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 132 flops */
         }
 
@@ -782,6 +792,10 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -841,11 +855,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -869,11 +881,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -897,12 +907,17 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 108 flops */
         }
 
@@ -965,6 +980,10 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1027,11 +1046,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1057,11 +1074,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1087,12 +1102,17 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 109 flops */
         }
 
index 712d037f5edb37c7985f6dc38ac7000a57457112..9fd6ef13f776c969df4c532fd76d4386dbc00d98 100644 (file)
@@ -232,6 +232,10 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -295,11 +299,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -327,11 +329,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -359,11 +359,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -391,12 +389,17 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 152 flops */
         }
 
@@ -464,6 +467,10 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -531,11 +538,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -566,11 +571,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -601,11 +604,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -636,12 +637,17 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 153 flops */
         }
 
@@ -863,6 +869,10 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -918,11 +928,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -946,11 +954,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -974,11 +980,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1002,12 +1006,17 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 129 flops */
         }
 
@@ -1075,6 +1084,10 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1133,11 +1146,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1163,11 +1174,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1193,11 +1202,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1223,12 +1230,17 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 130 flops */
         }
 
index 6d9881df0591668d3e335d4e683336f0dcacfdd3..117311e1e2f4fcb7555aaaa75322c05f5338e94a 100644 (file)
@@ -214,6 +214,10 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -254,11 +258,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -286,11 +288,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -318,12 +318,17 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 108 flops */
         }
 
@@ -386,6 +391,10 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -430,11 +439,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -465,11 +472,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -500,12 +505,17 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 108 flops */
         }
 
@@ -709,6 +719,10 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -741,11 +755,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -769,11 +781,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -797,12 +807,17 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 88 flops */
         }
 
@@ -865,6 +880,10 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -899,11 +918,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -929,11 +946,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -959,12 +974,17 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 88 flops */
         }
 
index 6e43e0fafaaad7142268fbf478109ee68b434337..7af3ae5612b20566aa46c515f4c2791a5bf25f7e 100644 (file)
@@ -225,6 +225,10 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -259,11 +263,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -291,11 +293,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -323,11 +323,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -355,12 +353,17 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 128 flops */
         }
 
@@ -428,6 +431,10 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -465,11 +472,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -500,11 +505,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -535,11 +538,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -570,12 +571,17 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 128 flops */
         }
 
@@ -790,6 +796,10 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -818,11 +828,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -846,11 +854,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -874,11 +880,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -902,12 +906,17 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 108 flops */
         }
 
@@ -975,6 +984,10 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1005,11 +1018,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1035,11 +1046,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1065,11 +1074,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1095,12 +1102,17 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 108 flops */
         }
 
index 036caf5d33233dcc7182f1d99248966f6eb164ac..15c99411d66734a04f04a4f9c1fb888d6fac1237 100644 (file)
@@ -199,6 +199,10 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -225,11 +229,9 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -257,11 +259,9 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -289,12 +289,17 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 96 flops */
         }
 
@@ -353,6 +358,10 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -382,11 +391,9 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -417,11 +424,9 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -452,12 +457,17 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 96 flops */
         }
 
@@ -646,6 +656,10 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -668,11 +682,9 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -696,11 +708,9 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -724,12 +734,17 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 81 flops */
         }
 
@@ -788,6 +803,10 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -812,11 +831,9 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -842,11 +859,9 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -872,12 +887,17 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 81 flops */
         }
 
index df2b5635dac4e90315c5eb81b927fbc8409cad20..3da293f0a1792858506cd0990745bc1ef6b47ed1 100644 (file)
@@ -199,6 +199,10 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -225,11 +229,9 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -257,11 +259,9 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -289,12 +289,17 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 96 flops */
         }
 
@@ -353,6 +358,10 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -382,11 +391,9 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -417,11 +424,9 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -452,12 +457,17 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 96 flops */
         }
 
@@ -646,6 +656,10 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -668,11 +682,9 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -696,11 +708,9 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -724,12 +734,17 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 81 flops */
         }
 
@@ -788,6 +803,10 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse2_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -812,11 +831,9 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -842,11 +859,9 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -872,12 +887,17 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse2_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 81 flops */
         }
 
index dc038346f8531bd21dca18a6c8c08b781486afed..edec7a374548afbae499dca7d10df43e3460f47f 100644 (file)
@@ -459,7 +459,7 @@ void
             /*     #endfor */
             /* #endif */
 
-            /* #if 'Force' in KERNEL_VF and not 'Particle' in GEOMETRY_J */
+            /* #if 'Force' in KERNEL_VF and not 'Particle' in GEOMETRY_I */
             /*     #for J in PARTICLES_J */
             fjx{J}             = _mm_setzero_ps();
             fjy{J}             = _mm_setzero_ps();
@@ -871,7 +871,7 @@ void
             fiz{I}             = _mm_add_ps(fiz{I},tz);
             /*             #define INNERFLOPS INNERFLOPS+6 */
 
-            /* #if GEOMETRY_J == 'Particle'             */
+            /* #if GEOMETRY_I == 'Particle'             */
             /*     #if ROUND == 'Loop' */
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
@@ -906,7 +906,7 @@ void
             /* #endfor */
             /* ## End of loop over i-j interaction pairs */
 
-            /* #if GEOMETRY_J != 'Particle' */
+            /* #if GEOMETRY_I != 'Particle' */
             /*     #if ROUND == 'Loop' */
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
@@ -920,7 +920,9 @@ void
             /*     #endif */
             /* #endif */
 
-            /* #if GEOMETRY_J == 'Water3'               */
+            /* #if 'Water' in GEOMETRY_I and GEOMETRY_J == 'Particle' */
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+            /* #elif GEOMETRY_J == 'Water3'             */
             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
             /*     #define INNERFLOPS INNERFLOPS+9      */
index 3db14fb769fab38305e19a946011b4b185c53fd8..cee331911b3f87a7f27ab30f37e4f801a817b4f7 100644 (file)
@@ -109,10 +109,10 @@ gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
 {
     __m128 t1,t2,t3,t4;
     
-    t1   = _mm_castpd_ps(_mm_load_sd((const double *)p1));
-    t2   = _mm_castpd_ps(_mm_load_sd((const double *)p2));
-    t3   = _mm_castpd_ps(_mm_load_sd((const double *)p3));
-    t4   = _mm_castpd_ps(_mm_load_sd((const double *)p4));
+    t1   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p1);   /* - - c12a  c6a */
+    t2   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p2);   /* - - c12b  c6b */
+    t3   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p3);   /* - - c12c  c6c */
+    t4   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p4);   /* - - c12d  c6d */
     t1   = _mm_unpacklo_ps(t1,t2);
     t2   = _mm_unpacklo_ps(t3,t4);
     *c6  = _mm_movelh_ps(t1,t2);
@@ -129,8 +129,8 @@ gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
 {
     __m128 t1,t2,t3,t4;
     
-    t1   = _mm_castpd_ps(_mm_load_sd((const double *)xyz_shift));
-    t2   = _mm_castpd_ps(_mm_load_sd((const double *)xyz));
+    t1   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
+    t2   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz);
     t3   = _mm_load_ss(xyz_shift+2);
     t4   = _mm_load_ss(xyz+2);
     t1   = _mm_add_ps(t1,t2);
@@ -152,7 +152,7 @@ gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
     __m128 tA,tB;
     __m128 t1,t2,t3,t4,t5,t6;
     
-    tA   = _mm_castpd_ps(_mm_load_sd((const double *)xyz_shift));
+    tA   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
     tB   = _mm_load_ss(xyz_shift+2);
 
     t1   = _mm_loadu_ps(xyz);
@@ -539,41 +539,6 @@ gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
        _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
 }
 
-static gmx_inline void
-gmx_mm_update_iforce_2atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
-                                      __m128 fix2, __m128 fiy2, __m128 fiz2,
-                                      float * gmx_restrict fptr,
-                                      float * gmx_restrict fshiftptr)
-{
-       __m128 t1,t2,t4;
-       
-       fix1 = _mm_hadd_ps(fix1,fiy1);
-       fiz1 = _mm_hadd_ps(fiz1,fix2);
-       fiy2 = _mm_hadd_ps(fiy2,fiz2);
-       
-       fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
-       fiy2 = _mm_hadd_ps(fiy2,fiy2); /*  -    -   fiz2 fiy2 */
-    
-       _mm_storeu_ps(fptr,   _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
-       t1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(fptr+4));
-       _mm_storel_pi((__m64 *)(fptr+4), _mm_add_ps(fiy2,t1));
-       
-       t4 = _mm_load_ss(fshiftptr+2);
-       t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));
-       
-       t1 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,2));   /* fiy2  -   fix2 fiz1 */
-       t1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,1,0,0));       /* fiy2 fix2  -   fiz1 */
-       t2 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(1,0,0,1));   /* fiy1 fix1  -   fiz2 */
-    
-       t1 = _mm_add_ps(t1,t2);
-       t1 = _mm_add_ps(t1,t4); /* y x - z */
-       
-       _mm_store_ss(fshiftptr+2,t1);
-       _mm_storeh_pi((__m64 *)(fshiftptr),t1);
-}
-
-
-
 static gmx_inline void
 gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                       __m128 fix2, __m128 fiy2, __m128 fiz2,
index 6613ea22da8bd69835617231aeb96d74c55ad824..4fb16235f972d326a2581f114b344a2bae6ba130 100644 (file)
@@ -214,6 +214,10 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -293,11 +297,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -342,11 +344,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -391,11 +391,16 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 159 flops */
         }
@@ -455,6 +460,10 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -539,11 +548,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -592,11 +599,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -645,11 +650,16 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 162 flops */
         }
@@ -854,6 +864,10 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -922,11 +936,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -966,11 +978,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1010,11 +1020,16 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 139 flops */
         }
@@ -1074,6 +1089,10 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1145,11 +1164,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1192,11 +1209,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1239,11 +1254,16 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 142 flops */
         }
index 0040f9372ceda7542db83915e91c5a43cbf8328f..8b16e0e082ddd8e185c998db5de91464a4c2be8e 100644 (file)
@@ -225,6 +225,10 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -289,11 +293,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -338,11 +340,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -387,11 +387,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -436,11 +434,16 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 185 flops */
         }
@@ -505,6 +508,10 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -573,11 +580,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -626,11 +631,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -679,11 +682,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -732,11 +733,16 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 189 flops */
         }
@@ -952,6 +958,10 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1008,11 +1018,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1052,11 +1060,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1096,11 +1102,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1140,11 +1144,16 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 165 flops */
         }
@@ -1209,6 +1218,10 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1268,11 +1281,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1315,11 +1326,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1362,11 +1371,9 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1409,11 +1416,16 @@ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 169 flops */
         }
index 9f8243447f32606f715c94c88003b0f83b7ccb05..ffbe4eccc6ae598a3b90d5ba9fc9a19f677db873 100644 (file)
@@ -216,6 +216,10 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -273,11 +277,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -322,11 +324,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -371,11 +371,16 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 142 flops */
         }
@@ -437,6 +442,10 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -499,11 +508,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -552,11 +559,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -605,11 +610,16 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 145 flops */
         }
@@ -816,6 +826,10 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -864,11 +878,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -908,11 +920,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -952,11 +962,16 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 125 flops */
         }
@@ -1018,6 +1033,10 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1069,11 +1088,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1116,11 +1133,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1163,11 +1178,16 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 128 flops */
         }
index c1f9df6113ea0d643536b2b5e5bc7ad5dfb5f546..892a7cd0d03af705455d07558a56bc90f2d18a3f 100644 (file)
@@ -226,6 +226,10 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -260,11 +264,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -309,11 +311,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -358,11 +358,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -407,11 +405,16 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 161 flops */
         }
@@ -477,6 +480,10 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -514,11 +521,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -567,11 +572,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -620,11 +623,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -673,11 +674,16 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 164 flops */
         }
@@ -894,6 +900,10 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -922,11 +932,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -966,11 +974,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1010,11 +1016,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1054,11 +1058,16 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 144 flops */
         }
@@ -1124,6 +1133,10 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1154,11 +1167,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1201,11 +1212,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1248,11 +1257,9 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1295,11 +1302,16 @@ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 147 flops */
         }
index a8949502422d2f1bfd2347f5a530b9fd91cd1eee..8c057cf54a29b9831aca4ebe9e0cec33dd7851f4 100644 (file)
@@ -199,6 +199,10 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -242,11 +246,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -291,11 +293,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -340,11 +340,16 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 129 flops */
         }
@@ -400,6 +405,10 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -447,11 +456,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -500,11 +507,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -553,11 +558,16 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 132 flops */
         }
@@ -747,6 +757,10 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -785,11 +799,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -829,11 +841,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -873,11 +883,16 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 117 flops */
         }
@@ -933,6 +948,10 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -974,11 +993,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1021,11 +1038,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1068,11 +1083,16 @@ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 120 flops */
         }
index 6f98892dd188c91d843fcbb4f93beda00bf1ec10..dd01c35782e5656c55f7e72d51ad323fefbda265 100644 (file)
@@ -199,6 +199,10 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -242,11 +246,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -291,11 +293,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -340,11 +340,16 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 129 flops */
         }
@@ -400,6 +405,10 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -447,11 +456,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -500,11 +507,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -553,11 +558,16 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 132 flops */
         }
@@ -747,6 +757,10 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -785,11 +799,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -829,11 +841,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -873,11 +883,16 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 117 flops */
         }
@@ -933,6 +948,10 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -974,11 +993,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1021,11 +1038,9 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1068,11 +1083,16 @@ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 120 flops */
         }
index 711ff0c3726ef638be6e607b55b1b61adb06a8d6..d325eb1415e80db0a8dd784bcd003d920af0e0b2 100644 (file)
@@ -218,6 +218,10 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -287,11 +291,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -319,11 +321,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -351,11 +351,16 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 119 flops */
         }
@@ -419,6 +424,10 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -493,11 +502,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -528,11 +535,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -563,11 +568,16 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 120 flops */
         }
@@ -776,6 +786,10 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -836,11 +850,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -865,11 +877,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -894,11 +904,16 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 108 flops */
         }
@@ -962,6 +977,10 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1025,11 +1044,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1056,11 +1073,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1087,11 +1102,16 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 109 flops */
         }
index 5ae4224749fc4e990a76f002d178d63f6b394589..357e868af4c7897e61b1cc461c9b00094ac57d69 100644 (file)
@@ -229,6 +229,10 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -292,11 +296,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -324,11 +326,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -356,11 +356,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -388,11 +386,16 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 140 flops */
         }
@@ -461,6 +464,10 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -528,11 +535,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -563,11 +568,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -598,11 +601,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -633,11 +634,16 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 141 flops */
         }
@@ -857,6 +863,10 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -912,11 +922,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -941,11 +949,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -970,11 +976,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -999,11 +1003,16 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 129 flops */
         }
@@ -1072,6 +1081,10 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1130,11 +1143,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1161,11 +1172,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1192,11 +1201,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1223,11 +1230,16 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 130 flops */
         }
index 27096959b21c0a312c0a97513093ff7e92b923b4..9d742cb5924bbac3a4f23783ec2bedc929a610ae 100644 (file)
@@ -211,6 +211,10 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -251,11 +255,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -283,11 +285,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -315,11 +315,16 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 96 flops */
         }
@@ -383,6 +388,10 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -427,11 +436,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -462,11 +469,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -497,11 +502,16 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 96 flops */
         }
@@ -703,6 +713,10 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -736,11 +750,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -765,11 +777,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -794,11 +804,16 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 88 flops */
         }
@@ -862,6 +877,10 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -897,11 +916,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -928,11 +945,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -959,11 +974,16 @@ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 88 flops */
         }
index 54c53e5c88f7ef43435cc22c5a606f01b72356be..587b07dc118dc855f559b4a1174f725c96351b41 100644 (file)
@@ -222,6 +222,10 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -256,11 +260,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -288,11 +290,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -320,11 +320,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -352,11 +350,16 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 116 flops */
         }
@@ -425,6 +428,10 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -462,11 +469,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -497,11 +502,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -532,11 +535,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -567,11 +568,16 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 116 flops */
         }
@@ -784,6 +790,10 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -812,11 +822,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -841,11 +849,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -870,11 +876,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -899,11 +903,16 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 108 flops */
         }
@@ -972,6 +981,10 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1002,11 +1015,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1033,11 +1044,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1064,11 +1073,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1095,11 +1102,16 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 108 flops */
         }
index dd804f5f6493d1863c11cca1f2681f39d2bbaab2..ff39c155b88da3428fa4c32dfe2aecd956ba0404 100644 (file)
@@ -196,6 +196,10 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -222,11 +226,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -254,11 +256,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -286,11 +286,16 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 84 flops */
         }
@@ -350,6 +355,10 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -379,11 +388,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -414,11 +421,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -449,11 +454,16 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 84 flops */
         }
@@ -640,6 +650,10 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -663,11 +677,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -692,11 +704,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -721,11 +731,16 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 81 flops */
         }
@@ -785,6 +800,10 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -810,11 +829,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -841,11 +858,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -872,11 +887,16 @@ nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 81 flops */
         }
index 71a06791a00743919d0f34a204fed298cce436aa..d12786cf32cc4d094fcd448ce9a5c5f871b48da6 100644 (file)
@@ -196,6 +196,10 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -222,11 +226,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -254,11 +256,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -286,11 +286,16 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 84 flops */
         }
@@ -350,6 +355,10 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -379,11 +388,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -414,11 +421,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -449,11 +454,16 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 84 flops */
         }
@@ -640,6 +650,10 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -663,11 +677,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -692,11 +704,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -721,11 +731,16 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 81 flops */
         }
@@ -785,6 +800,10 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -810,11 +829,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -841,11 +858,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -872,11 +887,16 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 81 flops */
         }
index 8f91d095fd7eaf049e86ee41c684c4cc4f19bf13..c50f4a97ddccbd1be4bbd0cc582ba918f7e35079 100644 (file)
@@ -227,6 +227,10 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -292,11 +296,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -349,11 +351,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -406,13 +406,18 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 156 flops */
         }
@@ -476,6 +481,10 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -546,11 +555,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -607,11 +614,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -668,13 +673,18 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 159 flops */
         }
@@ -892,6 +902,10 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -942,11 +956,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -990,11 +1002,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1038,13 +1048,18 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 124 flops */
         }
@@ -1108,6 +1123,10 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1161,11 +1180,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1212,11 +1229,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1263,13 +1278,18 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 127 flops */
         }
index 9f64e28819812f195ddf347a396749aab3f52ef1..e9b41e46d55eb081a4d64bff860786cc212bffd1 100644 (file)
@@ -238,6 +238,10 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -281,11 +285,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -338,11 +340,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -395,11 +395,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -452,13 +450,18 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 179 flops */
         }
@@ -527,6 +530,10 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -573,11 +580,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -634,11 +639,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -695,11 +698,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -756,13 +757,18 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 182 flops */
         }
@@ -991,6 +997,10 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1026,11 +1036,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1074,11 +1082,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1122,11 +1128,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1170,13 +1174,18 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 147 flops */
         }
@@ -1245,6 +1254,10 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1282,11 +1295,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1333,11 +1344,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1384,11 +1393,9 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1435,13 +1442,18 @@ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 150 flops */
         }
index 36671aea4a642a7c5644aea1dd91a3e6140810b6..114ed8b8c03f31d8cec0e5daeb527e1914f348f5 100644 (file)
@@ -209,6 +209,10 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -258,11 +262,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -315,11 +317,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -372,13 +372,18 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 138 flops */
         }
@@ -438,6 +443,10 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -491,11 +500,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -552,11 +559,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -613,13 +618,18 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 141 flops */
         }
@@ -819,6 +829,10 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -859,11 +873,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -907,11 +919,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -955,13 +965,18 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 117 flops */
         }
@@ -1021,6 +1036,10 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1064,11 +1083,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1115,11 +1132,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1166,13 +1181,18 @@ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 120 flops */
         }
index fb7728ef613cc61b1de381aac53ffb73e42f679b..2faee8a4824235d2643b837dc28ed129a95e2a56 100644 (file)
@@ -209,6 +209,10 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -258,11 +262,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -315,11 +317,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -372,13 +372,18 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 138 flops */
         }
@@ -438,6 +443,10 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -491,11 +500,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -552,11 +559,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -613,13 +618,18 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 141 flops */
         }
@@ -819,6 +829,10 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -859,11 +873,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -907,11 +919,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -955,13 +965,18 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 117 flops */
         }
@@ -1021,6 +1036,10 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1064,11 +1083,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1115,11 +1132,9 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1166,13 +1181,18 @@ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 120 flops */
         }
index 72ee58642d08a634a0dea19e891796685899c264..130e3198a34f5a2030101609c916e0d391524475 100644 (file)
@@ -238,6 +238,10 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -315,11 +319,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -383,11 +385,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -451,13 +451,18 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 213 flops */
         }
@@ -521,6 +526,10 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -603,11 +612,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -675,11 +682,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -747,13 +752,18 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 216 flops */
         }
@@ -982,6 +992,10 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1051,11 +1065,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1114,11 +1126,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1177,13 +1187,18 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 201 flops */
         }
@@ -1247,6 +1262,10 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1319,11 +1338,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1385,11 +1402,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1451,13 +1466,18 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 204 flops */
         }
index 8391a116050c1aee71308d79e9f7c1b1cf024144..c4d362a32d59aa76e2995e558dbb4cf2a6a60268 100644 (file)
@@ -250,6 +250,10 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -305,11 +309,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -373,11 +375,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -441,11 +441,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -509,13 +507,18 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 254 flops */
         }
@@ -585,6 +588,10 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -644,11 +651,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -716,11 +721,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -788,11 +791,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -860,13 +861,18 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 258 flops */
         }
@@ -1107,6 +1113,10 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1157,11 +1167,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1220,11 +1228,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1283,11 +1289,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1346,13 +1350,18 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 242 flops */
         }
@@ -1422,6 +1431,10 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1475,11 +1488,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1541,11 +1552,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1607,11 +1616,9 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1673,13 +1680,18 @@ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 246 flops */
         }
index b2071b263661e711f052fd9febca42171305d3c3..4ca1cf83c2c692a78d64b25fd2210af420a42ebc 100644 (file)
@@ -223,6 +223,10 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -283,11 +287,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -351,11 +353,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -419,13 +419,18 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 195 flops */
         }
@@ -485,6 +490,10 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -549,11 +558,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -621,11 +628,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -693,13 +698,18 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 198 flops */
         }
@@ -913,6 +923,10 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -968,11 +982,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1031,11 +1043,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1094,13 +1104,18 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 186 flops */
         }
@@ -1160,6 +1175,10 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1218,11 +1237,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1284,11 +1301,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1350,13 +1365,18 @@ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 189 flops */
         }
index a2476d3070bac347a7007a1ed3dc0bc36d80b20a..0e492764e7077933c210a5da1402a60e0c1d14a6 100644 (file)
@@ -223,6 +223,10 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -283,11 +287,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -351,11 +353,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -419,13 +419,18 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 195 flops */
         }
@@ -485,6 +490,10 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -549,11 +558,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -621,11 +628,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -693,13 +698,18 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 198 flops */
         }
@@ -913,6 +923,10 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -968,11 +982,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1031,11 +1043,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1094,13 +1104,18 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 186 flops */
         }
@@ -1160,6 +1175,10 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1218,11 +1237,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1284,11 +1301,9 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1350,13 +1365,18 @@ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 189 flops */
         }
index 591dac2e7113e1e72b40dccd6642bc9da169ac95..4d5bcd08900cb0ed9415b838c911c5738018724c 100644 (file)
@@ -226,6 +226,10 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -308,11 +312,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -355,11 +357,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -402,11 +402,16 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 157 flops */
         }
@@ -470,6 +475,10 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -557,11 +566,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -608,11 +615,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -659,11 +664,16 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 160 flops */
         }
@@ -880,6 +890,10 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -948,11 +962,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -987,11 +999,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1026,11 +1036,16 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 134 flops */
         }
@@ -1094,6 +1109,10 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1165,11 +1184,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1207,11 +1224,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1249,11 +1264,16 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 137 flops */
         }
index 6fa12de1468cabab97544fabf7ccd85fc36e78ae..61e0cfc6f4f61edd636608d36692c6d7807131c5 100644 (file)
@@ -237,6 +237,10 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -300,11 +304,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -347,11 +349,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -394,11 +394,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -441,11 +439,16 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 179 flops */
         }
@@ -514,6 +517,10 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -581,11 +588,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -632,11 +637,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -683,11 +686,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -734,11 +735,16 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 183 flops */
         }
@@ -966,6 +972,10 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1021,11 +1031,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1060,11 +1068,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1099,11 +1105,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1138,11 +1142,16 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 156 flops */
         }
@@ -1211,6 +1220,10 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1269,11 +1282,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1311,11 +1322,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1353,11 +1362,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1395,11 +1402,16 @@ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 160 flops */
         }
index 89ee5c19b0ced0be366433ed582c02bd33002e27..eff4f026bb8b8eb77dde8e8f18ecc2df5394f920 100644 (file)
@@ -219,6 +219,10 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -274,11 +278,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -321,11 +323,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -368,11 +368,16 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 135 flops */
         }
@@ -436,6 +441,10 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -496,11 +505,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -547,11 +554,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -598,11 +603,16 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 138 flops */
         }
@@ -812,6 +822,10 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -855,11 +869,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -894,11 +906,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -933,11 +943,16 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 115 flops */
         }
@@ -1001,6 +1016,10 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1047,11 +1066,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1089,11 +1106,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1131,11 +1146,16 @@ nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 118 flops */
         }
index 424291fd0ed571166faec2faae5163da08a584d9..16fefb40f3b39ba1644c7da17f71dc2e476d6426 100644 (file)
@@ -230,6 +230,10 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -264,11 +268,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -311,11 +313,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -358,11 +358,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -405,11 +403,16 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 155 flops */
         }
@@ -478,6 +481,10 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -515,11 +522,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -566,11 +571,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -617,11 +620,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -668,11 +669,16 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 158 flops */
         }
@@ -893,6 +899,10 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -921,11 +931,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -960,11 +968,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -999,11 +1005,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1038,11 +1042,16 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 135 flops */
         }
@@ -1111,6 +1120,10 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1141,11 +1154,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1183,11 +1194,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1225,11 +1234,9 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1267,11 +1274,16 @@ nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 138 flops */
         }
index 323a6669168ddebdad2c5bea47dd0040a2032b4e..cb0d236ef4c2822f91c657d917ee3c6b65259ec7 100644 (file)
@@ -204,6 +204,10 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -245,11 +249,9 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -292,11 +294,9 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -339,11 +339,16 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 123 flops */
         }
@@ -403,6 +408,10 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -448,11 +457,9 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -499,11 +506,9 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -550,11 +555,16 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 126 flops */
         }
@@ -749,6 +759,10 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -782,11 +796,9 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -821,11 +833,9 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -860,11 +870,16 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 108 flops */
         }
@@ -924,6 +939,10 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -960,11 +979,9 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1002,11 +1019,9 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1044,11 +1059,16 @@ nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 111 flops */
         }
index 0d66c29534eab135ff2807491e76727513a4ae6e..80070b268834a5be826f14bbae088dc567c26824 100644 (file)
@@ -204,6 +204,10 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -245,11 +249,9 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -292,11 +294,9 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -339,11 +339,16 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 123 flops */
         }
@@ -403,6 +408,10 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -448,11 +457,9 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -499,11 +506,9 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -550,11 +555,16 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 126 flops */
         }
@@ -749,6 +759,10 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -782,11 +796,9 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -821,11 +833,9 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -860,11 +870,16 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 108 flops */
         }
@@ -924,6 +939,10 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -960,11 +979,9 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1002,11 +1019,9 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1044,11 +1059,16 @@ nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 111 flops */
         }
index e652a4af88c31b9b13034bc03393e8f47a483100..d452c7e382972836de74e965b948e63e3f84e76b 100644 (file)
@@ -226,6 +226,10 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -304,11 +308,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -346,11 +348,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -388,13 +388,18 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 144 flops */
         }
@@ -458,6 +463,10 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -541,11 +550,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -586,11 +593,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -631,13 +636,18 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 145 flops */
         }
@@ -854,6 +864,10 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -920,11 +934,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -957,11 +969,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -994,13 +1004,18 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 117 flops */
         }
@@ -1064,6 +1079,10 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1133,11 +1152,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1172,11 +1189,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1211,13 +1226,18 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 118 flops */
         }
index 51ed6a5a76d18cde3f9b08e120a8d22466aa58e0..551c278374ea637e8c55f53118c947302894b3b1 100644 (file)
@@ -237,6 +237,10 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -300,11 +304,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -340,11 +342,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -382,11 +382,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -424,13 +422,18 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 164 flops */
         }
@@ -499,6 +502,10 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -566,11 +573,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -609,11 +614,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -654,11 +657,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -699,13 +700,18 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 165 flops */
         }
@@ -933,6 +939,10 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -988,11 +998,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1023,11 +1031,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1060,11 +1066,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1097,13 +1101,18 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 138 flops */
         }
@@ -1172,6 +1181,10 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1230,11 +1243,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1267,11 +1278,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1306,11 +1315,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1345,13 +1352,18 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 139 flops */
         }
index 30cc035f7b9619c72bad94007fabe7649a1a72c7..3077f468c9cf4627f02b44180e0b3c5f4a1a9891 100644 (file)
@@ -222,6 +222,10 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -272,11 +276,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -314,11 +316,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -356,13 +356,18 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 126 flops */
         }
@@ -426,6 +431,10 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -480,11 +489,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -525,11 +532,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -570,13 +575,18 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 126 flops */
         }
@@ -789,6 +799,10 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -828,11 +842,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -865,11 +877,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -902,13 +912,18 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 97 flops */
         }
@@ -972,6 +987,10 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1013,11 +1032,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1052,11 +1069,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1091,13 +1106,18 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 97 flops */
         }
index 999eaf26a2f067a73f6ce624b5421c9eb26fc300..edb3898a1117d5c5b90a8048d9bec234cc3f6952 100644 (file)
@@ -233,6 +233,10 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -276,11 +280,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -318,11 +320,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -360,11 +360,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -402,13 +400,18 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 149 flops */
         }
@@ -477,6 +480,10 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -523,11 +530,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -568,11 +573,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -613,11 +616,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -658,13 +659,18 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 149 flops */
         }
@@ -888,6 +894,10 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -923,11 +933,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -960,11 +968,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -997,11 +1003,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1034,13 +1038,18 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 120 flops */
         }
@@ -1109,6 +1118,10 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1146,11 +1159,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1185,11 +1196,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1224,11 +1233,9 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1263,13 +1270,18 @@ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 120 flops */
         }
index e1879589acb5a7fc64ff9e2bb02a236a63252de0..19aa4f12dc7a1c3f77c63f04e19688661473cfa5 100644 (file)
@@ -233,6 +233,10 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -295,11 +299,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -337,11 +339,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -379,13 +379,18 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 142 flops */
         }
@@ -449,6 +454,10 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -516,11 +525,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -561,11 +568,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -606,13 +611,18 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 143 flops */
         }
@@ -836,6 +846,10 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -890,11 +904,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -927,11 +939,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -964,13 +974,18 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 121 flops */
         }
@@ -1034,6 +1049,10 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1091,11 +1110,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1130,11 +1147,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1169,13 +1184,18 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 122 flops */
         }
index e0d9f95d1dd39b21b2d3f3c2dcc32d7286a17d32..6e45c3cd2ada4f311efc7f35d42cc60ae454ac41 100644 (file)
@@ -245,6 +245,10 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -300,11 +304,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -342,11 +344,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -384,11 +384,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -426,13 +424,18 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 167 flops */
         }
@@ -502,6 +505,10 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -561,11 +568,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -606,11 +611,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -651,11 +654,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -696,13 +697,18 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 168 flops */
         }
@@ -938,6 +944,10 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -988,11 +998,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1025,11 +1033,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1062,11 +1068,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1099,13 +1103,18 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 146 flops */
         }
@@ -1175,6 +1184,10 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1228,11 +1241,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1267,11 +1278,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1306,11 +1315,9 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -1345,13 +1352,18 @@ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 147 flops */
         }
index b036025436cb31c84b43af2efe5c49e91e768436..65ae66dfe95453a91065144eb0f686a76d7df7b7 100644 (file)
@@ -204,6 +204,10 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -238,11 +242,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -280,11 +282,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -322,13 +322,18 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 108 flops */
         }
@@ -388,6 +393,10 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -425,11 +434,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -470,11 +477,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -515,13 +520,18 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 108 flops */
         }
@@ -716,6 +726,10 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -745,11 +759,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -782,11 +794,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -819,13 +829,18 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 90 flops */
         }
@@ -885,6 +900,10 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -916,11 +935,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -955,11 +972,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -994,13 +1009,18 @@ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 90 flops */
         }
index f5fe0bb106f2668d82737d9b6d634ff2b7b820a3..b94c8bea6e3fa765824005c80871017f31bc568f 100644 (file)
@@ -204,6 +204,10 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -238,11 +242,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -280,11 +282,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -322,13 +322,18 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 108 flops */
         }
@@ -388,6 +393,10 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -425,11 +434,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -470,11 +477,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -515,13 +520,18 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 108 flops */
         }
@@ -716,6 +726,10 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -745,11 +759,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -782,11 +794,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -819,13 +829,18 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 90 flops */
         }
@@ -885,6 +900,10 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -916,11 +935,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -955,11 +972,9 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             }
 
@@ -994,13 +1009,18 @@ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
 
-            }
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 90 flops */
         }
index 5c464f54707f91f243a615107f60d353aa33ec7c..32c7feb202183d324c70e950ebe1cf4979968d32 100644 (file)
@@ -221,6 +221,10 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -290,11 +294,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -322,11 +324,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -354,11 +354,16 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 131 flops */
         }
@@ -422,6 +427,10 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -496,11 +505,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -531,11 +538,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -566,11 +571,16 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 132 flops */
         }
@@ -782,6 +792,10 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -841,11 +855,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -869,11 +881,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -897,11 +907,16 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 108 flops */
         }
@@ -965,6 +980,10 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1027,11 +1046,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1057,11 +1074,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1087,11 +1102,16 @@ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 109 flops */
         }
index 6dc82034ca7d21995a7b547dc3ffcb8d6ee2f74e..0c2f12fc4b2679d987ed6e7c288965fc6ba302c8 100644 (file)
@@ -232,6 +232,10 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -295,11 +299,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -327,11 +329,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -359,11 +359,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -391,11 +389,16 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 152 flops */
         }
@@ -464,6 +467,10 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -531,11 +538,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -566,11 +571,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -601,11 +604,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -636,11 +637,16 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 153 flops */
         }
@@ -863,6 +869,10 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -918,11 +928,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -946,11 +954,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -974,11 +980,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1002,11 +1006,16 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 129 flops */
         }
@@ -1075,6 +1084,10 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1133,11 +1146,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1163,11 +1174,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1193,11 +1202,9 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1223,11 +1230,16 @@ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 130 flops */
         }
index 499a65956bbf384f6ef576275e5d1c08356bec51..8e04a9d73684ad0e470dc4b6d89f17060c9a2b1e 100644 (file)
@@ -214,6 +214,10 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -254,11 +258,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -286,11 +288,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -318,11 +318,16 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 108 flops */
         }
@@ -386,6 +391,10 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -430,11 +439,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -465,11 +472,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -500,11 +505,16 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 108 flops */
         }
@@ -709,6 +719,10 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -741,11 +755,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -769,11 +781,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -797,11 +807,16 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 88 flops */
         }
@@ -865,6 +880,10 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -899,11 +918,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -929,11 +946,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -959,11 +974,16 @@ nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 88 flops */
         }
index 211579c83efa9dc2d7085b4a1e8a7e0a9f634c96..73c35b4696dccbbea2c4d834f872741b89715d46 100644 (file)
@@ -225,6 +225,10 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -259,11 +263,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -291,11 +293,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -323,11 +323,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -355,11 +353,16 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 128 flops */
         }
@@ -428,6 +431,10 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -465,11 +472,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -500,11 +505,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -535,11 +538,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -570,11 +571,16 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 128 flops */
         }
@@ -790,6 +796,10 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -818,11 +828,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -846,11 +854,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -874,11 +880,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -902,11 +906,16 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 108 flops */
         }
@@ -975,6 +984,10 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1005,11 +1018,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1035,11 +1046,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1065,11 +1074,9 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1095,11 +1102,16 @@ nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 108 flops */
         }
index 99e4b0c7ab049b24c872ff7e031526e870a50d17..13d448eae1783749d7ecadf708d05b2f722115f7 100644 (file)
@@ -199,6 +199,10 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -225,11 +229,9 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -257,11 +259,9 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -289,11 +289,16 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 96 flops */
         }
@@ -353,6 +358,10 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -382,11 +391,9 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -417,11 +424,9 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -452,11 +457,16 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 96 flops */
         }
@@ -646,6 +656,10 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -668,11 +682,9 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -696,11 +708,9 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -724,11 +734,16 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 81 flops */
         }
@@ -788,6 +803,10 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -812,11 +831,9 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -842,11 +859,9 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -872,11 +887,16 @@ nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 81 flops */
         }
index 69c2ca0851a3ea4400a62d19a70b71891db70c11..41beb98b8949ba31d5221c8acbee5100fe061e4f 100644 (file)
@@ -199,6 +199,10 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -225,11 +229,9 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -257,11 +259,9 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -289,11 +289,16 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 96 flops */
         }
@@ -353,6 +358,10 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -382,11 +391,9 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -417,11 +424,9 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -452,11 +457,16 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 96 flops */
         }
@@ -646,6 +656,10 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -668,11 +682,9 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -696,11 +708,9 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -724,11 +734,16 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 81 flops */
         }
@@ -788,6 +803,10 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -812,11 +831,9 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -842,11 +859,9 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -872,11 +887,16 @@ nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 81 flops */
         }
index 1733db7272416237ca075d8158dad3eafd2b7f2f..9214f45ad8157e365ea8f91bdd1e2c72b6bf9ebe 100644 (file)
@@ -459,7 +459,7 @@ void
             /*     #endfor */
             /* #endif */
 
-            /* #if 'Force' in KERNEL_VF and not 'Particle' in GEOMETRY_J */
+            /* #if 'Force' in KERNEL_VF and not 'Particle' in GEOMETRY_I */
             /*     #for J in PARTICLES_J */
             fjx{J}             = _mm_setzero_ps();
             fjy{J}             = _mm_setzero_ps();
@@ -870,7 +870,7 @@ void
             fiz{I}             = _mm_add_ps(fiz{I},tz);
             /*             #define INNERFLOPS INNERFLOPS+6 */
 
-            /* #if GEOMETRY_J == 'Particle'             */
+            /* #if GEOMETRY_I == 'Particle'             */
             /*     #if ROUND == 'Loop' */
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
@@ -905,7 +905,7 @@ void
             /* #endfor */
             /* ## End of loop over i-j interaction pairs */
 
-            /* #if GEOMETRY_J != 'Particle' */
+            /* #if GEOMETRY_I != 'Particle' */
             /*     #if ROUND == 'Loop' */
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
@@ -919,7 +919,9 @@ void
             /*     #endif */
             /* #endif */
 
-            /* #if GEOMETRY_J == 'Water3'               */
+           /* #if 'Water' in GEOMETRY_I and GEOMETRY_J == 'Particle' */
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+            /* #elif GEOMETRY_J == 'Water3'               */
             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
             /*     #define INNERFLOPS INNERFLOPS+9      */