Merge release-4-6 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecEwSw_VdwNone_GeomW4W4_sse2_single.c
1 /*
2  * Note: this file was generated by the Gromacs sse2_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse2_single
38  * Electrostatics interaction: Ewald
39  * VdW interaction:            None
40  * Geometry:                   Water4-Water4
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse2_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
64     real             rcutoff_scalar;
65     real             *shiftvec,*fshift,*x,*f;
66     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
67     real             scratch[4*DIM];
68     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
69     int              vdwioffset1;
70     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71     int              vdwioffset2;
72     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
73     int              vdwioffset3;
74     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
75     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
76     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
77     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
78     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
79     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
80     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
81     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
82     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
83     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
84     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
85     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
86     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
87     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
88     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
89     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
90     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
91     real             *charge;
92     __m128i          ewitab;
93     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
94     real             *ewtab;
95     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
96     real             rswitch_scalar,d_scalar;
97     __m128           dummy_mask,cutoff_mask;
98     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
99     __m128           one     = _mm_set1_ps(1.0);
100     __m128           two     = _mm_set1_ps(2.0);
101     x                = xx[0];
102     f                = ff[0];
103
104     nri              = nlist->nri;
105     iinr             = nlist->iinr;
106     jindex           = nlist->jindex;
107     jjnr             = nlist->jjnr;
108     shiftidx         = nlist->shift;
109     gid              = nlist->gid;
110     shiftvec         = fr->shift_vec[0];
111     fshift           = fr->fshift[0];
112     facel            = _mm_set1_ps(fr->epsfac);
113     charge           = mdatoms->chargeA;
114
115     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
116     ewtab            = fr->ic->tabq_coul_FDV0;
117     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
118     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
119
120     /* Setup water-specific parameters */
121     inr              = nlist->iinr[0];
122     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
123     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
124     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
125
126     jq1              = _mm_set1_ps(charge[inr+1]);
127     jq2              = _mm_set1_ps(charge[inr+2]);
128     jq3              = _mm_set1_ps(charge[inr+3]);
129     qq11             = _mm_mul_ps(iq1,jq1);
130     qq12             = _mm_mul_ps(iq1,jq2);
131     qq13             = _mm_mul_ps(iq1,jq3);
132     qq21             = _mm_mul_ps(iq2,jq1);
133     qq22             = _mm_mul_ps(iq2,jq2);
134     qq23             = _mm_mul_ps(iq2,jq3);
135     qq31             = _mm_mul_ps(iq3,jq1);
136     qq32             = _mm_mul_ps(iq3,jq2);
137     qq33             = _mm_mul_ps(iq3,jq3);
138
139     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
140     rcutoff_scalar   = fr->rcoulomb;
141     rcutoff          = _mm_set1_ps(rcutoff_scalar);
142     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
143
144     rswitch_scalar   = fr->rcoulomb_switch;
145     rswitch          = _mm_set1_ps(rswitch_scalar);
146     /* Setup switch parameters */
147     d_scalar         = rcutoff_scalar-rswitch_scalar;
148     d                = _mm_set1_ps(d_scalar);
149     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
150     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
151     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
152     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
153     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
154     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
155
156     /* Avoid stupid compiler warnings */
157     jnrA = jnrB = jnrC = jnrD = 0;
158     j_coord_offsetA = 0;
159     j_coord_offsetB = 0;
160     j_coord_offsetC = 0;
161     j_coord_offsetD = 0;
162
163     outeriter        = 0;
164     inneriter        = 0;
165
166     for(iidx=0;iidx<4*DIM;iidx++)
167     {
168         scratch[iidx] = 0.0;
169     }  
170
171     /* Start outer loop over neighborlists */
172     for(iidx=0; iidx<nri; iidx++)
173     {
174         /* Load shift vector for this list */
175         i_shift_offset   = DIM*shiftidx[iidx];
176
177         /* Load limits for loop over neighbors */
178         j_index_start    = jindex[iidx];
179         j_index_end      = jindex[iidx+1];
180
181         /* Get outer coordinate index */
182         inr              = iinr[iidx];
183         i_coord_offset   = DIM*inr;
184
185         /* Load i particle coords and add shift vector */
186         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
187                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
188         
189         fix1             = _mm_setzero_ps();
190         fiy1             = _mm_setzero_ps();
191         fiz1             = _mm_setzero_ps();
192         fix2             = _mm_setzero_ps();
193         fiy2             = _mm_setzero_ps();
194         fiz2             = _mm_setzero_ps();
195         fix3             = _mm_setzero_ps();
196         fiy3             = _mm_setzero_ps();
197         fiz3             = _mm_setzero_ps();
198
199         /* Reset potential sums */
200         velecsum         = _mm_setzero_ps();
201
202         /* Start inner kernel loop */
203         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
204         {
205
206             /* Get j neighbor index, and coordinate index */
207             jnrA             = jjnr[jidx];
208             jnrB             = jjnr[jidx+1];
209             jnrC             = jjnr[jidx+2];
210             jnrD             = jjnr[jidx+3];
211             j_coord_offsetA  = DIM*jnrA;
212             j_coord_offsetB  = DIM*jnrB;
213             j_coord_offsetC  = DIM*jnrC;
214             j_coord_offsetD  = DIM*jnrD;
215
216             /* load j atom coordinates */
217             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
218                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
219                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
220
221             /* Calculate displacement vector */
222             dx11             = _mm_sub_ps(ix1,jx1);
223             dy11             = _mm_sub_ps(iy1,jy1);
224             dz11             = _mm_sub_ps(iz1,jz1);
225             dx12             = _mm_sub_ps(ix1,jx2);
226             dy12             = _mm_sub_ps(iy1,jy2);
227             dz12             = _mm_sub_ps(iz1,jz2);
228             dx13             = _mm_sub_ps(ix1,jx3);
229             dy13             = _mm_sub_ps(iy1,jy3);
230             dz13             = _mm_sub_ps(iz1,jz3);
231             dx21             = _mm_sub_ps(ix2,jx1);
232             dy21             = _mm_sub_ps(iy2,jy1);
233             dz21             = _mm_sub_ps(iz2,jz1);
234             dx22             = _mm_sub_ps(ix2,jx2);
235             dy22             = _mm_sub_ps(iy2,jy2);
236             dz22             = _mm_sub_ps(iz2,jz2);
237             dx23             = _mm_sub_ps(ix2,jx3);
238             dy23             = _mm_sub_ps(iy2,jy3);
239             dz23             = _mm_sub_ps(iz2,jz3);
240             dx31             = _mm_sub_ps(ix3,jx1);
241             dy31             = _mm_sub_ps(iy3,jy1);
242             dz31             = _mm_sub_ps(iz3,jz1);
243             dx32             = _mm_sub_ps(ix3,jx2);
244             dy32             = _mm_sub_ps(iy3,jy2);
245             dz32             = _mm_sub_ps(iz3,jz2);
246             dx33             = _mm_sub_ps(ix3,jx3);
247             dy33             = _mm_sub_ps(iy3,jy3);
248             dz33             = _mm_sub_ps(iz3,jz3);
249
250             /* Calculate squared distance and things based on it */
251             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
252             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
253             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
254             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
255             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
256             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
257             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
258             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
259             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
260
261             rinv11           = gmx_mm_invsqrt_ps(rsq11);
262             rinv12           = gmx_mm_invsqrt_ps(rsq12);
263             rinv13           = gmx_mm_invsqrt_ps(rsq13);
264             rinv21           = gmx_mm_invsqrt_ps(rsq21);
265             rinv22           = gmx_mm_invsqrt_ps(rsq22);
266             rinv23           = gmx_mm_invsqrt_ps(rsq23);
267             rinv31           = gmx_mm_invsqrt_ps(rsq31);
268             rinv32           = gmx_mm_invsqrt_ps(rsq32);
269             rinv33           = gmx_mm_invsqrt_ps(rsq33);
270
271             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
272             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
273             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
274             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
275             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
276             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
277             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
278             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
279             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
280
281             fjx1             = _mm_setzero_ps();
282             fjy1             = _mm_setzero_ps();
283             fjz1             = _mm_setzero_ps();
284             fjx2             = _mm_setzero_ps();
285             fjy2             = _mm_setzero_ps();
286             fjz2             = _mm_setzero_ps();
287             fjx3             = _mm_setzero_ps();
288             fjy3             = _mm_setzero_ps();
289             fjz3             = _mm_setzero_ps();
290
291             /**************************
292              * CALCULATE INTERACTIONS *
293              **************************/
294
295             if (gmx_mm_any_lt(rsq11,rcutoff2))
296             {
297
298             r11              = _mm_mul_ps(rsq11,rinv11);
299
300             /* EWALD ELECTROSTATICS */
301
302             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
303             ewrt             = _mm_mul_ps(r11,ewtabscale);
304             ewitab           = _mm_cvttps_epi32(ewrt);
305             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
306             ewitab           = _mm_slli_epi32(ewitab,2);
307             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
308             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
309             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
310             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
311             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
312             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
313             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
314             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
315             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
316
317             d                = _mm_sub_ps(r11,rswitch);
318             d                = _mm_max_ps(d,_mm_setzero_ps());
319             d2               = _mm_mul_ps(d,d);
320             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
321
322             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
323
324             /* Evaluate switch function */
325             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
326             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
327             velec            = _mm_mul_ps(velec,sw);
328             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
329
330             /* Update potential sum for this i atom from the interaction with this j atom. */
331             velec            = _mm_and_ps(velec,cutoff_mask);
332             velecsum         = _mm_add_ps(velecsum,velec);
333
334             fscal            = felec;
335
336             fscal            = _mm_and_ps(fscal,cutoff_mask);
337
338             /* Calculate temporary vectorial force */
339             tx               = _mm_mul_ps(fscal,dx11);
340             ty               = _mm_mul_ps(fscal,dy11);
341             tz               = _mm_mul_ps(fscal,dz11);
342
343             /* Update vectorial force */
344             fix1             = _mm_add_ps(fix1,tx);
345             fiy1             = _mm_add_ps(fiy1,ty);
346             fiz1             = _mm_add_ps(fiz1,tz);
347
348             fjx1             = _mm_add_ps(fjx1,tx);
349             fjy1             = _mm_add_ps(fjy1,ty);
350             fjz1             = _mm_add_ps(fjz1,tz);
351             
352             }
353
354             /**************************
355              * CALCULATE INTERACTIONS *
356              **************************/
357
358             if (gmx_mm_any_lt(rsq12,rcutoff2))
359             {
360
361             r12              = _mm_mul_ps(rsq12,rinv12);
362
363             /* EWALD ELECTROSTATICS */
364
365             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
366             ewrt             = _mm_mul_ps(r12,ewtabscale);
367             ewitab           = _mm_cvttps_epi32(ewrt);
368             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
369             ewitab           = _mm_slli_epi32(ewitab,2);
370             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
371             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
372             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
373             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
374             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
375             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
376             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
377             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
378             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
379
380             d                = _mm_sub_ps(r12,rswitch);
381             d                = _mm_max_ps(d,_mm_setzero_ps());
382             d2               = _mm_mul_ps(d,d);
383             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
384
385             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
386
387             /* Evaluate switch function */
388             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
389             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
390             velec            = _mm_mul_ps(velec,sw);
391             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
392
393             /* Update potential sum for this i atom from the interaction with this j atom. */
394             velec            = _mm_and_ps(velec,cutoff_mask);
395             velecsum         = _mm_add_ps(velecsum,velec);
396
397             fscal            = felec;
398
399             fscal            = _mm_and_ps(fscal,cutoff_mask);
400
401             /* Calculate temporary vectorial force */
402             tx               = _mm_mul_ps(fscal,dx12);
403             ty               = _mm_mul_ps(fscal,dy12);
404             tz               = _mm_mul_ps(fscal,dz12);
405
406             /* Update vectorial force */
407             fix1             = _mm_add_ps(fix1,tx);
408             fiy1             = _mm_add_ps(fiy1,ty);
409             fiz1             = _mm_add_ps(fiz1,tz);
410
411             fjx2             = _mm_add_ps(fjx2,tx);
412             fjy2             = _mm_add_ps(fjy2,ty);
413             fjz2             = _mm_add_ps(fjz2,tz);
414             
415             }
416
417             /**************************
418              * CALCULATE INTERACTIONS *
419              **************************/
420
421             if (gmx_mm_any_lt(rsq13,rcutoff2))
422             {
423
424             r13              = _mm_mul_ps(rsq13,rinv13);
425
426             /* EWALD ELECTROSTATICS */
427
428             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
429             ewrt             = _mm_mul_ps(r13,ewtabscale);
430             ewitab           = _mm_cvttps_epi32(ewrt);
431             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
432             ewitab           = _mm_slli_epi32(ewitab,2);
433             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
434             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
435             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
436             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
437             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
438             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
439             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
440             velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
441             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
442
443             d                = _mm_sub_ps(r13,rswitch);
444             d                = _mm_max_ps(d,_mm_setzero_ps());
445             d2               = _mm_mul_ps(d,d);
446             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
447
448             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
449
450             /* Evaluate switch function */
451             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
452             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
453             velec            = _mm_mul_ps(velec,sw);
454             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
455
456             /* Update potential sum for this i atom from the interaction with this j atom. */
457             velec            = _mm_and_ps(velec,cutoff_mask);
458             velecsum         = _mm_add_ps(velecsum,velec);
459
460             fscal            = felec;
461
462             fscal            = _mm_and_ps(fscal,cutoff_mask);
463
464             /* Calculate temporary vectorial force */
465             tx               = _mm_mul_ps(fscal,dx13);
466             ty               = _mm_mul_ps(fscal,dy13);
467             tz               = _mm_mul_ps(fscal,dz13);
468
469             /* Update vectorial force */
470             fix1             = _mm_add_ps(fix1,tx);
471             fiy1             = _mm_add_ps(fiy1,ty);
472             fiz1             = _mm_add_ps(fiz1,tz);
473
474             fjx3             = _mm_add_ps(fjx3,tx);
475             fjy3             = _mm_add_ps(fjy3,ty);
476             fjz3             = _mm_add_ps(fjz3,tz);
477             
478             }
479
480             /**************************
481              * CALCULATE INTERACTIONS *
482              **************************/
483
484             if (gmx_mm_any_lt(rsq21,rcutoff2))
485             {
486
487             r21              = _mm_mul_ps(rsq21,rinv21);
488
489             /* EWALD ELECTROSTATICS */
490
491             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
492             ewrt             = _mm_mul_ps(r21,ewtabscale);
493             ewitab           = _mm_cvttps_epi32(ewrt);
494             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
495             ewitab           = _mm_slli_epi32(ewitab,2);
496             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
497             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
498             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
499             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
500             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
501             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
502             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
503             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
504             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
505
506             d                = _mm_sub_ps(r21,rswitch);
507             d                = _mm_max_ps(d,_mm_setzero_ps());
508             d2               = _mm_mul_ps(d,d);
509             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
510
511             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
512
513             /* Evaluate switch function */
514             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
515             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
516             velec            = _mm_mul_ps(velec,sw);
517             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
518
519             /* Update potential sum for this i atom from the interaction with this j atom. */
520             velec            = _mm_and_ps(velec,cutoff_mask);
521             velecsum         = _mm_add_ps(velecsum,velec);
522
523             fscal            = felec;
524
525             fscal            = _mm_and_ps(fscal,cutoff_mask);
526
527             /* Calculate temporary vectorial force */
528             tx               = _mm_mul_ps(fscal,dx21);
529             ty               = _mm_mul_ps(fscal,dy21);
530             tz               = _mm_mul_ps(fscal,dz21);
531
532             /* Update vectorial force */
533             fix2             = _mm_add_ps(fix2,tx);
534             fiy2             = _mm_add_ps(fiy2,ty);
535             fiz2             = _mm_add_ps(fiz2,tz);
536
537             fjx1             = _mm_add_ps(fjx1,tx);
538             fjy1             = _mm_add_ps(fjy1,ty);
539             fjz1             = _mm_add_ps(fjz1,tz);
540             
541             }
542
543             /**************************
544              * CALCULATE INTERACTIONS *
545              **************************/
546
547             if (gmx_mm_any_lt(rsq22,rcutoff2))
548             {
549
550             r22              = _mm_mul_ps(rsq22,rinv22);
551
552             /* EWALD ELECTROSTATICS */
553
554             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
555             ewrt             = _mm_mul_ps(r22,ewtabscale);
556             ewitab           = _mm_cvttps_epi32(ewrt);
557             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
558             ewitab           = _mm_slli_epi32(ewitab,2);
559             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
560             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
561             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
562             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
563             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
564             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
565             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
566             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
567             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
568
569             d                = _mm_sub_ps(r22,rswitch);
570             d                = _mm_max_ps(d,_mm_setzero_ps());
571             d2               = _mm_mul_ps(d,d);
572             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
573
574             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
575
576             /* Evaluate switch function */
577             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
578             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
579             velec            = _mm_mul_ps(velec,sw);
580             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
581
582             /* Update potential sum for this i atom from the interaction with this j atom. */
583             velec            = _mm_and_ps(velec,cutoff_mask);
584             velecsum         = _mm_add_ps(velecsum,velec);
585
586             fscal            = felec;
587
588             fscal            = _mm_and_ps(fscal,cutoff_mask);
589
590             /* Calculate temporary vectorial force */
591             tx               = _mm_mul_ps(fscal,dx22);
592             ty               = _mm_mul_ps(fscal,dy22);
593             tz               = _mm_mul_ps(fscal,dz22);
594
595             /* Update vectorial force */
596             fix2             = _mm_add_ps(fix2,tx);
597             fiy2             = _mm_add_ps(fiy2,ty);
598             fiz2             = _mm_add_ps(fiz2,tz);
599
600             fjx2             = _mm_add_ps(fjx2,tx);
601             fjy2             = _mm_add_ps(fjy2,ty);
602             fjz2             = _mm_add_ps(fjz2,tz);
603             
604             }
605
606             /**************************
607              * CALCULATE INTERACTIONS *
608              **************************/
609
610             if (gmx_mm_any_lt(rsq23,rcutoff2))
611             {
612
613             r23              = _mm_mul_ps(rsq23,rinv23);
614
615             /* EWALD ELECTROSTATICS */
616
617             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
618             ewrt             = _mm_mul_ps(r23,ewtabscale);
619             ewitab           = _mm_cvttps_epi32(ewrt);
620             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
621             ewitab           = _mm_slli_epi32(ewitab,2);
622             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
623             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
624             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
625             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
626             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
627             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
628             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
629             velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
630             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
631
632             d                = _mm_sub_ps(r23,rswitch);
633             d                = _mm_max_ps(d,_mm_setzero_ps());
634             d2               = _mm_mul_ps(d,d);
635             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
636
637             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
638
639             /* Evaluate switch function */
640             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
641             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
642             velec            = _mm_mul_ps(velec,sw);
643             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
644
645             /* Update potential sum for this i atom from the interaction with this j atom. */
646             velec            = _mm_and_ps(velec,cutoff_mask);
647             velecsum         = _mm_add_ps(velecsum,velec);
648
649             fscal            = felec;
650
651             fscal            = _mm_and_ps(fscal,cutoff_mask);
652
653             /* Calculate temporary vectorial force */
654             tx               = _mm_mul_ps(fscal,dx23);
655             ty               = _mm_mul_ps(fscal,dy23);
656             tz               = _mm_mul_ps(fscal,dz23);
657
658             /* Update vectorial force */
659             fix2             = _mm_add_ps(fix2,tx);
660             fiy2             = _mm_add_ps(fiy2,ty);
661             fiz2             = _mm_add_ps(fiz2,tz);
662
663             fjx3             = _mm_add_ps(fjx3,tx);
664             fjy3             = _mm_add_ps(fjy3,ty);
665             fjz3             = _mm_add_ps(fjz3,tz);
666             
667             }
668
669             /**************************
670              * CALCULATE INTERACTIONS *
671              **************************/
672
673             if (gmx_mm_any_lt(rsq31,rcutoff2))
674             {
675
676             r31              = _mm_mul_ps(rsq31,rinv31);
677
678             /* EWALD ELECTROSTATICS */
679
680             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
681             ewrt             = _mm_mul_ps(r31,ewtabscale);
682             ewitab           = _mm_cvttps_epi32(ewrt);
683             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
684             ewitab           = _mm_slli_epi32(ewitab,2);
685             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
686             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
687             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
688             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
689             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
690             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
691             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
692             velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
693             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
694
695             d                = _mm_sub_ps(r31,rswitch);
696             d                = _mm_max_ps(d,_mm_setzero_ps());
697             d2               = _mm_mul_ps(d,d);
698             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
699
700             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
701
702             /* Evaluate switch function */
703             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
704             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
705             velec            = _mm_mul_ps(velec,sw);
706             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
707
708             /* Update potential sum for this i atom from the interaction with this j atom. */
709             velec            = _mm_and_ps(velec,cutoff_mask);
710             velecsum         = _mm_add_ps(velecsum,velec);
711
712             fscal            = felec;
713
714             fscal            = _mm_and_ps(fscal,cutoff_mask);
715
716             /* Calculate temporary vectorial force */
717             tx               = _mm_mul_ps(fscal,dx31);
718             ty               = _mm_mul_ps(fscal,dy31);
719             tz               = _mm_mul_ps(fscal,dz31);
720
721             /* Update vectorial force */
722             fix3             = _mm_add_ps(fix3,tx);
723             fiy3             = _mm_add_ps(fiy3,ty);
724             fiz3             = _mm_add_ps(fiz3,tz);
725
726             fjx1             = _mm_add_ps(fjx1,tx);
727             fjy1             = _mm_add_ps(fjy1,ty);
728             fjz1             = _mm_add_ps(fjz1,tz);
729             
730             }
731
732             /**************************
733              * CALCULATE INTERACTIONS *
734              **************************/
735
736             if (gmx_mm_any_lt(rsq32,rcutoff2))
737             {
738
739             r32              = _mm_mul_ps(rsq32,rinv32);
740
741             /* EWALD ELECTROSTATICS */
742
743             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
744             ewrt             = _mm_mul_ps(r32,ewtabscale);
745             ewitab           = _mm_cvttps_epi32(ewrt);
746             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
747             ewitab           = _mm_slli_epi32(ewitab,2);
748             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
749             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
750             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
751             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
752             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
753             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
754             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
755             velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
756             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
757
758             d                = _mm_sub_ps(r32,rswitch);
759             d                = _mm_max_ps(d,_mm_setzero_ps());
760             d2               = _mm_mul_ps(d,d);
761             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
762
763             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
764
765             /* Evaluate switch function */
766             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
767             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
768             velec            = _mm_mul_ps(velec,sw);
769             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
770
771             /* Update potential sum for this i atom from the interaction with this j atom. */
772             velec            = _mm_and_ps(velec,cutoff_mask);
773             velecsum         = _mm_add_ps(velecsum,velec);
774
775             fscal            = felec;
776
777             fscal            = _mm_and_ps(fscal,cutoff_mask);
778
779             /* Calculate temporary vectorial force */
780             tx               = _mm_mul_ps(fscal,dx32);
781             ty               = _mm_mul_ps(fscal,dy32);
782             tz               = _mm_mul_ps(fscal,dz32);
783
784             /* Update vectorial force */
785             fix3             = _mm_add_ps(fix3,tx);
786             fiy3             = _mm_add_ps(fiy3,ty);
787             fiz3             = _mm_add_ps(fiz3,tz);
788
789             fjx2             = _mm_add_ps(fjx2,tx);
790             fjy2             = _mm_add_ps(fjy2,ty);
791             fjz2             = _mm_add_ps(fjz2,tz);
792             
793             }
794
795             /**************************
796              * CALCULATE INTERACTIONS *
797              **************************/
798
799             if (gmx_mm_any_lt(rsq33,rcutoff2))
800             {
801
802             r33              = _mm_mul_ps(rsq33,rinv33);
803
804             /* EWALD ELECTROSTATICS */
805
806             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
807             ewrt             = _mm_mul_ps(r33,ewtabscale);
808             ewitab           = _mm_cvttps_epi32(ewrt);
809             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
810             ewitab           = _mm_slli_epi32(ewitab,2);
811             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
812             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
813             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
814             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
815             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
816             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
817             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
818             velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
819             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
820
821             d                = _mm_sub_ps(r33,rswitch);
822             d                = _mm_max_ps(d,_mm_setzero_ps());
823             d2               = _mm_mul_ps(d,d);
824             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
825
826             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
827
828             /* Evaluate switch function */
829             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
830             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
831             velec            = _mm_mul_ps(velec,sw);
832             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
833
834             /* Update potential sum for this i atom from the interaction with this j atom. */
835             velec            = _mm_and_ps(velec,cutoff_mask);
836             velecsum         = _mm_add_ps(velecsum,velec);
837
838             fscal            = felec;
839
840             fscal            = _mm_and_ps(fscal,cutoff_mask);
841
842             /* Calculate temporary vectorial force */
843             tx               = _mm_mul_ps(fscal,dx33);
844             ty               = _mm_mul_ps(fscal,dy33);
845             tz               = _mm_mul_ps(fscal,dz33);
846
847             /* Update vectorial force */
848             fix3             = _mm_add_ps(fix3,tx);
849             fiy3             = _mm_add_ps(fiy3,ty);
850             fiz3             = _mm_add_ps(fiz3,tz);
851
852             fjx3             = _mm_add_ps(fjx3,tx);
853             fjy3             = _mm_add_ps(fjy3,ty);
854             fjz3             = _mm_add_ps(fjz3,tz);
855             
856             }
857
858             fjptrA             = f+j_coord_offsetA;
859             fjptrB             = f+j_coord_offsetB;
860             fjptrC             = f+j_coord_offsetC;
861             fjptrD             = f+j_coord_offsetD;
862
863             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
864                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
865
866             /* Inner loop uses 585 flops */
867         }
868
869         if(jidx<j_index_end)
870         {
871
872             /* Get j neighbor index, and coordinate index */
873             jnrlistA         = jjnr[jidx];
874             jnrlistB         = jjnr[jidx+1];
875             jnrlistC         = jjnr[jidx+2];
876             jnrlistD         = jjnr[jidx+3];
877             /* Sign of each element will be negative for non-real atoms.
878              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
879              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
880              */
881             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
882             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
883             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
884             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
885             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
886             j_coord_offsetA  = DIM*jnrA;
887             j_coord_offsetB  = DIM*jnrB;
888             j_coord_offsetC  = DIM*jnrC;
889             j_coord_offsetD  = DIM*jnrD;
890
891             /* load j atom coordinates */
892             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
893                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
894                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
895
896             /* Calculate displacement vector */
897             dx11             = _mm_sub_ps(ix1,jx1);
898             dy11             = _mm_sub_ps(iy1,jy1);
899             dz11             = _mm_sub_ps(iz1,jz1);
900             dx12             = _mm_sub_ps(ix1,jx2);
901             dy12             = _mm_sub_ps(iy1,jy2);
902             dz12             = _mm_sub_ps(iz1,jz2);
903             dx13             = _mm_sub_ps(ix1,jx3);
904             dy13             = _mm_sub_ps(iy1,jy3);
905             dz13             = _mm_sub_ps(iz1,jz3);
906             dx21             = _mm_sub_ps(ix2,jx1);
907             dy21             = _mm_sub_ps(iy2,jy1);
908             dz21             = _mm_sub_ps(iz2,jz1);
909             dx22             = _mm_sub_ps(ix2,jx2);
910             dy22             = _mm_sub_ps(iy2,jy2);
911             dz22             = _mm_sub_ps(iz2,jz2);
912             dx23             = _mm_sub_ps(ix2,jx3);
913             dy23             = _mm_sub_ps(iy2,jy3);
914             dz23             = _mm_sub_ps(iz2,jz3);
915             dx31             = _mm_sub_ps(ix3,jx1);
916             dy31             = _mm_sub_ps(iy3,jy1);
917             dz31             = _mm_sub_ps(iz3,jz1);
918             dx32             = _mm_sub_ps(ix3,jx2);
919             dy32             = _mm_sub_ps(iy3,jy2);
920             dz32             = _mm_sub_ps(iz3,jz2);
921             dx33             = _mm_sub_ps(ix3,jx3);
922             dy33             = _mm_sub_ps(iy3,jy3);
923             dz33             = _mm_sub_ps(iz3,jz3);
924
925             /* Calculate squared distance and things based on it */
926             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
927             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
928             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
929             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
930             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
931             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
932             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
933             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
934             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
935
936             rinv11           = gmx_mm_invsqrt_ps(rsq11);
937             rinv12           = gmx_mm_invsqrt_ps(rsq12);
938             rinv13           = gmx_mm_invsqrt_ps(rsq13);
939             rinv21           = gmx_mm_invsqrt_ps(rsq21);
940             rinv22           = gmx_mm_invsqrt_ps(rsq22);
941             rinv23           = gmx_mm_invsqrt_ps(rsq23);
942             rinv31           = gmx_mm_invsqrt_ps(rsq31);
943             rinv32           = gmx_mm_invsqrt_ps(rsq32);
944             rinv33           = gmx_mm_invsqrt_ps(rsq33);
945
946             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
947             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
948             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
949             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
950             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
951             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
952             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
953             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
954             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
955
956             fjx1             = _mm_setzero_ps();
957             fjy1             = _mm_setzero_ps();
958             fjz1             = _mm_setzero_ps();
959             fjx2             = _mm_setzero_ps();
960             fjy2             = _mm_setzero_ps();
961             fjz2             = _mm_setzero_ps();
962             fjx3             = _mm_setzero_ps();
963             fjy3             = _mm_setzero_ps();
964             fjz3             = _mm_setzero_ps();
965
966             /**************************
967              * CALCULATE INTERACTIONS *
968              **************************/
969
970             if (gmx_mm_any_lt(rsq11,rcutoff2))
971             {
972
973             r11              = _mm_mul_ps(rsq11,rinv11);
974             r11              = _mm_andnot_ps(dummy_mask,r11);
975
976             /* EWALD ELECTROSTATICS */
977
978             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
979             ewrt             = _mm_mul_ps(r11,ewtabscale);
980             ewitab           = _mm_cvttps_epi32(ewrt);
981             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
982             ewitab           = _mm_slli_epi32(ewitab,2);
983             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
984             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
985             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
986             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
987             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
988             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
989             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
990             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
991             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
992
993             d                = _mm_sub_ps(r11,rswitch);
994             d                = _mm_max_ps(d,_mm_setzero_ps());
995             d2               = _mm_mul_ps(d,d);
996             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
997
998             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
999
1000             /* Evaluate switch function */
1001             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1002             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1003             velec            = _mm_mul_ps(velec,sw);
1004             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1005
1006             /* Update potential sum for this i atom from the interaction with this j atom. */
1007             velec            = _mm_and_ps(velec,cutoff_mask);
1008             velec            = _mm_andnot_ps(dummy_mask,velec);
1009             velecsum         = _mm_add_ps(velecsum,velec);
1010
1011             fscal            = felec;
1012
1013             fscal            = _mm_and_ps(fscal,cutoff_mask);
1014
1015             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1016
1017             /* Calculate temporary vectorial force */
1018             tx               = _mm_mul_ps(fscal,dx11);
1019             ty               = _mm_mul_ps(fscal,dy11);
1020             tz               = _mm_mul_ps(fscal,dz11);
1021
1022             /* Update vectorial force */
1023             fix1             = _mm_add_ps(fix1,tx);
1024             fiy1             = _mm_add_ps(fiy1,ty);
1025             fiz1             = _mm_add_ps(fiz1,tz);
1026
1027             fjx1             = _mm_add_ps(fjx1,tx);
1028             fjy1             = _mm_add_ps(fjy1,ty);
1029             fjz1             = _mm_add_ps(fjz1,tz);
1030             
1031             }
1032
1033             /**************************
1034              * CALCULATE INTERACTIONS *
1035              **************************/
1036
1037             if (gmx_mm_any_lt(rsq12,rcutoff2))
1038             {
1039
1040             r12              = _mm_mul_ps(rsq12,rinv12);
1041             r12              = _mm_andnot_ps(dummy_mask,r12);
1042
1043             /* EWALD ELECTROSTATICS */
1044
1045             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1046             ewrt             = _mm_mul_ps(r12,ewtabscale);
1047             ewitab           = _mm_cvttps_epi32(ewrt);
1048             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1049             ewitab           = _mm_slli_epi32(ewitab,2);
1050             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1051             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1052             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1053             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1054             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1055             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1056             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1057             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1058             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1059
1060             d                = _mm_sub_ps(r12,rswitch);
1061             d                = _mm_max_ps(d,_mm_setzero_ps());
1062             d2               = _mm_mul_ps(d,d);
1063             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1064
1065             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1066
1067             /* Evaluate switch function */
1068             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1069             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1070             velec            = _mm_mul_ps(velec,sw);
1071             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1072
1073             /* Update potential sum for this i atom from the interaction with this j atom. */
1074             velec            = _mm_and_ps(velec,cutoff_mask);
1075             velec            = _mm_andnot_ps(dummy_mask,velec);
1076             velecsum         = _mm_add_ps(velecsum,velec);
1077
1078             fscal            = felec;
1079
1080             fscal            = _mm_and_ps(fscal,cutoff_mask);
1081
1082             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1083
1084             /* Calculate temporary vectorial force */
1085             tx               = _mm_mul_ps(fscal,dx12);
1086             ty               = _mm_mul_ps(fscal,dy12);
1087             tz               = _mm_mul_ps(fscal,dz12);
1088
1089             /* Update vectorial force */
1090             fix1             = _mm_add_ps(fix1,tx);
1091             fiy1             = _mm_add_ps(fiy1,ty);
1092             fiz1             = _mm_add_ps(fiz1,tz);
1093
1094             fjx2             = _mm_add_ps(fjx2,tx);
1095             fjy2             = _mm_add_ps(fjy2,ty);
1096             fjz2             = _mm_add_ps(fjz2,tz);
1097             
1098             }
1099
1100             /**************************
1101              * CALCULATE INTERACTIONS *
1102              **************************/
1103
1104             if (gmx_mm_any_lt(rsq13,rcutoff2))
1105             {
1106
1107             r13              = _mm_mul_ps(rsq13,rinv13);
1108             r13              = _mm_andnot_ps(dummy_mask,r13);
1109
1110             /* EWALD ELECTROSTATICS */
1111
1112             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1113             ewrt             = _mm_mul_ps(r13,ewtabscale);
1114             ewitab           = _mm_cvttps_epi32(ewrt);
1115             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1116             ewitab           = _mm_slli_epi32(ewitab,2);
1117             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1118             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1119             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1120             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1121             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1122             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1123             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1124             velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
1125             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1126
1127             d                = _mm_sub_ps(r13,rswitch);
1128             d                = _mm_max_ps(d,_mm_setzero_ps());
1129             d2               = _mm_mul_ps(d,d);
1130             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1131
1132             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1133
1134             /* Evaluate switch function */
1135             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1136             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
1137             velec            = _mm_mul_ps(velec,sw);
1138             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
1139
1140             /* Update potential sum for this i atom from the interaction with this j atom. */
1141             velec            = _mm_and_ps(velec,cutoff_mask);
1142             velec            = _mm_andnot_ps(dummy_mask,velec);
1143             velecsum         = _mm_add_ps(velecsum,velec);
1144
1145             fscal            = felec;
1146
1147             fscal            = _mm_and_ps(fscal,cutoff_mask);
1148
1149             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1150
1151             /* Calculate temporary vectorial force */
1152             tx               = _mm_mul_ps(fscal,dx13);
1153             ty               = _mm_mul_ps(fscal,dy13);
1154             tz               = _mm_mul_ps(fscal,dz13);
1155
1156             /* Update vectorial force */
1157             fix1             = _mm_add_ps(fix1,tx);
1158             fiy1             = _mm_add_ps(fiy1,ty);
1159             fiz1             = _mm_add_ps(fiz1,tz);
1160
1161             fjx3             = _mm_add_ps(fjx3,tx);
1162             fjy3             = _mm_add_ps(fjy3,ty);
1163             fjz3             = _mm_add_ps(fjz3,tz);
1164             
1165             }
1166
1167             /**************************
1168              * CALCULATE INTERACTIONS *
1169              **************************/
1170
1171             if (gmx_mm_any_lt(rsq21,rcutoff2))
1172             {
1173
1174             r21              = _mm_mul_ps(rsq21,rinv21);
1175             r21              = _mm_andnot_ps(dummy_mask,r21);
1176
1177             /* EWALD ELECTROSTATICS */
1178
1179             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1180             ewrt             = _mm_mul_ps(r21,ewtabscale);
1181             ewitab           = _mm_cvttps_epi32(ewrt);
1182             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1183             ewitab           = _mm_slli_epi32(ewitab,2);
1184             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1185             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1186             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1187             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1188             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1189             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1190             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1191             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1192             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1193
1194             d                = _mm_sub_ps(r21,rswitch);
1195             d                = _mm_max_ps(d,_mm_setzero_ps());
1196             d2               = _mm_mul_ps(d,d);
1197             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1198
1199             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1200
1201             /* Evaluate switch function */
1202             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1203             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1204             velec            = _mm_mul_ps(velec,sw);
1205             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1206
1207             /* Update potential sum for this i atom from the interaction with this j atom. */
1208             velec            = _mm_and_ps(velec,cutoff_mask);
1209             velec            = _mm_andnot_ps(dummy_mask,velec);
1210             velecsum         = _mm_add_ps(velecsum,velec);
1211
1212             fscal            = felec;
1213
1214             fscal            = _mm_and_ps(fscal,cutoff_mask);
1215
1216             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1217
1218             /* Calculate temporary vectorial force */
1219             tx               = _mm_mul_ps(fscal,dx21);
1220             ty               = _mm_mul_ps(fscal,dy21);
1221             tz               = _mm_mul_ps(fscal,dz21);
1222
1223             /* Update vectorial force */
1224             fix2             = _mm_add_ps(fix2,tx);
1225             fiy2             = _mm_add_ps(fiy2,ty);
1226             fiz2             = _mm_add_ps(fiz2,tz);
1227
1228             fjx1             = _mm_add_ps(fjx1,tx);
1229             fjy1             = _mm_add_ps(fjy1,ty);
1230             fjz1             = _mm_add_ps(fjz1,tz);
1231             
1232             }
1233
1234             /**************************
1235              * CALCULATE INTERACTIONS *
1236              **************************/
1237
1238             if (gmx_mm_any_lt(rsq22,rcutoff2))
1239             {
1240
1241             r22              = _mm_mul_ps(rsq22,rinv22);
1242             r22              = _mm_andnot_ps(dummy_mask,r22);
1243
1244             /* EWALD ELECTROSTATICS */
1245
1246             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1247             ewrt             = _mm_mul_ps(r22,ewtabscale);
1248             ewitab           = _mm_cvttps_epi32(ewrt);
1249             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1250             ewitab           = _mm_slli_epi32(ewitab,2);
1251             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1252             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1253             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1254             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1255             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1256             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1257             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1258             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1259             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1260
1261             d                = _mm_sub_ps(r22,rswitch);
1262             d                = _mm_max_ps(d,_mm_setzero_ps());
1263             d2               = _mm_mul_ps(d,d);
1264             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1265
1266             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1267
1268             /* Evaluate switch function */
1269             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1270             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1271             velec            = _mm_mul_ps(velec,sw);
1272             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1273
1274             /* Update potential sum for this i atom from the interaction with this j atom. */
1275             velec            = _mm_and_ps(velec,cutoff_mask);
1276             velec            = _mm_andnot_ps(dummy_mask,velec);
1277             velecsum         = _mm_add_ps(velecsum,velec);
1278
1279             fscal            = felec;
1280
1281             fscal            = _mm_and_ps(fscal,cutoff_mask);
1282
1283             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1284
1285             /* Calculate temporary vectorial force */
1286             tx               = _mm_mul_ps(fscal,dx22);
1287             ty               = _mm_mul_ps(fscal,dy22);
1288             tz               = _mm_mul_ps(fscal,dz22);
1289
1290             /* Update vectorial force */
1291             fix2             = _mm_add_ps(fix2,tx);
1292             fiy2             = _mm_add_ps(fiy2,ty);
1293             fiz2             = _mm_add_ps(fiz2,tz);
1294
1295             fjx2             = _mm_add_ps(fjx2,tx);
1296             fjy2             = _mm_add_ps(fjy2,ty);
1297             fjz2             = _mm_add_ps(fjz2,tz);
1298             
1299             }
1300
1301             /**************************
1302              * CALCULATE INTERACTIONS *
1303              **************************/
1304
1305             if (gmx_mm_any_lt(rsq23,rcutoff2))
1306             {
1307
1308             r23              = _mm_mul_ps(rsq23,rinv23);
1309             r23              = _mm_andnot_ps(dummy_mask,r23);
1310
1311             /* EWALD ELECTROSTATICS */
1312
1313             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1314             ewrt             = _mm_mul_ps(r23,ewtabscale);
1315             ewitab           = _mm_cvttps_epi32(ewrt);
1316             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1317             ewitab           = _mm_slli_epi32(ewitab,2);
1318             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1319             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1320             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1321             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1322             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1323             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1324             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1325             velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
1326             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1327
1328             d                = _mm_sub_ps(r23,rswitch);
1329             d                = _mm_max_ps(d,_mm_setzero_ps());
1330             d2               = _mm_mul_ps(d,d);
1331             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1332
1333             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1334
1335             /* Evaluate switch function */
1336             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1337             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
1338             velec            = _mm_mul_ps(velec,sw);
1339             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
1340
1341             /* Update potential sum for this i atom from the interaction with this j atom. */
1342             velec            = _mm_and_ps(velec,cutoff_mask);
1343             velec            = _mm_andnot_ps(dummy_mask,velec);
1344             velecsum         = _mm_add_ps(velecsum,velec);
1345
1346             fscal            = felec;
1347
1348             fscal            = _mm_and_ps(fscal,cutoff_mask);
1349
1350             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1351
1352             /* Calculate temporary vectorial force */
1353             tx               = _mm_mul_ps(fscal,dx23);
1354             ty               = _mm_mul_ps(fscal,dy23);
1355             tz               = _mm_mul_ps(fscal,dz23);
1356
1357             /* Update vectorial force */
1358             fix2             = _mm_add_ps(fix2,tx);
1359             fiy2             = _mm_add_ps(fiy2,ty);
1360             fiz2             = _mm_add_ps(fiz2,tz);
1361
1362             fjx3             = _mm_add_ps(fjx3,tx);
1363             fjy3             = _mm_add_ps(fjy3,ty);
1364             fjz3             = _mm_add_ps(fjz3,tz);
1365             
1366             }
1367
1368             /**************************
1369              * CALCULATE INTERACTIONS *
1370              **************************/
1371
1372             if (gmx_mm_any_lt(rsq31,rcutoff2))
1373             {
1374
1375             r31              = _mm_mul_ps(rsq31,rinv31);
1376             r31              = _mm_andnot_ps(dummy_mask,r31);
1377
1378             /* EWALD ELECTROSTATICS */
1379
1380             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1381             ewrt             = _mm_mul_ps(r31,ewtabscale);
1382             ewitab           = _mm_cvttps_epi32(ewrt);
1383             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1384             ewitab           = _mm_slli_epi32(ewitab,2);
1385             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1386             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1387             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1388             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1389             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1390             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1391             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1392             velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
1393             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1394
1395             d                = _mm_sub_ps(r31,rswitch);
1396             d                = _mm_max_ps(d,_mm_setzero_ps());
1397             d2               = _mm_mul_ps(d,d);
1398             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1399
1400             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1401
1402             /* Evaluate switch function */
1403             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1404             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
1405             velec            = _mm_mul_ps(velec,sw);
1406             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
1407
1408             /* Update potential sum for this i atom from the interaction with this j atom. */
1409             velec            = _mm_and_ps(velec,cutoff_mask);
1410             velec            = _mm_andnot_ps(dummy_mask,velec);
1411             velecsum         = _mm_add_ps(velecsum,velec);
1412
1413             fscal            = felec;
1414
1415             fscal            = _mm_and_ps(fscal,cutoff_mask);
1416
1417             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1418
1419             /* Calculate temporary vectorial force */
1420             tx               = _mm_mul_ps(fscal,dx31);
1421             ty               = _mm_mul_ps(fscal,dy31);
1422             tz               = _mm_mul_ps(fscal,dz31);
1423
1424             /* Update vectorial force */
1425             fix3             = _mm_add_ps(fix3,tx);
1426             fiy3             = _mm_add_ps(fiy3,ty);
1427             fiz3             = _mm_add_ps(fiz3,tz);
1428
1429             fjx1             = _mm_add_ps(fjx1,tx);
1430             fjy1             = _mm_add_ps(fjy1,ty);
1431             fjz1             = _mm_add_ps(fjz1,tz);
1432             
1433             }
1434
1435             /**************************
1436              * CALCULATE INTERACTIONS *
1437              **************************/
1438
1439             if (gmx_mm_any_lt(rsq32,rcutoff2))
1440             {
1441
1442             r32              = _mm_mul_ps(rsq32,rinv32);
1443             r32              = _mm_andnot_ps(dummy_mask,r32);
1444
1445             /* EWALD ELECTROSTATICS */
1446
1447             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1448             ewrt             = _mm_mul_ps(r32,ewtabscale);
1449             ewitab           = _mm_cvttps_epi32(ewrt);
1450             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1451             ewitab           = _mm_slli_epi32(ewitab,2);
1452             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1453             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1454             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1455             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1456             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1457             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1458             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1459             velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
1460             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1461
1462             d                = _mm_sub_ps(r32,rswitch);
1463             d                = _mm_max_ps(d,_mm_setzero_ps());
1464             d2               = _mm_mul_ps(d,d);
1465             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1466
1467             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1468
1469             /* Evaluate switch function */
1470             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1471             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
1472             velec            = _mm_mul_ps(velec,sw);
1473             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
1474
1475             /* Update potential sum for this i atom from the interaction with this j atom. */
1476             velec            = _mm_and_ps(velec,cutoff_mask);
1477             velec            = _mm_andnot_ps(dummy_mask,velec);
1478             velecsum         = _mm_add_ps(velecsum,velec);
1479
1480             fscal            = felec;
1481
1482             fscal            = _mm_and_ps(fscal,cutoff_mask);
1483
1484             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1485
1486             /* Calculate temporary vectorial force */
1487             tx               = _mm_mul_ps(fscal,dx32);
1488             ty               = _mm_mul_ps(fscal,dy32);
1489             tz               = _mm_mul_ps(fscal,dz32);
1490
1491             /* Update vectorial force */
1492             fix3             = _mm_add_ps(fix3,tx);
1493             fiy3             = _mm_add_ps(fiy3,ty);
1494             fiz3             = _mm_add_ps(fiz3,tz);
1495
1496             fjx2             = _mm_add_ps(fjx2,tx);
1497             fjy2             = _mm_add_ps(fjy2,ty);
1498             fjz2             = _mm_add_ps(fjz2,tz);
1499             
1500             }
1501
1502             /**************************
1503              * CALCULATE INTERACTIONS *
1504              **************************/
1505
1506             if (gmx_mm_any_lt(rsq33,rcutoff2))
1507             {
1508
1509             r33              = _mm_mul_ps(rsq33,rinv33);
1510             r33              = _mm_andnot_ps(dummy_mask,r33);
1511
1512             /* EWALD ELECTROSTATICS */
1513
1514             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1515             ewrt             = _mm_mul_ps(r33,ewtabscale);
1516             ewitab           = _mm_cvttps_epi32(ewrt);
1517             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1518             ewitab           = _mm_slli_epi32(ewitab,2);
1519             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1520             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1521             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1522             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1523             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1524             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1525             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1526             velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
1527             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1528
1529             d                = _mm_sub_ps(r33,rswitch);
1530             d                = _mm_max_ps(d,_mm_setzero_ps());
1531             d2               = _mm_mul_ps(d,d);
1532             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1533
1534             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1535
1536             /* Evaluate switch function */
1537             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1538             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
1539             velec            = _mm_mul_ps(velec,sw);
1540             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
1541
1542             /* Update potential sum for this i atom from the interaction with this j atom. */
1543             velec            = _mm_and_ps(velec,cutoff_mask);
1544             velec            = _mm_andnot_ps(dummy_mask,velec);
1545             velecsum         = _mm_add_ps(velecsum,velec);
1546
1547             fscal            = felec;
1548
1549             fscal            = _mm_and_ps(fscal,cutoff_mask);
1550
1551             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1552
1553             /* Calculate temporary vectorial force */
1554             tx               = _mm_mul_ps(fscal,dx33);
1555             ty               = _mm_mul_ps(fscal,dy33);
1556             tz               = _mm_mul_ps(fscal,dz33);
1557
1558             /* Update vectorial force */
1559             fix3             = _mm_add_ps(fix3,tx);
1560             fiy3             = _mm_add_ps(fiy3,ty);
1561             fiz3             = _mm_add_ps(fiz3,tz);
1562
1563             fjx3             = _mm_add_ps(fjx3,tx);
1564             fjy3             = _mm_add_ps(fjy3,ty);
1565             fjz3             = _mm_add_ps(fjz3,tz);
1566             
1567             }
1568
1569             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1570             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1571             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1572             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1573
1574             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1575                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1576
1577             /* Inner loop uses 594 flops */
1578         }
1579
1580         /* End of innermost loop */
1581
1582         gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1583                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
1584
1585         ggid                        = gid[iidx];
1586         /* Update potential energies */
1587         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1588
1589         /* Increment number of inner iterations */
1590         inneriter                  += j_index_end - j_index_start;
1591
1592         /* Outer loop uses 19 flops */
1593     }
1594
1595     /* Increment number of outer iterations */
1596     outeriter        += nri;
1597
1598     /* Update outer/inner flops */
1599
1600     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*594);
1601 }
1602 /*
1603  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse2_single
1604  * Electrostatics interaction: Ewald
1605  * VdW interaction:            None
1606  * Geometry:                   Water4-Water4
1607  * Calculate force/pot:        Force
1608  */
1609 void
1610 nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse2_single
1611                     (t_nblist * gmx_restrict                nlist,
1612                      rvec * gmx_restrict                    xx,
1613                      rvec * gmx_restrict                    ff,
1614                      t_forcerec * gmx_restrict              fr,
1615                      t_mdatoms * gmx_restrict               mdatoms,
1616                      nb_kernel_data_t * gmx_restrict        kernel_data,
1617                      t_nrnb * gmx_restrict                  nrnb)
1618 {
1619     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1620      * just 0 for non-waters.
1621      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1622      * jnr indices corresponding to data put in the four positions in the SIMD register.
1623      */
1624     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1625     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1626     int              jnrA,jnrB,jnrC,jnrD;
1627     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1628     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1629     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1630     real             rcutoff_scalar;
1631     real             *shiftvec,*fshift,*x,*f;
1632     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1633     real             scratch[4*DIM];
1634     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1635     int              vdwioffset1;
1636     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1637     int              vdwioffset2;
1638     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1639     int              vdwioffset3;
1640     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1641     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1642     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1643     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1644     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1645     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1646     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1647     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1648     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1649     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1650     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1651     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1652     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1653     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1654     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1655     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1656     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1657     real             *charge;
1658     __m128i          ewitab;
1659     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1660     real             *ewtab;
1661     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1662     real             rswitch_scalar,d_scalar;
1663     __m128           dummy_mask,cutoff_mask;
1664     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1665     __m128           one     = _mm_set1_ps(1.0);
1666     __m128           two     = _mm_set1_ps(2.0);
1667     x                = xx[0];
1668     f                = ff[0];
1669
1670     nri              = nlist->nri;
1671     iinr             = nlist->iinr;
1672     jindex           = nlist->jindex;
1673     jjnr             = nlist->jjnr;
1674     shiftidx         = nlist->shift;
1675     gid              = nlist->gid;
1676     shiftvec         = fr->shift_vec[0];
1677     fshift           = fr->fshift[0];
1678     facel            = _mm_set1_ps(fr->epsfac);
1679     charge           = mdatoms->chargeA;
1680
1681     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1682     ewtab            = fr->ic->tabq_coul_FDV0;
1683     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1684     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1685
1686     /* Setup water-specific parameters */
1687     inr              = nlist->iinr[0];
1688     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1689     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1690     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1691
1692     jq1              = _mm_set1_ps(charge[inr+1]);
1693     jq2              = _mm_set1_ps(charge[inr+2]);
1694     jq3              = _mm_set1_ps(charge[inr+3]);
1695     qq11             = _mm_mul_ps(iq1,jq1);
1696     qq12             = _mm_mul_ps(iq1,jq2);
1697     qq13             = _mm_mul_ps(iq1,jq3);
1698     qq21             = _mm_mul_ps(iq2,jq1);
1699     qq22             = _mm_mul_ps(iq2,jq2);
1700     qq23             = _mm_mul_ps(iq2,jq3);
1701     qq31             = _mm_mul_ps(iq3,jq1);
1702     qq32             = _mm_mul_ps(iq3,jq2);
1703     qq33             = _mm_mul_ps(iq3,jq3);
1704
1705     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1706     rcutoff_scalar   = fr->rcoulomb;
1707     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1708     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1709
1710     rswitch_scalar   = fr->rcoulomb_switch;
1711     rswitch          = _mm_set1_ps(rswitch_scalar);
1712     /* Setup switch parameters */
1713     d_scalar         = rcutoff_scalar-rswitch_scalar;
1714     d                = _mm_set1_ps(d_scalar);
1715     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1716     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1717     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1718     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1719     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1720     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1721
1722     /* Avoid stupid compiler warnings */
1723     jnrA = jnrB = jnrC = jnrD = 0;
1724     j_coord_offsetA = 0;
1725     j_coord_offsetB = 0;
1726     j_coord_offsetC = 0;
1727     j_coord_offsetD = 0;
1728
1729     outeriter        = 0;
1730     inneriter        = 0;
1731
1732     for(iidx=0;iidx<4*DIM;iidx++)
1733     {
1734         scratch[iidx] = 0.0;
1735     }  
1736
1737     /* Start outer loop over neighborlists */
1738     for(iidx=0; iidx<nri; iidx++)
1739     {
1740         /* Load shift vector for this list */
1741         i_shift_offset   = DIM*shiftidx[iidx];
1742
1743         /* Load limits for loop over neighbors */
1744         j_index_start    = jindex[iidx];
1745         j_index_end      = jindex[iidx+1];
1746
1747         /* Get outer coordinate index */
1748         inr              = iinr[iidx];
1749         i_coord_offset   = DIM*inr;
1750
1751         /* Load i particle coords and add shift vector */
1752         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1753                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1754         
1755         fix1             = _mm_setzero_ps();
1756         fiy1             = _mm_setzero_ps();
1757         fiz1             = _mm_setzero_ps();
1758         fix2             = _mm_setzero_ps();
1759         fiy2             = _mm_setzero_ps();
1760         fiz2             = _mm_setzero_ps();
1761         fix3             = _mm_setzero_ps();
1762         fiy3             = _mm_setzero_ps();
1763         fiz3             = _mm_setzero_ps();
1764
1765         /* Start inner kernel loop */
1766         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1767         {
1768
1769             /* Get j neighbor index, and coordinate index */
1770             jnrA             = jjnr[jidx];
1771             jnrB             = jjnr[jidx+1];
1772             jnrC             = jjnr[jidx+2];
1773             jnrD             = jjnr[jidx+3];
1774             j_coord_offsetA  = DIM*jnrA;
1775             j_coord_offsetB  = DIM*jnrB;
1776             j_coord_offsetC  = DIM*jnrC;
1777             j_coord_offsetD  = DIM*jnrD;
1778
1779             /* load j atom coordinates */
1780             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1781                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1782                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1783
1784             /* Calculate displacement vector */
1785             dx11             = _mm_sub_ps(ix1,jx1);
1786             dy11             = _mm_sub_ps(iy1,jy1);
1787             dz11             = _mm_sub_ps(iz1,jz1);
1788             dx12             = _mm_sub_ps(ix1,jx2);
1789             dy12             = _mm_sub_ps(iy1,jy2);
1790             dz12             = _mm_sub_ps(iz1,jz2);
1791             dx13             = _mm_sub_ps(ix1,jx3);
1792             dy13             = _mm_sub_ps(iy1,jy3);
1793             dz13             = _mm_sub_ps(iz1,jz3);
1794             dx21             = _mm_sub_ps(ix2,jx1);
1795             dy21             = _mm_sub_ps(iy2,jy1);
1796             dz21             = _mm_sub_ps(iz2,jz1);
1797             dx22             = _mm_sub_ps(ix2,jx2);
1798             dy22             = _mm_sub_ps(iy2,jy2);
1799             dz22             = _mm_sub_ps(iz2,jz2);
1800             dx23             = _mm_sub_ps(ix2,jx3);
1801             dy23             = _mm_sub_ps(iy2,jy3);
1802             dz23             = _mm_sub_ps(iz2,jz3);
1803             dx31             = _mm_sub_ps(ix3,jx1);
1804             dy31             = _mm_sub_ps(iy3,jy1);
1805             dz31             = _mm_sub_ps(iz3,jz1);
1806             dx32             = _mm_sub_ps(ix3,jx2);
1807             dy32             = _mm_sub_ps(iy3,jy2);
1808             dz32             = _mm_sub_ps(iz3,jz2);
1809             dx33             = _mm_sub_ps(ix3,jx3);
1810             dy33             = _mm_sub_ps(iy3,jy3);
1811             dz33             = _mm_sub_ps(iz3,jz3);
1812
1813             /* Calculate squared distance and things based on it */
1814             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1815             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1816             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1817             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1818             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1819             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1820             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1821             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1822             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1823
1824             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1825             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1826             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1827             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1828             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1829             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1830             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1831             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1832             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1833
1834             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1835             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1836             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1837             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1838             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1839             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1840             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1841             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1842             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1843
1844             fjx1             = _mm_setzero_ps();
1845             fjy1             = _mm_setzero_ps();
1846             fjz1             = _mm_setzero_ps();
1847             fjx2             = _mm_setzero_ps();
1848             fjy2             = _mm_setzero_ps();
1849             fjz2             = _mm_setzero_ps();
1850             fjx3             = _mm_setzero_ps();
1851             fjy3             = _mm_setzero_ps();
1852             fjz3             = _mm_setzero_ps();
1853
1854             /**************************
1855              * CALCULATE INTERACTIONS *
1856              **************************/
1857
1858             if (gmx_mm_any_lt(rsq11,rcutoff2))
1859             {
1860
1861             r11              = _mm_mul_ps(rsq11,rinv11);
1862
1863             /* EWALD ELECTROSTATICS */
1864
1865             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1866             ewrt             = _mm_mul_ps(r11,ewtabscale);
1867             ewitab           = _mm_cvttps_epi32(ewrt);
1868             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1869             ewitab           = _mm_slli_epi32(ewitab,2);
1870             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1871             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1872             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1873             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1874             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1875             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1876             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1877             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1878             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1879
1880             d                = _mm_sub_ps(r11,rswitch);
1881             d                = _mm_max_ps(d,_mm_setzero_ps());
1882             d2               = _mm_mul_ps(d,d);
1883             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1884
1885             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1886
1887             /* Evaluate switch function */
1888             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1889             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1890             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1891
1892             fscal            = felec;
1893
1894             fscal            = _mm_and_ps(fscal,cutoff_mask);
1895
1896             /* Calculate temporary vectorial force */
1897             tx               = _mm_mul_ps(fscal,dx11);
1898             ty               = _mm_mul_ps(fscal,dy11);
1899             tz               = _mm_mul_ps(fscal,dz11);
1900
1901             /* Update vectorial force */
1902             fix1             = _mm_add_ps(fix1,tx);
1903             fiy1             = _mm_add_ps(fiy1,ty);
1904             fiz1             = _mm_add_ps(fiz1,tz);
1905
1906             fjx1             = _mm_add_ps(fjx1,tx);
1907             fjy1             = _mm_add_ps(fjy1,ty);
1908             fjz1             = _mm_add_ps(fjz1,tz);
1909             
1910             }
1911
1912             /**************************
1913              * CALCULATE INTERACTIONS *
1914              **************************/
1915
1916             if (gmx_mm_any_lt(rsq12,rcutoff2))
1917             {
1918
1919             r12              = _mm_mul_ps(rsq12,rinv12);
1920
1921             /* EWALD ELECTROSTATICS */
1922
1923             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1924             ewrt             = _mm_mul_ps(r12,ewtabscale);
1925             ewitab           = _mm_cvttps_epi32(ewrt);
1926             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1927             ewitab           = _mm_slli_epi32(ewitab,2);
1928             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1929             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1930             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1931             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1932             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1933             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1934             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1935             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1936             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1937
1938             d                = _mm_sub_ps(r12,rswitch);
1939             d                = _mm_max_ps(d,_mm_setzero_ps());
1940             d2               = _mm_mul_ps(d,d);
1941             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1942
1943             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1944
1945             /* Evaluate switch function */
1946             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1947             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1948             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1949
1950             fscal            = felec;
1951
1952             fscal            = _mm_and_ps(fscal,cutoff_mask);
1953
1954             /* Calculate temporary vectorial force */
1955             tx               = _mm_mul_ps(fscal,dx12);
1956             ty               = _mm_mul_ps(fscal,dy12);
1957             tz               = _mm_mul_ps(fscal,dz12);
1958
1959             /* Update vectorial force */
1960             fix1             = _mm_add_ps(fix1,tx);
1961             fiy1             = _mm_add_ps(fiy1,ty);
1962             fiz1             = _mm_add_ps(fiz1,tz);
1963
1964             fjx2             = _mm_add_ps(fjx2,tx);
1965             fjy2             = _mm_add_ps(fjy2,ty);
1966             fjz2             = _mm_add_ps(fjz2,tz);
1967             
1968             }
1969
1970             /**************************
1971              * CALCULATE INTERACTIONS *
1972              **************************/
1973
1974             if (gmx_mm_any_lt(rsq13,rcutoff2))
1975             {
1976
1977             r13              = _mm_mul_ps(rsq13,rinv13);
1978
1979             /* EWALD ELECTROSTATICS */
1980
1981             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1982             ewrt             = _mm_mul_ps(r13,ewtabscale);
1983             ewitab           = _mm_cvttps_epi32(ewrt);
1984             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1985             ewitab           = _mm_slli_epi32(ewitab,2);
1986             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1987             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1988             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1989             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1990             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1991             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1992             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1993             velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
1994             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1995
1996             d                = _mm_sub_ps(r13,rswitch);
1997             d                = _mm_max_ps(d,_mm_setzero_ps());
1998             d2               = _mm_mul_ps(d,d);
1999             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2000
2001             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2002
2003             /* Evaluate switch function */
2004             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2005             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
2006             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
2007
2008             fscal            = felec;
2009
2010             fscal            = _mm_and_ps(fscal,cutoff_mask);
2011
2012             /* Calculate temporary vectorial force */
2013             tx               = _mm_mul_ps(fscal,dx13);
2014             ty               = _mm_mul_ps(fscal,dy13);
2015             tz               = _mm_mul_ps(fscal,dz13);
2016
2017             /* Update vectorial force */
2018             fix1             = _mm_add_ps(fix1,tx);
2019             fiy1             = _mm_add_ps(fiy1,ty);
2020             fiz1             = _mm_add_ps(fiz1,tz);
2021
2022             fjx3             = _mm_add_ps(fjx3,tx);
2023             fjy3             = _mm_add_ps(fjy3,ty);
2024             fjz3             = _mm_add_ps(fjz3,tz);
2025             
2026             }
2027
2028             /**************************
2029              * CALCULATE INTERACTIONS *
2030              **************************/
2031
2032             if (gmx_mm_any_lt(rsq21,rcutoff2))
2033             {
2034
2035             r21              = _mm_mul_ps(rsq21,rinv21);
2036
2037             /* EWALD ELECTROSTATICS */
2038
2039             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2040             ewrt             = _mm_mul_ps(r21,ewtabscale);
2041             ewitab           = _mm_cvttps_epi32(ewrt);
2042             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2043             ewitab           = _mm_slli_epi32(ewitab,2);
2044             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2045             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2046             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2047             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2048             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2049             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2050             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2051             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2052             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2053
2054             d                = _mm_sub_ps(r21,rswitch);
2055             d                = _mm_max_ps(d,_mm_setzero_ps());
2056             d2               = _mm_mul_ps(d,d);
2057             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2058
2059             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2060
2061             /* Evaluate switch function */
2062             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2063             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2064             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2065
2066             fscal            = felec;
2067
2068             fscal            = _mm_and_ps(fscal,cutoff_mask);
2069
2070             /* Calculate temporary vectorial force */
2071             tx               = _mm_mul_ps(fscal,dx21);
2072             ty               = _mm_mul_ps(fscal,dy21);
2073             tz               = _mm_mul_ps(fscal,dz21);
2074
2075             /* Update vectorial force */
2076             fix2             = _mm_add_ps(fix2,tx);
2077             fiy2             = _mm_add_ps(fiy2,ty);
2078             fiz2             = _mm_add_ps(fiz2,tz);
2079
2080             fjx1             = _mm_add_ps(fjx1,tx);
2081             fjy1             = _mm_add_ps(fjy1,ty);
2082             fjz1             = _mm_add_ps(fjz1,tz);
2083             
2084             }
2085
2086             /**************************
2087              * CALCULATE INTERACTIONS *
2088              **************************/
2089
2090             if (gmx_mm_any_lt(rsq22,rcutoff2))
2091             {
2092
2093             r22              = _mm_mul_ps(rsq22,rinv22);
2094
2095             /* EWALD ELECTROSTATICS */
2096
2097             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2098             ewrt             = _mm_mul_ps(r22,ewtabscale);
2099             ewitab           = _mm_cvttps_epi32(ewrt);
2100             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2101             ewitab           = _mm_slli_epi32(ewitab,2);
2102             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2103             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2104             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2105             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2106             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2107             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2108             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2109             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
2110             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2111
2112             d                = _mm_sub_ps(r22,rswitch);
2113             d                = _mm_max_ps(d,_mm_setzero_ps());
2114             d2               = _mm_mul_ps(d,d);
2115             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2116
2117             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2118
2119             /* Evaluate switch function */
2120             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2121             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2122             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2123
2124             fscal            = felec;
2125
2126             fscal            = _mm_and_ps(fscal,cutoff_mask);
2127
2128             /* Calculate temporary vectorial force */
2129             tx               = _mm_mul_ps(fscal,dx22);
2130             ty               = _mm_mul_ps(fscal,dy22);
2131             tz               = _mm_mul_ps(fscal,dz22);
2132
2133             /* Update vectorial force */
2134             fix2             = _mm_add_ps(fix2,tx);
2135             fiy2             = _mm_add_ps(fiy2,ty);
2136             fiz2             = _mm_add_ps(fiz2,tz);
2137
2138             fjx2             = _mm_add_ps(fjx2,tx);
2139             fjy2             = _mm_add_ps(fjy2,ty);
2140             fjz2             = _mm_add_ps(fjz2,tz);
2141             
2142             }
2143
2144             /**************************
2145              * CALCULATE INTERACTIONS *
2146              **************************/
2147
2148             if (gmx_mm_any_lt(rsq23,rcutoff2))
2149             {
2150
2151             r23              = _mm_mul_ps(rsq23,rinv23);
2152
2153             /* EWALD ELECTROSTATICS */
2154
2155             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2156             ewrt             = _mm_mul_ps(r23,ewtabscale);
2157             ewitab           = _mm_cvttps_epi32(ewrt);
2158             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2159             ewitab           = _mm_slli_epi32(ewitab,2);
2160             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2161             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2162             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2163             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2164             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2165             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2166             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2167             velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
2168             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2169
2170             d                = _mm_sub_ps(r23,rswitch);
2171             d                = _mm_max_ps(d,_mm_setzero_ps());
2172             d2               = _mm_mul_ps(d,d);
2173             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2174
2175             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2176
2177             /* Evaluate switch function */
2178             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2179             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
2180             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
2181
2182             fscal            = felec;
2183
2184             fscal            = _mm_and_ps(fscal,cutoff_mask);
2185
2186             /* Calculate temporary vectorial force */
2187             tx               = _mm_mul_ps(fscal,dx23);
2188             ty               = _mm_mul_ps(fscal,dy23);
2189             tz               = _mm_mul_ps(fscal,dz23);
2190
2191             /* Update vectorial force */
2192             fix2             = _mm_add_ps(fix2,tx);
2193             fiy2             = _mm_add_ps(fiy2,ty);
2194             fiz2             = _mm_add_ps(fiz2,tz);
2195
2196             fjx3             = _mm_add_ps(fjx3,tx);
2197             fjy3             = _mm_add_ps(fjy3,ty);
2198             fjz3             = _mm_add_ps(fjz3,tz);
2199             
2200             }
2201
2202             /**************************
2203              * CALCULATE INTERACTIONS *
2204              **************************/
2205
2206             if (gmx_mm_any_lt(rsq31,rcutoff2))
2207             {
2208
2209             r31              = _mm_mul_ps(rsq31,rinv31);
2210
2211             /* EWALD ELECTROSTATICS */
2212
2213             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2214             ewrt             = _mm_mul_ps(r31,ewtabscale);
2215             ewitab           = _mm_cvttps_epi32(ewrt);
2216             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2217             ewitab           = _mm_slli_epi32(ewitab,2);
2218             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2219             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2220             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2221             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2222             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2223             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2224             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2225             velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
2226             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2227
2228             d                = _mm_sub_ps(r31,rswitch);
2229             d                = _mm_max_ps(d,_mm_setzero_ps());
2230             d2               = _mm_mul_ps(d,d);
2231             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2232
2233             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2234
2235             /* Evaluate switch function */
2236             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2237             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
2238             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
2239
2240             fscal            = felec;
2241
2242             fscal            = _mm_and_ps(fscal,cutoff_mask);
2243
2244             /* Calculate temporary vectorial force */
2245             tx               = _mm_mul_ps(fscal,dx31);
2246             ty               = _mm_mul_ps(fscal,dy31);
2247             tz               = _mm_mul_ps(fscal,dz31);
2248
2249             /* Update vectorial force */
2250             fix3             = _mm_add_ps(fix3,tx);
2251             fiy3             = _mm_add_ps(fiy3,ty);
2252             fiz3             = _mm_add_ps(fiz3,tz);
2253
2254             fjx1             = _mm_add_ps(fjx1,tx);
2255             fjy1             = _mm_add_ps(fjy1,ty);
2256             fjz1             = _mm_add_ps(fjz1,tz);
2257             
2258             }
2259
2260             /**************************
2261              * CALCULATE INTERACTIONS *
2262              **************************/
2263
2264             if (gmx_mm_any_lt(rsq32,rcutoff2))
2265             {
2266
2267             r32              = _mm_mul_ps(rsq32,rinv32);
2268
2269             /* EWALD ELECTROSTATICS */
2270
2271             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2272             ewrt             = _mm_mul_ps(r32,ewtabscale);
2273             ewitab           = _mm_cvttps_epi32(ewrt);
2274             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2275             ewitab           = _mm_slli_epi32(ewitab,2);
2276             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2277             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2278             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2279             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2280             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2281             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2282             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2283             velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
2284             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2285
2286             d                = _mm_sub_ps(r32,rswitch);
2287             d                = _mm_max_ps(d,_mm_setzero_ps());
2288             d2               = _mm_mul_ps(d,d);
2289             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2290
2291             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2292
2293             /* Evaluate switch function */
2294             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2295             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
2296             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
2297
2298             fscal            = felec;
2299
2300             fscal            = _mm_and_ps(fscal,cutoff_mask);
2301
2302             /* Calculate temporary vectorial force */
2303             tx               = _mm_mul_ps(fscal,dx32);
2304             ty               = _mm_mul_ps(fscal,dy32);
2305             tz               = _mm_mul_ps(fscal,dz32);
2306
2307             /* Update vectorial force */
2308             fix3             = _mm_add_ps(fix3,tx);
2309             fiy3             = _mm_add_ps(fiy3,ty);
2310             fiz3             = _mm_add_ps(fiz3,tz);
2311
2312             fjx2             = _mm_add_ps(fjx2,tx);
2313             fjy2             = _mm_add_ps(fjy2,ty);
2314             fjz2             = _mm_add_ps(fjz2,tz);
2315             
2316             }
2317
2318             /**************************
2319              * CALCULATE INTERACTIONS *
2320              **************************/
2321
2322             if (gmx_mm_any_lt(rsq33,rcutoff2))
2323             {
2324
2325             r33              = _mm_mul_ps(rsq33,rinv33);
2326
2327             /* EWALD ELECTROSTATICS */
2328
2329             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2330             ewrt             = _mm_mul_ps(r33,ewtabscale);
2331             ewitab           = _mm_cvttps_epi32(ewrt);
2332             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2333             ewitab           = _mm_slli_epi32(ewitab,2);
2334             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2335             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2336             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2337             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2338             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2339             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2340             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2341             velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
2342             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2343
2344             d                = _mm_sub_ps(r33,rswitch);
2345             d                = _mm_max_ps(d,_mm_setzero_ps());
2346             d2               = _mm_mul_ps(d,d);
2347             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2348
2349             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2350
2351             /* Evaluate switch function */
2352             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2353             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
2354             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
2355
2356             fscal            = felec;
2357
2358             fscal            = _mm_and_ps(fscal,cutoff_mask);
2359
2360             /* Calculate temporary vectorial force */
2361             tx               = _mm_mul_ps(fscal,dx33);
2362             ty               = _mm_mul_ps(fscal,dy33);
2363             tz               = _mm_mul_ps(fscal,dz33);
2364
2365             /* Update vectorial force */
2366             fix3             = _mm_add_ps(fix3,tx);
2367             fiy3             = _mm_add_ps(fiy3,ty);
2368             fiz3             = _mm_add_ps(fiz3,tz);
2369
2370             fjx3             = _mm_add_ps(fjx3,tx);
2371             fjy3             = _mm_add_ps(fjy3,ty);
2372             fjz3             = _mm_add_ps(fjz3,tz);
2373             
2374             }
2375
2376             fjptrA             = f+j_coord_offsetA;
2377             fjptrB             = f+j_coord_offsetB;
2378             fjptrC             = f+j_coord_offsetC;
2379             fjptrD             = f+j_coord_offsetD;
2380
2381             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2382                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2383
2384             /* Inner loop uses 558 flops */
2385         }
2386
2387         if(jidx<j_index_end)
2388         {
2389
2390             /* Get j neighbor index, and coordinate index */
2391             jnrlistA         = jjnr[jidx];
2392             jnrlistB         = jjnr[jidx+1];
2393             jnrlistC         = jjnr[jidx+2];
2394             jnrlistD         = jjnr[jidx+3];
2395             /* Sign of each element will be negative for non-real atoms.
2396              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2397              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2398              */
2399             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2400             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2401             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2402             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2403             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2404             j_coord_offsetA  = DIM*jnrA;
2405             j_coord_offsetB  = DIM*jnrB;
2406             j_coord_offsetC  = DIM*jnrC;
2407             j_coord_offsetD  = DIM*jnrD;
2408
2409             /* load j atom coordinates */
2410             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
2411                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
2412                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
2413
2414             /* Calculate displacement vector */
2415             dx11             = _mm_sub_ps(ix1,jx1);
2416             dy11             = _mm_sub_ps(iy1,jy1);
2417             dz11             = _mm_sub_ps(iz1,jz1);
2418             dx12             = _mm_sub_ps(ix1,jx2);
2419             dy12             = _mm_sub_ps(iy1,jy2);
2420             dz12             = _mm_sub_ps(iz1,jz2);
2421             dx13             = _mm_sub_ps(ix1,jx3);
2422             dy13             = _mm_sub_ps(iy1,jy3);
2423             dz13             = _mm_sub_ps(iz1,jz3);
2424             dx21             = _mm_sub_ps(ix2,jx1);
2425             dy21             = _mm_sub_ps(iy2,jy1);
2426             dz21             = _mm_sub_ps(iz2,jz1);
2427             dx22             = _mm_sub_ps(ix2,jx2);
2428             dy22             = _mm_sub_ps(iy2,jy2);
2429             dz22             = _mm_sub_ps(iz2,jz2);
2430             dx23             = _mm_sub_ps(ix2,jx3);
2431             dy23             = _mm_sub_ps(iy2,jy3);
2432             dz23             = _mm_sub_ps(iz2,jz3);
2433             dx31             = _mm_sub_ps(ix3,jx1);
2434             dy31             = _mm_sub_ps(iy3,jy1);
2435             dz31             = _mm_sub_ps(iz3,jz1);
2436             dx32             = _mm_sub_ps(ix3,jx2);
2437             dy32             = _mm_sub_ps(iy3,jy2);
2438             dz32             = _mm_sub_ps(iz3,jz2);
2439             dx33             = _mm_sub_ps(ix3,jx3);
2440             dy33             = _mm_sub_ps(iy3,jy3);
2441             dz33             = _mm_sub_ps(iz3,jz3);
2442
2443             /* Calculate squared distance and things based on it */
2444             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2445             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2446             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2447             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2448             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2449             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2450             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2451             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2452             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2453
2454             rinv11           = gmx_mm_invsqrt_ps(rsq11);
2455             rinv12           = gmx_mm_invsqrt_ps(rsq12);
2456             rinv13           = gmx_mm_invsqrt_ps(rsq13);
2457             rinv21           = gmx_mm_invsqrt_ps(rsq21);
2458             rinv22           = gmx_mm_invsqrt_ps(rsq22);
2459             rinv23           = gmx_mm_invsqrt_ps(rsq23);
2460             rinv31           = gmx_mm_invsqrt_ps(rsq31);
2461             rinv32           = gmx_mm_invsqrt_ps(rsq32);
2462             rinv33           = gmx_mm_invsqrt_ps(rsq33);
2463
2464             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
2465             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
2466             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
2467             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
2468             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
2469             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
2470             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
2471             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
2472             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
2473
2474             fjx1             = _mm_setzero_ps();
2475             fjy1             = _mm_setzero_ps();
2476             fjz1             = _mm_setzero_ps();
2477             fjx2             = _mm_setzero_ps();
2478             fjy2             = _mm_setzero_ps();
2479             fjz2             = _mm_setzero_ps();
2480             fjx3             = _mm_setzero_ps();
2481             fjy3             = _mm_setzero_ps();
2482             fjz3             = _mm_setzero_ps();
2483
2484             /**************************
2485              * CALCULATE INTERACTIONS *
2486              **************************/
2487
2488             if (gmx_mm_any_lt(rsq11,rcutoff2))
2489             {
2490
2491             r11              = _mm_mul_ps(rsq11,rinv11);
2492             r11              = _mm_andnot_ps(dummy_mask,r11);
2493
2494             /* EWALD ELECTROSTATICS */
2495
2496             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2497             ewrt             = _mm_mul_ps(r11,ewtabscale);
2498             ewitab           = _mm_cvttps_epi32(ewrt);
2499             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2500             ewitab           = _mm_slli_epi32(ewitab,2);
2501             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2502             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2503             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2504             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2505             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2506             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2507             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2508             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2509             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2510
2511             d                = _mm_sub_ps(r11,rswitch);
2512             d                = _mm_max_ps(d,_mm_setzero_ps());
2513             d2               = _mm_mul_ps(d,d);
2514             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2515
2516             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2517
2518             /* Evaluate switch function */
2519             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2520             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2521             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2522
2523             fscal            = felec;
2524
2525             fscal            = _mm_and_ps(fscal,cutoff_mask);
2526
2527             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2528
2529             /* Calculate temporary vectorial force */
2530             tx               = _mm_mul_ps(fscal,dx11);
2531             ty               = _mm_mul_ps(fscal,dy11);
2532             tz               = _mm_mul_ps(fscal,dz11);
2533
2534             /* Update vectorial force */
2535             fix1             = _mm_add_ps(fix1,tx);
2536             fiy1             = _mm_add_ps(fiy1,ty);
2537             fiz1             = _mm_add_ps(fiz1,tz);
2538
2539             fjx1             = _mm_add_ps(fjx1,tx);
2540             fjy1             = _mm_add_ps(fjy1,ty);
2541             fjz1             = _mm_add_ps(fjz1,tz);
2542             
2543             }
2544
2545             /**************************
2546              * CALCULATE INTERACTIONS *
2547              **************************/
2548
2549             if (gmx_mm_any_lt(rsq12,rcutoff2))
2550             {
2551
2552             r12              = _mm_mul_ps(rsq12,rinv12);
2553             r12              = _mm_andnot_ps(dummy_mask,r12);
2554
2555             /* EWALD ELECTROSTATICS */
2556
2557             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2558             ewrt             = _mm_mul_ps(r12,ewtabscale);
2559             ewitab           = _mm_cvttps_epi32(ewrt);
2560             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2561             ewitab           = _mm_slli_epi32(ewitab,2);
2562             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2563             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2564             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2565             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2566             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2567             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2568             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2569             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2570             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2571
2572             d                = _mm_sub_ps(r12,rswitch);
2573             d                = _mm_max_ps(d,_mm_setzero_ps());
2574             d2               = _mm_mul_ps(d,d);
2575             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2576
2577             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2578
2579             /* Evaluate switch function */
2580             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2581             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2582             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2583
2584             fscal            = felec;
2585
2586             fscal            = _mm_and_ps(fscal,cutoff_mask);
2587
2588             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2589
2590             /* Calculate temporary vectorial force */
2591             tx               = _mm_mul_ps(fscal,dx12);
2592             ty               = _mm_mul_ps(fscal,dy12);
2593             tz               = _mm_mul_ps(fscal,dz12);
2594
2595             /* Update vectorial force */
2596             fix1             = _mm_add_ps(fix1,tx);
2597             fiy1             = _mm_add_ps(fiy1,ty);
2598             fiz1             = _mm_add_ps(fiz1,tz);
2599
2600             fjx2             = _mm_add_ps(fjx2,tx);
2601             fjy2             = _mm_add_ps(fjy2,ty);
2602             fjz2             = _mm_add_ps(fjz2,tz);
2603             
2604             }
2605
2606             /**************************
2607              * CALCULATE INTERACTIONS *
2608              **************************/
2609
2610             if (gmx_mm_any_lt(rsq13,rcutoff2))
2611             {
2612
2613             r13              = _mm_mul_ps(rsq13,rinv13);
2614             r13              = _mm_andnot_ps(dummy_mask,r13);
2615
2616             /* EWALD ELECTROSTATICS */
2617
2618             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2619             ewrt             = _mm_mul_ps(r13,ewtabscale);
2620             ewitab           = _mm_cvttps_epi32(ewrt);
2621             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2622             ewitab           = _mm_slli_epi32(ewitab,2);
2623             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2624             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2625             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2626             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2627             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2628             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2629             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2630             velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
2631             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2632
2633             d                = _mm_sub_ps(r13,rswitch);
2634             d                = _mm_max_ps(d,_mm_setzero_ps());
2635             d2               = _mm_mul_ps(d,d);
2636             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2637
2638             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2639
2640             /* Evaluate switch function */
2641             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2642             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
2643             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
2644
2645             fscal            = felec;
2646
2647             fscal            = _mm_and_ps(fscal,cutoff_mask);
2648
2649             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2650
2651             /* Calculate temporary vectorial force */
2652             tx               = _mm_mul_ps(fscal,dx13);
2653             ty               = _mm_mul_ps(fscal,dy13);
2654             tz               = _mm_mul_ps(fscal,dz13);
2655
2656             /* Update vectorial force */
2657             fix1             = _mm_add_ps(fix1,tx);
2658             fiy1             = _mm_add_ps(fiy1,ty);
2659             fiz1             = _mm_add_ps(fiz1,tz);
2660
2661             fjx3             = _mm_add_ps(fjx3,tx);
2662             fjy3             = _mm_add_ps(fjy3,ty);
2663             fjz3             = _mm_add_ps(fjz3,tz);
2664             
2665             }
2666
2667             /**************************
2668              * CALCULATE INTERACTIONS *
2669              **************************/
2670
2671             if (gmx_mm_any_lt(rsq21,rcutoff2))
2672             {
2673
2674             r21              = _mm_mul_ps(rsq21,rinv21);
2675             r21              = _mm_andnot_ps(dummy_mask,r21);
2676
2677             /* EWALD ELECTROSTATICS */
2678
2679             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2680             ewrt             = _mm_mul_ps(r21,ewtabscale);
2681             ewitab           = _mm_cvttps_epi32(ewrt);
2682             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2683             ewitab           = _mm_slli_epi32(ewitab,2);
2684             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2685             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2686             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2687             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2688             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2689             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2690             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2691             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2692             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2693
2694             d                = _mm_sub_ps(r21,rswitch);
2695             d                = _mm_max_ps(d,_mm_setzero_ps());
2696             d2               = _mm_mul_ps(d,d);
2697             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2698
2699             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2700
2701             /* Evaluate switch function */
2702             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2703             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2704             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2705
2706             fscal            = felec;
2707
2708             fscal            = _mm_and_ps(fscal,cutoff_mask);
2709
2710             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2711
2712             /* Calculate temporary vectorial force */
2713             tx               = _mm_mul_ps(fscal,dx21);
2714             ty               = _mm_mul_ps(fscal,dy21);
2715             tz               = _mm_mul_ps(fscal,dz21);
2716
2717             /* Update vectorial force */
2718             fix2             = _mm_add_ps(fix2,tx);
2719             fiy2             = _mm_add_ps(fiy2,ty);
2720             fiz2             = _mm_add_ps(fiz2,tz);
2721
2722             fjx1             = _mm_add_ps(fjx1,tx);
2723             fjy1             = _mm_add_ps(fjy1,ty);
2724             fjz1             = _mm_add_ps(fjz1,tz);
2725             
2726             }
2727
2728             /**************************
2729              * CALCULATE INTERACTIONS *
2730              **************************/
2731
2732             if (gmx_mm_any_lt(rsq22,rcutoff2))
2733             {
2734
2735             r22              = _mm_mul_ps(rsq22,rinv22);
2736             r22              = _mm_andnot_ps(dummy_mask,r22);
2737
2738             /* EWALD ELECTROSTATICS */
2739
2740             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2741             ewrt             = _mm_mul_ps(r22,ewtabscale);
2742             ewitab           = _mm_cvttps_epi32(ewrt);
2743             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2744             ewitab           = _mm_slli_epi32(ewitab,2);
2745             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2746             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2747             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2748             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2749             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2750             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2751             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2752             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
2753             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2754
2755             d                = _mm_sub_ps(r22,rswitch);
2756             d                = _mm_max_ps(d,_mm_setzero_ps());
2757             d2               = _mm_mul_ps(d,d);
2758             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2759
2760             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2761
2762             /* Evaluate switch function */
2763             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2764             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2765             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2766
2767             fscal            = felec;
2768
2769             fscal            = _mm_and_ps(fscal,cutoff_mask);
2770
2771             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2772
2773             /* Calculate temporary vectorial force */
2774             tx               = _mm_mul_ps(fscal,dx22);
2775             ty               = _mm_mul_ps(fscal,dy22);
2776             tz               = _mm_mul_ps(fscal,dz22);
2777
2778             /* Update vectorial force */
2779             fix2             = _mm_add_ps(fix2,tx);
2780             fiy2             = _mm_add_ps(fiy2,ty);
2781             fiz2             = _mm_add_ps(fiz2,tz);
2782
2783             fjx2             = _mm_add_ps(fjx2,tx);
2784             fjy2             = _mm_add_ps(fjy2,ty);
2785             fjz2             = _mm_add_ps(fjz2,tz);
2786             
2787             }
2788
2789             /**************************
2790              * CALCULATE INTERACTIONS *
2791              **************************/
2792
2793             if (gmx_mm_any_lt(rsq23,rcutoff2))
2794             {
2795
2796             r23              = _mm_mul_ps(rsq23,rinv23);
2797             r23              = _mm_andnot_ps(dummy_mask,r23);
2798
2799             /* EWALD ELECTROSTATICS */
2800
2801             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2802             ewrt             = _mm_mul_ps(r23,ewtabscale);
2803             ewitab           = _mm_cvttps_epi32(ewrt);
2804             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2805             ewitab           = _mm_slli_epi32(ewitab,2);
2806             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2807             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2808             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2809             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2810             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2811             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2812             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2813             velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
2814             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2815
2816             d                = _mm_sub_ps(r23,rswitch);
2817             d                = _mm_max_ps(d,_mm_setzero_ps());
2818             d2               = _mm_mul_ps(d,d);
2819             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2820
2821             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2822
2823             /* Evaluate switch function */
2824             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2825             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
2826             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
2827
2828             fscal            = felec;
2829
2830             fscal            = _mm_and_ps(fscal,cutoff_mask);
2831
2832             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2833
2834             /* Calculate temporary vectorial force */
2835             tx               = _mm_mul_ps(fscal,dx23);
2836             ty               = _mm_mul_ps(fscal,dy23);
2837             tz               = _mm_mul_ps(fscal,dz23);
2838
2839             /* Update vectorial force */
2840             fix2             = _mm_add_ps(fix2,tx);
2841             fiy2             = _mm_add_ps(fiy2,ty);
2842             fiz2             = _mm_add_ps(fiz2,tz);
2843
2844             fjx3             = _mm_add_ps(fjx3,tx);
2845             fjy3             = _mm_add_ps(fjy3,ty);
2846             fjz3             = _mm_add_ps(fjz3,tz);
2847             
2848             }
2849
2850             /**************************
2851              * CALCULATE INTERACTIONS *
2852              **************************/
2853
2854             if (gmx_mm_any_lt(rsq31,rcutoff2))
2855             {
2856
2857             r31              = _mm_mul_ps(rsq31,rinv31);
2858             r31              = _mm_andnot_ps(dummy_mask,r31);
2859
2860             /* EWALD ELECTROSTATICS */
2861
2862             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2863             ewrt             = _mm_mul_ps(r31,ewtabscale);
2864             ewitab           = _mm_cvttps_epi32(ewrt);
2865             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2866             ewitab           = _mm_slli_epi32(ewitab,2);
2867             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2868             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2869             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2870             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2871             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2872             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2873             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2874             velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
2875             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2876
2877             d                = _mm_sub_ps(r31,rswitch);
2878             d                = _mm_max_ps(d,_mm_setzero_ps());
2879             d2               = _mm_mul_ps(d,d);
2880             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2881
2882             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2883
2884             /* Evaluate switch function */
2885             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2886             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
2887             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
2888
2889             fscal            = felec;
2890
2891             fscal            = _mm_and_ps(fscal,cutoff_mask);
2892
2893             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2894
2895             /* Calculate temporary vectorial force */
2896             tx               = _mm_mul_ps(fscal,dx31);
2897             ty               = _mm_mul_ps(fscal,dy31);
2898             tz               = _mm_mul_ps(fscal,dz31);
2899
2900             /* Update vectorial force */
2901             fix3             = _mm_add_ps(fix3,tx);
2902             fiy3             = _mm_add_ps(fiy3,ty);
2903             fiz3             = _mm_add_ps(fiz3,tz);
2904
2905             fjx1             = _mm_add_ps(fjx1,tx);
2906             fjy1             = _mm_add_ps(fjy1,ty);
2907             fjz1             = _mm_add_ps(fjz1,tz);
2908             
2909             }
2910
2911             /**************************
2912              * CALCULATE INTERACTIONS *
2913              **************************/
2914
2915             if (gmx_mm_any_lt(rsq32,rcutoff2))
2916             {
2917
2918             r32              = _mm_mul_ps(rsq32,rinv32);
2919             r32              = _mm_andnot_ps(dummy_mask,r32);
2920
2921             /* EWALD ELECTROSTATICS */
2922
2923             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2924             ewrt             = _mm_mul_ps(r32,ewtabscale);
2925             ewitab           = _mm_cvttps_epi32(ewrt);
2926             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2927             ewitab           = _mm_slli_epi32(ewitab,2);
2928             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2929             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2930             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2931             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2932             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2933             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2934             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2935             velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
2936             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2937
2938             d                = _mm_sub_ps(r32,rswitch);
2939             d                = _mm_max_ps(d,_mm_setzero_ps());
2940             d2               = _mm_mul_ps(d,d);
2941             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2942
2943             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2944
2945             /* Evaluate switch function */
2946             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2947             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
2948             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
2949
2950             fscal            = felec;
2951
2952             fscal            = _mm_and_ps(fscal,cutoff_mask);
2953
2954             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2955
2956             /* Calculate temporary vectorial force */
2957             tx               = _mm_mul_ps(fscal,dx32);
2958             ty               = _mm_mul_ps(fscal,dy32);
2959             tz               = _mm_mul_ps(fscal,dz32);
2960
2961             /* Update vectorial force */
2962             fix3             = _mm_add_ps(fix3,tx);
2963             fiy3             = _mm_add_ps(fiy3,ty);
2964             fiz3             = _mm_add_ps(fiz3,tz);
2965
2966             fjx2             = _mm_add_ps(fjx2,tx);
2967             fjy2             = _mm_add_ps(fjy2,ty);
2968             fjz2             = _mm_add_ps(fjz2,tz);
2969             
2970             }
2971
2972             /**************************
2973              * CALCULATE INTERACTIONS *
2974              **************************/
2975
2976             if (gmx_mm_any_lt(rsq33,rcutoff2))
2977             {
2978
2979             r33              = _mm_mul_ps(rsq33,rinv33);
2980             r33              = _mm_andnot_ps(dummy_mask,r33);
2981
2982             /* EWALD ELECTROSTATICS */
2983
2984             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2985             ewrt             = _mm_mul_ps(r33,ewtabscale);
2986             ewitab           = _mm_cvttps_epi32(ewrt);
2987             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2988             ewitab           = _mm_slli_epi32(ewitab,2);
2989             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2990             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2991             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2992             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2993             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2994             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2995             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2996             velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
2997             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2998
2999             d                = _mm_sub_ps(r33,rswitch);
3000             d                = _mm_max_ps(d,_mm_setzero_ps());
3001             d2               = _mm_mul_ps(d,d);
3002             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3003
3004             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3005
3006             /* Evaluate switch function */
3007             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3008             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
3009             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
3010
3011             fscal            = felec;
3012
3013             fscal            = _mm_and_ps(fscal,cutoff_mask);
3014
3015             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3016
3017             /* Calculate temporary vectorial force */
3018             tx               = _mm_mul_ps(fscal,dx33);
3019             ty               = _mm_mul_ps(fscal,dy33);
3020             tz               = _mm_mul_ps(fscal,dz33);
3021
3022             /* Update vectorial force */
3023             fix3             = _mm_add_ps(fix3,tx);
3024             fiy3             = _mm_add_ps(fiy3,ty);
3025             fiz3             = _mm_add_ps(fiz3,tz);
3026
3027             fjx3             = _mm_add_ps(fjx3,tx);
3028             fjy3             = _mm_add_ps(fjy3,ty);
3029             fjz3             = _mm_add_ps(fjz3,tz);
3030             
3031             }
3032
3033             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3034             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3035             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3036             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3037
3038             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
3039                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3040
3041             /* Inner loop uses 567 flops */
3042         }
3043
3044         /* End of innermost loop */
3045
3046         gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3047                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
3048
3049         /* Increment number of inner iterations */
3050         inneriter                  += j_index_end - j_index_start;
3051
3052         /* Outer loop uses 18 flops */
3053     }
3054
3055     /* Increment number of outer iterations */
3056     outeriter        += nri;
3057
3058     /* Update outer/inner flops */
3059
3060     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*567);
3061 }