Compile nonbonded kernels as C++
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_sse2_single.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse2_single kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_sse2_single.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse2_single
51  * Electrostatics interaction: ReactionField
52  * VdW interaction:            LennardJones
53  * Geometry:                   Water4-Water4
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse2_single
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
67      * just 0 for non-waters.
68      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB,jnrC,jnrD;
74     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
76     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
77     real             rcutoff_scalar;
78     real             *shiftvec,*fshift,*x,*f;
79     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
80     real             scratch[4*DIM];
81     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
82     int              vdwioffset0;
83     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
84     int              vdwioffset1;
85     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
86     int              vdwioffset2;
87     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88     int              vdwioffset3;
89     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
90     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
91     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
92     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
93     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
94     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
95     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
96     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
97     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
98     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
99     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
102     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
105     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
106     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
107     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
108     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
109     real             *charge;
110     int              nvdwtype;
111     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
112     int              *vdwtype;
113     real             *vdwparam;
114     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
115     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
116     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
117     real             rswitch_scalar,d_scalar;
118     __m128           dummy_mask,cutoff_mask;
119     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
120     __m128           one     = _mm_set1_ps(1.0);
121     __m128           two     = _mm_set1_ps(2.0);
122     x                = xx[0];
123     f                = ff[0];
124
125     nri              = nlist->nri;
126     iinr             = nlist->iinr;
127     jindex           = nlist->jindex;
128     jjnr             = nlist->jjnr;
129     shiftidx         = nlist->shift;
130     gid              = nlist->gid;
131     shiftvec         = fr->shift_vec[0];
132     fshift           = fr->fshift[0];
133     facel            = _mm_set1_ps(fr->ic->epsfac);
134     charge           = mdatoms->chargeA;
135     krf              = _mm_set1_ps(fr->ic->k_rf);
136     krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
137     crf              = _mm_set1_ps(fr->ic->c_rf);
138     nvdwtype         = fr->ntype;
139     vdwparam         = fr->nbfp;
140     vdwtype          = mdatoms->typeA;
141
142     /* Setup water-specific parameters */
143     inr              = nlist->iinr[0];
144     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
145     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
146     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
147     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
148
149     jq1              = _mm_set1_ps(charge[inr+1]);
150     jq2              = _mm_set1_ps(charge[inr+2]);
151     jq3              = _mm_set1_ps(charge[inr+3]);
152     vdwjidx0A        = 2*vdwtype[inr+0];
153     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
154     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
155     qq11             = _mm_mul_ps(iq1,jq1);
156     qq12             = _mm_mul_ps(iq1,jq2);
157     qq13             = _mm_mul_ps(iq1,jq3);
158     qq21             = _mm_mul_ps(iq2,jq1);
159     qq22             = _mm_mul_ps(iq2,jq2);
160     qq23             = _mm_mul_ps(iq2,jq3);
161     qq31             = _mm_mul_ps(iq3,jq1);
162     qq32             = _mm_mul_ps(iq3,jq2);
163     qq33             = _mm_mul_ps(iq3,jq3);
164
165     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
166     rcutoff_scalar   = fr->ic->rcoulomb;
167     rcutoff          = _mm_set1_ps(rcutoff_scalar);
168     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
169
170     rswitch_scalar   = fr->ic->rvdw_switch;
171     rswitch          = _mm_set1_ps(rswitch_scalar);
172     /* Setup switch parameters */
173     d_scalar         = rcutoff_scalar-rswitch_scalar;
174     d                = _mm_set1_ps(d_scalar);
175     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
176     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
177     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
178     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
179     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
180     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
181
182     /* Avoid stupid compiler warnings */
183     jnrA = jnrB = jnrC = jnrD = 0;
184     j_coord_offsetA = 0;
185     j_coord_offsetB = 0;
186     j_coord_offsetC = 0;
187     j_coord_offsetD = 0;
188
189     outeriter        = 0;
190     inneriter        = 0;
191
192     for(iidx=0;iidx<4*DIM;iidx++)
193     {
194         scratch[iidx] = 0.0;
195     }  
196
197     /* Start outer loop over neighborlists */
198     for(iidx=0; iidx<nri; iidx++)
199     {
200         /* Load shift vector for this list */
201         i_shift_offset   = DIM*shiftidx[iidx];
202
203         /* Load limits for loop over neighbors */
204         j_index_start    = jindex[iidx];
205         j_index_end      = jindex[iidx+1];
206
207         /* Get outer coordinate index */
208         inr              = iinr[iidx];
209         i_coord_offset   = DIM*inr;
210
211         /* Load i particle coords and add shift vector */
212         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
213                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
214         
215         fix0             = _mm_setzero_ps();
216         fiy0             = _mm_setzero_ps();
217         fiz0             = _mm_setzero_ps();
218         fix1             = _mm_setzero_ps();
219         fiy1             = _mm_setzero_ps();
220         fiz1             = _mm_setzero_ps();
221         fix2             = _mm_setzero_ps();
222         fiy2             = _mm_setzero_ps();
223         fiz2             = _mm_setzero_ps();
224         fix3             = _mm_setzero_ps();
225         fiy3             = _mm_setzero_ps();
226         fiz3             = _mm_setzero_ps();
227
228         /* Reset potential sums */
229         velecsum         = _mm_setzero_ps();
230         vvdwsum          = _mm_setzero_ps();
231
232         /* Start inner kernel loop */
233         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
234         {
235
236             /* Get j neighbor index, and coordinate index */
237             jnrA             = jjnr[jidx];
238             jnrB             = jjnr[jidx+1];
239             jnrC             = jjnr[jidx+2];
240             jnrD             = jjnr[jidx+3];
241             j_coord_offsetA  = DIM*jnrA;
242             j_coord_offsetB  = DIM*jnrB;
243             j_coord_offsetC  = DIM*jnrC;
244             j_coord_offsetD  = DIM*jnrD;
245
246             /* load j atom coordinates */
247             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
248                                               x+j_coord_offsetC,x+j_coord_offsetD,
249                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
250                                               &jy2,&jz2,&jx3,&jy3,&jz3);
251
252             /* Calculate displacement vector */
253             dx00             = _mm_sub_ps(ix0,jx0);
254             dy00             = _mm_sub_ps(iy0,jy0);
255             dz00             = _mm_sub_ps(iz0,jz0);
256             dx11             = _mm_sub_ps(ix1,jx1);
257             dy11             = _mm_sub_ps(iy1,jy1);
258             dz11             = _mm_sub_ps(iz1,jz1);
259             dx12             = _mm_sub_ps(ix1,jx2);
260             dy12             = _mm_sub_ps(iy1,jy2);
261             dz12             = _mm_sub_ps(iz1,jz2);
262             dx13             = _mm_sub_ps(ix1,jx3);
263             dy13             = _mm_sub_ps(iy1,jy3);
264             dz13             = _mm_sub_ps(iz1,jz3);
265             dx21             = _mm_sub_ps(ix2,jx1);
266             dy21             = _mm_sub_ps(iy2,jy1);
267             dz21             = _mm_sub_ps(iz2,jz1);
268             dx22             = _mm_sub_ps(ix2,jx2);
269             dy22             = _mm_sub_ps(iy2,jy2);
270             dz22             = _mm_sub_ps(iz2,jz2);
271             dx23             = _mm_sub_ps(ix2,jx3);
272             dy23             = _mm_sub_ps(iy2,jy3);
273             dz23             = _mm_sub_ps(iz2,jz3);
274             dx31             = _mm_sub_ps(ix3,jx1);
275             dy31             = _mm_sub_ps(iy3,jy1);
276             dz31             = _mm_sub_ps(iz3,jz1);
277             dx32             = _mm_sub_ps(ix3,jx2);
278             dy32             = _mm_sub_ps(iy3,jy2);
279             dz32             = _mm_sub_ps(iz3,jz2);
280             dx33             = _mm_sub_ps(ix3,jx3);
281             dy33             = _mm_sub_ps(iy3,jy3);
282             dz33             = _mm_sub_ps(iz3,jz3);
283
284             /* Calculate squared distance and things based on it */
285             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
286             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
287             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
288             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
289             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
290             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
291             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
292             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
293             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
294             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
295
296             rinv00           = sse2_invsqrt_f(rsq00);
297             rinv11           = sse2_invsqrt_f(rsq11);
298             rinv12           = sse2_invsqrt_f(rsq12);
299             rinv13           = sse2_invsqrt_f(rsq13);
300             rinv21           = sse2_invsqrt_f(rsq21);
301             rinv22           = sse2_invsqrt_f(rsq22);
302             rinv23           = sse2_invsqrt_f(rsq23);
303             rinv31           = sse2_invsqrt_f(rsq31);
304             rinv32           = sse2_invsqrt_f(rsq32);
305             rinv33           = sse2_invsqrt_f(rsq33);
306
307             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
308             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
309             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
310             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
311             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
312             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
313             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
314             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
315             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
316             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
317
318             fjx0             = _mm_setzero_ps();
319             fjy0             = _mm_setzero_ps();
320             fjz0             = _mm_setzero_ps();
321             fjx1             = _mm_setzero_ps();
322             fjy1             = _mm_setzero_ps();
323             fjz1             = _mm_setzero_ps();
324             fjx2             = _mm_setzero_ps();
325             fjy2             = _mm_setzero_ps();
326             fjz2             = _mm_setzero_ps();
327             fjx3             = _mm_setzero_ps();
328             fjy3             = _mm_setzero_ps();
329             fjz3             = _mm_setzero_ps();
330
331             /**************************
332              * CALCULATE INTERACTIONS *
333              **************************/
334
335             if (gmx_mm_any_lt(rsq00,rcutoff2))
336             {
337
338             r00              = _mm_mul_ps(rsq00,rinv00);
339
340             /* LENNARD-JONES DISPERSION/REPULSION */
341
342             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
343             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
344             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
345             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
346             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
347
348             d                = _mm_sub_ps(r00,rswitch);
349             d                = _mm_max_ps(d,_mm_setzero_ps());
350             d2               = _mm_mul_ps(d,d);
351             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
352
353             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
354
355             /* Evaluate switch function */
356             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
357             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
358             vvdw             = _mm_mul_ps(vvdw,sw);
359             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
360
361             /* Update potential sum for this i atom from the interaction with this j atom. */
362             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
363             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
364
365             fscal            = fvdw;
366
367             fscal            = _mm_and_ps(fscal,cutoff_mask);
368
369             /* Calculate temporary vectorial force */
370             tx               = _mm_mul_ps(fscal,dx00);
371             ty               = _mm_mul_ps(fscal,dy00);
372             tz               = _mm_mul_ps(fscal,dz00);
373
374             /* Update vectorial force */
375             fix0             = _mm_add_ps(fix0,tx);
376             fiy0             = _mm_add_ps(fiy0,ty);
377             fiz0             = _mm_add_ps(fiz0,tz);
378
379             fjx0             = _mm_add_ps(fjx0,tx);
380             fjy0             = _mm_add_ps(fjy0,ty);
381             fjz0             = _mm_add_ps(fjz0,tz);
382             
383             }
384
385             /**************************
386              * CALCULATE INTERACTIONS *
387              **************************/
388
389             if (gmx_mm_any_lt(rsq11,rcutoff2))
390             {
391
392             /* REACTION-FIELD ELECTROSTATICS */
393             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
394             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
395
396             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
397
398             /* Update potential sum for this i atom from the interaction with this j atom. */
399             velec            = _mm_and_ps(velec,cutoff_mask);
400             velecsum         = _mm_add_ps(velecsum,velec);
401
402             fscal            = felec;
403
404             fscal            = _mm_and_ps(fscal,cutoff_mask);
405
406             /* Calculate temporary vectorial force */
407             tx               = _mm_mul_ps(fscal,dx11);
408             ty               = _mm_mul_ps(fscal,dy11);
409             tz               = _mm_mul_ps(fscal,dz11);
410
411             /* Update vectorial force */
412             fix1             = _mm_add_ps(fix1,tx);
413             fiy1             = _mm_add_ps(fiy1,ty);
414             fiz1             = _mm_add_ps(fiz1,tz);
415
416             fjx1             = _mm_add_ps(fjx1,tx);
417             fjy1             = _mm_add_ps(fjy1,ty);
418             fjz1             = _mm_add_ps(fjz1,tz);
419             
420             }
421
422             /**************************
423              * CALCULATE INTERACTIONS *
424              **************************/
425
426             if (gmx_mm_any_lt(rsq12,rcutoff2))
427             {
428
429             /* REACTION-FIELD ELECTROSTATICS */
430             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
431             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
432
433             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
434
435             /* Update potential sum for this i atom from the interaction with this j atom. */
436             velec            = _mm_and_ps(velec,cutoff_mask);
437             velecsum         = _mm_add_ps(velecsum,velec);
438
439             fscal            = felec;
440
441             fscal            = _mm_and_ps(fscal,cutoff_mask);
442
443             /* Calculate temporary vectorial force */
444             tx               = _mm_mul_ps(fscal,dx12);
445             ty               = _mm_mul_ps(fscal,dy12);
446             tz               = _mm_mul_ps(fscal,dz12);
447
448             /* Update vectorial force */
449             fix1             = _mm_add_ps(fix1,tx);
450             fiy1             = _mm_add_ps(fiy1,ty);
451             fiz1             = _mm_add_ps(fiz1,tz);
452
453             fjx2             = _mm_add_ps(fjx2,tx);
454             fjy2             = _mm_add_ps(fjy2,ty);
455             fjz2             = _mm_add_ps(fjz2,tz);
456             
457             }
458
459             /**************************
460              * CALCULATE INTERACTIONS *
461              **************************/
462
463             if (gmx_mm_any_lt(rsq13,rcutoff2))
464             {
465
466             /* REACTION-FIELD ELECTROSTATICS */
467             velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
468             felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
469
470             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
471
472             /* Update potential sum for this i atom from the interaction with this j atom. */
473             velec            = _mm_and_ps(velec,cutoff_mask);
474             velecsum         = _mm_add_ps(velecsum,velec);
475
476             fscal            = felec;
477
478             fscal            = _mm_and_ps(fscal,cutoff_mask);
479
480             /* Calculate temporary vectorial force */
481             tx               = _mm_mul_ps(fscal,dx13);
482             ty               = _mm_mul_ps(fscal,dy13);
483             tz               = _mm_mul_ps(fscal,dz13);
484
485             /* Update vectorial force */
486             fix1             = _mm_add_ps(fix1,tx);
487             fiy1             = _mm_add_ps(fiy1,ty);
488             fiz1             = _mm_add_ps(fiz1,tz);
489
490             fjx3             = _mm_add_ps(fjx3,tx);
491             fjy3             = _mm_add_ps(fjy3,ty);
492             fjz3             = _mm_add_ps(fjz3,tz);
493             
494             }
495
496             /**************************
497              * CALCULATE INTERACTIONS *
498              **************************/
499
500             if (gmx_mm_any_lt(rsq21,rcutoff2))
501             {
502
503             /* REACTION-FIELD ELECTROSTATICS */
504             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
505             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
506
507             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
508
509             /* Update potential sum for this i atom from the interaction with this j atom. */
510             velec            = _mm_and_ps(velec,cutoff_mask);
511             velecsum         = _mm_add_ps(velecsum,velec);
512
513             fscal            = felec;
514
515             fscal            = _mm_and_ps(fscal,cutoff_mask);
516
517             /* Calculate temporary vectorial force */
518             tx               = _mm_mul_ps(fscal,dx21);
519             ty               = _mm_mul_ps(fscal,dy21);
520             tz               = _mm_mul_ps(fscal,dz21);
521
522             /* Update vectorial force */
523             fix2             = _mm_add_ps(fix2,tx);
524             fiy2             = _mm_add_ps(fiy2,ty);
525             fiz2             = _mm_add_ps(fiz2,tz);
526
527             fjx1             = _mm_add_ps(fjx1,tx);
528             fjy1             = _mm_add_ps(fjy1,ty);
529             fjz1             = _mm_add_ps(fjz1,tz);
530             
531             }
532
533             /**************************
534              * CALCULATE INTERACTIONS *
535              **************************/
536
537             if (gmx_mm_any_lt(rsq22,rcutoff2))
538             {
539
540             /* REACTION-FIELD ELECTROSTATICS */
541             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
542             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
543
544             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
545
546             /* Update potential sum for this i atom from the interaction with this j atom. */
547             velec            = _mm_and_ps(velec,cutoff_mask);
548             velecsum         = _mm_add_ps(velecsum,velec);
549
550             fscal            = felec;
551
552             fscal            = _mm_and_ps(fscal,cutoff_mask);
553
554             /* Calculate temporary vectorial force */
555             tx               = _mm_mul_ps(fscal,dx22);
556             ty               = _mm_mul_ps(fscal,dy22);
557             tz               = _mm_mul_ps(fscal,dz22);
558
559             /* Update vectorial force */
560             fix2             = _mm_add_ps(fix2,tx);
561             fiy2             = _mm_add_ps(fiy2,ty);
562             fiz2             = _mm_add_ps(fiz2,tz);
563
564             fjx2             = _mm_add_ps(fjx2,tx);
565             fjy2             = _mm_add_ps(fjy2,ty);
566             fjz2             = _mm_add_ps(fjz2,tz);
567             
568             }
569
570             /**************************
571              * CALCULATE INTERACTIONS *
572              **************************/
573
574             if (gmx_mm_any_lt(rsq23,rcutoff2))
575             {
576
577             /* REACTION-FIELD ELECTROSTATICS */
578             velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
579             felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
580
581             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
582
583             /* Update potential sum for this i atom from the interaction with this j atom. */
584             velec            = _mm_and_ps(velec,cutoff_mask);
585             velecsum         = _mm_add_ps(velecsum,velec);
586
587             fscal            = felec;
588
589             fscal            = _mm_and_ps(fscal,cutoff_mask);
590
591             /* Calculate temporary vectorial force */
592             tx               = _mm_mul_ps(fscal,dx23);
593             ty               = _mm_mul_ps(fscal,dy23);
594             tz               = _mm_mul_ps(fscal,dz23);
595
596             /* Update vectorial force */
597             fix2             = _mm_add_ps(fix2,tx);
598             fiy2             = _mm_add_ps(fiy2,ty);
599             fiz2             = _mm_add_ps(fiz2,tz);
600
601             fjx3             = _mm_add_ps(fjx3,tx);
602             fjy3             = _mm_add_ps(fjy3,ty);
603             fjz3             = _mm_add_ps(fjz3,tz);
604             
605             }
606
607             /**************************
608              * CALCULATE INTERACTIONS *
609              **************************/
610
611             if (gmx_mm_any_lt(rsq31,rcutoff2))
612             {
613
614             /* REACTION-FIELD ELECTROSTATICS */
615             velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
616             felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
617
618             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
619
620             /* Update potential sum for this i atom from the interaction with this j atom. */
621             velec            = _mm_and_ps(velec,cutoff_mask);
622             velecsum         = _mm_add_ps(velecsum,velec);
623
624             fscal            = felec;
625
626             fscal            = _mm_and_ps(fscal,cutoff_mask);
627
628             /* Calculate temporary vectorial force */
629             tx               = _mm_mul_ps(fscal,dx31);
630             ty               = _mm_mul_ps(fscal,dy31);
631             tz               = _mm_mul_ps(fscal,dz31);
632
633             /* Update vectorial force */
634             fix3             = _mm_add_ps(fix3,tx);
635             fiy3             = _mm_add_ps(fiy3,ty);
636             fiz3             = _mm_add_ps(fiz3,tz);
637
638             fjx1             = _mm_add_ps(fjx1,tx);
639             fjy1             = _mm_add_ps(fjy1,ty);
640             fjz1             = _mm_add_ps(fjz1,tz);
641             
642             }
643
644             /**************************
645              * CALCULATE INTERACTIONS *
646              **************************/
647
648             if (gmx_mm_any_lt(rsq32,rcutoff2))
649             {
650
651             /* REACTION-FIELD ELECTROSTATICS */
652             velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
653             felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
654
655             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
656
657             /* Update potential sum for this i atom from the interaction with this j atom. */
658             velec            = _mm_and_ps(velec,cutoff_mask);
659             velecsum         = _mm_add_ps(velecsum,velec);
660
661             fscal            = felec;
662
663             fscal            = _mm_and_ps(fscal,cutoff_mask);
664
665             /* Calculate temporary vectorial force */
666             tx               = _mm_mul_ps(fscal,dx32);
667             ty               = _mm_mul_ps(fscal,dy32);
668             tz               = _mm_mul_ps(fscal,dz32);
669
670             /* Update vectorial force */
671             fix3             = _mm_add_ps(fix3,tx);
672             fiy3             = _mm_add_ps(fiy3,ty);
673             fiz3             = _mm_add_ps(fiz3,tz);
674
675             fjx2             = _mm_add_ps(fjx2,tx);
676             fjy2             = _mm_add_ps(fjy2,ty);
677             fjz2             = _mm_add_ps(fjz2,tz);
678             
679             }
680
681             /**************************
682              * CALCULATE INTERACTIONS *
683              **************************/
684
685             if (gmx_mm_any_lt(rsq33,rcutoff2))
686             {
687
688             /* REACTION-FIELD ELECTROSTATICS */
689             velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
690             felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
691
692             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
693
694             /* Update potential sum for this i atom from the interaction with this j atom. */
695             velec            = _mm_and_ps(velec,cutoff_mask);
696             velecsum         = _mm_add_ps(velecsum,velec);
697
698             fscal            = felec;
699
700             fscal            = _mm_and_ps(fscal,cutoff_mask);
701
702             /* Calculate temporary vectorial force */
703             tx               = _mm_mul_ps(fscal,dx33);
704             ty               = _mm_mul_ps(fscal,dy33);
705             tz               = _mm_mul_ps(fscal,dz33);
706
707             /* Update vectorial force */
708             fix3             = _mm_add_ps(fix3,tx);
709             fiy3             = _mm_add_ps(fiy3,ty);
710             fiz3             = _mm_add_ps(fiz3,tz);
711
712             fjx3             = _mm_add_ps(fjx3,tx);
713             fjy3             = _mm_add_ps(fjy3,ty);
714             fjz3             = _mm_add_ps(fjz3,tz);
715             
716             }
717
718             fjptrA             = f+j_coord_offsetA;
719             fjptrB             = f+j_coord_offsetB;
720             fjptrC             = f+j_coord_offsetC;
721             fjptrD             = f+j_coord_offsetD;
722
723             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
724                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
725                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
726
727             /* Inner loop uses 386 flops */
728         }
729
730         if(jidx<j_index_end)
731         {
732
733             /* Get j neighbor index, and coordinate index */
734             jnrlistA         = jjnr[jidx];
735             jnrlistB         = jjnr[jidx+1];
736             jnrlistC         = jjnr[jidx+2];
737             jnrlistD         = jjnr[jidx+3];
738             /* Sign of each element will be negative for non-real atoms.
739              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
740              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
741              */
742             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
743             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
744             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
745             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
746             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
747             j_coord_offsetA  = DIM*jnrA;
748             j_coord_offsetB  = DIM*jnrB;
749             j_coord_offsetC  = DIM*jnrC;
750             j_coord_offsetD  = DIM*jnrD;
751
752             /* load j atom coordinates */
753             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
754                                               x+j_coord_offsetC,x+j_coord_offsetD,
755                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
756                                               &jy2,&jz2,&jx3,&jy3,&jz3);
757
758             /* Calculate displacement vector */
759             dx00             = _mm_sub_ps(ix0,jx0);
760             dy00             = _mm_sub_ps(iy0,jy0);
761             dz00             = _mm_sub_ps(iz0,jz0);
762             dx11             = _mm_sub_ps(ix1,jx1);
763             dy11             = _mm_sub_ps(iy1,jy1);
764             dz11             = _mm_sub_ps(iz1,jz1);
765             dx12             = _mm_sub_ps(ix1,jx2);
766             dy12             = _mm_sub_ps(iy1,jy2);
767             dz12             = _mm_sub_ps(iz1,jz2);
768             dx13             = _mm_sub_ps(ix1,jx3);
769             dy13             = _mm_sub_ps(iy1,jy3);
770             dz13             = _mm_sub_ps(iz1,jz3);
771             dx21             = _mm_sub_ps(ix2,jx1);
772             dy21             = _mm_sub_ps(iy2,jy1);
773             dz21             = _mm_sub_ps(iz2,jz1);
774             dx22             = _mm_sub_ps(ix2,jx2);
775             dy22             = _mm_sub_ps(iy2,jy2);
776             dz22             = _mm_sub_ps(iz2,jz2);
777             dx23             = _mm_sub_ps(ix2,jx3);
778             dy23             = _mm_sub_ps(iy2,jy3);
779             dz23             = _mm_sub_ps(iz2,jz3);
780             dx31             = _mm_sub_ps(ix3,jx1);
781             dy31             = _mm_sub_ps(iy3,jy1);
782             dz31             = _mm_sub_ps(iz3,jz1);
783             dx32             = _mm_sub_ps(ix3,jx2);
784             dy32             = _mm_sub_ps(iy3,jy2);
785             dz32             = _mm_sub_ps(iz3,jz2);
786             dx33             = _mm_sub_ps(ix3,jx3);
787             dy33             = _mm_sub_ps(iy3,jy3);
788             dz33             = _mm_sub_ps(iz3,jz3);
789
790             /* Calculate squared distance and things based on it */
791             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
792             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
793             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
794             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
795             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
796             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
797             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
798             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
799             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
800             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
801
802             rinv00           = sse2_invsqrt_f(rsq00);
803             rinv11           = sse2_invsqrt_f(rsq11);
804             rinv12           = sse2_invsqrt_f(rsq12);
805             rinv13           = sse2_invsqrt_f(rsq13);
806             rinv21           = sse2_invsqrt_f(rsq21);
807             rinv22           = sse2_invsqrt_f(rsq22);
808             rinv23           = sse2_invsqrt_f(rsq23);
809             rinv31           = sse2_invsqrt_f(rsq31);
810             rinv32           = sse2_invsqrt_f(rsq32);
811             rinv33           = sse2_invsqrt_f(rsq33);
812
813             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
814             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
815             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
816             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
817             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
818             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
819             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
820             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
821             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
822             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
823
824             fjx0             = _mm_setzero_ps();
825             fjy0             = _mm_setzero_ps();
826             fjz0             = _mm_setzero_ps();
827             fjx1             = _mm_setzero_ps();
828             fjy1             = _mm_setzero_ps();
829             fjz1             = _mm_setzero_ps();
830             fjx2             = _mm_setzero_ps();
831             fjy2             = _mm_setzero_ps();
832             fjz2             = _mm_setzero_ps();
833             fjx3             = _mm_setzero_ps();
834             fjy3             = _mm_setzero_ps();
835             fjz3             = _mm_setzero_ps();
836
837             /**************************
838              * CALCULATE INTERACTIONS *
839              **************************/
840
841             if (gmx_mm_any_lt(rsq00,rcutoff2))
842             {
843
844             r00              = _mm_mul_ps(rsq00,rinv00);
845             r00              = _mm_andnot_ps(dummy_mask,r00);
846
847             /* LENNARD-JONES DISPERSION/REPULSION */
848
849             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
850             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
851             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
852             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
853             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
854
855             d                = _mm_sub_ps(r00,rswitch);
856             d                = _mm_max_ps(d,_mm_setzero_ps());
857             d2               = _mm_mul_ps(d,d);
858             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
859
860             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
861
862             /* Evaluate switch function */
863             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
864             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
865             vvdw             = _mm_mul_ps(vvdw,sw);
866             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
867
868             /* Update potential sum for this i atom from the interaction with this j atom. */
869             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
870             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
871             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
872
873             fscal            = fvdw;
874
875             fscal            = _mm_and_ps(fscal,cutoff_mask);
876
877             fscal            = _mm_andnot_ps(dummy_mask,fscal);
878
879             /* Calculate temporary vectorial force */
880             tx               = _mm_mul_ps(fscal,dx00);
881             ty               = _mm_mul_ps(fscal,dy00);
882             tz               = _mm_mul_ps(fscal,dz00);
883
884             /* Update vectorial force */
885             fix0             = _mm_add_ps(fix0,tx);
886             fiy0             = _mm_add_ps(fiy0,ty);
887             fiz0             = _mm_add_ps(fiz0,tz);
888
889             fjx0             = _mm_add_ps(fjx0,tx);
890             fjy0             = _mm_add_ps(fjy0,ty);
891             fjz0             = _mm_add_ps(fjz0,tz);
892             
893             }
894
895             /**************************
896              * CALCULATE INTERACTIONS *
897              **************************/
898
899             if (gmx_mm_any_lt(rsq11,rcutoff2))
900             {
901
902             /* REACTION-FIELD ELECTROSTATICS */
903             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
904             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
905
906             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
907
908             /* Update potential sum for this i atom from the interaction with this j atom. */
909             velec            = _mm_and_ps(velec,cutoff_mask);
910             velec            = _mm_andnot_ps(dummy_mask,velec);
911             velecsum         = _mm_add_ps(velecsum,velec);
912
913             fscal            = felec;
914
915             fscal            = _mm_and_ps(fscal,cutoff_mask);
916
917             fscal            = _mm_andnot_ps(dummy_mask,fscal);
918
919             /* Calculate temporary vectorial force */
920             tx               = _mm_mul_ps(fscal,dx11);
921             ty               = _mm_mul_ps(fscal,dy11);
922             tz               = _mm_mul_ps(fscal,dz11);
923
924             /* Update vectorial force */
925             fix1             = _mm_add_ps(fix1,tx);
926             fiy1             = _mm_add_ps(fiy1,ty);
927             fiz1             = _mm_add_ps(fiz1,tz);
928
929             fjx1             = _mm_add_ps(fjx1,tx);
930             fjy1             = _mm_add_ps(fjy1,ty);
931             fjz1             = _mm_add_ps(fjz1,tz);
932             
933             }
934
935             /**************************
936              * CALCULATE INTERACTIONS *
937              **************************/
938
939             if (gmx_mm_any_lt(rsq12,rcutoff2))
940             {
941
942             /* REACTION-FIELD ELECTROSTATICS */
943             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
944             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
945
946             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
947
948             /* Update potential sum for this i atom from the interaction with this j atom. */
949             velec            = _mm_and_ps(velec,cutoff_mask);
950             velec            = _mm_andnot_ps(dummy_mask,velec);
951             velecsum         = _mm_add_ps(velecsum,velec);
952
953             fscal            = felec;
954
955             fscal            = _mm_and_ps(fscal,cutoff_mask);
956
957             fscal            = _mm_andnot_ps(dummy_mask,fscal);
958
959             /* Calculate temporary vectorial force */
960             tx               = _mm_mul_ps(fscal,dx12);
961             ty               = _mm_mul_ps(fscal,dy12);
962             tz               = _mm_mul_ps(fscal,dz12);
963
964             /* Update vectorial force */
965             fix1             = _mm_add_ps(fix1,tx);
966             fiy1             = _mm_add_ps(fiy1,ty);
967             fiz1             = _mm_add_ps(fiz1,tz);
968
969             fjx2             = _mm_add_ps(fjx2,tx);
970             fjy2             = _mm_add_ps(fjy2,ty);
971             fjz2             = _mm_add_ps(fjz2,tz);
972             
973             }
974
975             /**************************
976              * CALCULATE INTERACTIONS *
977              **************************/
978
979             if (gmx_mm_any_lt(rsq13,rcutoff2))
980             {
981
982             /* REACTION-FIELD ELECTROSTATICS */
983             velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
984             felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
985
986             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
987
988             /* Update potential sum for this i atom from the interaction with this j atom. */
989             velec            = _mm_and_ps(velec,cutoff_mask);
990             velec            = _mm_andnot_ps(dummy_mask,velec);
991             velecsum         = _mm_add_ps(velecsum,velec);
992
993             fscal            = felec;
994
995             fscal            = _mm_and_ps(fscal,cutoff_mask);
996
997             fscal            = _mm_andnot_ps(dummy_mask,fscal);
998
999             /* Calculate temporary vectorial force */
1000             tx               = _mm_mul_ps(fscal,dx13);
1001             ty               = _mm_mul_ps(fscal,dy13);
1002             tz               = _mm_mul_ps(fscal,dz13);
1003
1004             /* Update vectorial force */
1005             fix1             = _mm_add_ps(fix1,tx);
1006             fiy1             = _mm_add_ps(fiy1,ty);
1007             fiz1             = _mm_add_ps(fiz1,tz);
1008
1009             fjx3             = _mm_add_ps(fjx3,tx);
1010             fjy3             = _mm_add_ps(fjy3,ty);
1011             fjz3             = _mm_add_ps(fjz3,tz);
1012             
1013             }
1014
1015             /**************************
1016              * CALCULATE INTERACTIONS *
1017              **************************/
1018
1019             if (gmx_mm_any_lt(rsq21,rcutoff2))
1020             {
1021
1022             /* REACTION-FIELD ELECTROSTATICS */
1023             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
1024             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1025
1026             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1027
1028             /* Update potential sum for this i atom from the interaction with this j atom. */
1029             velec            = _mm_and_ps(velec,cutoff_mask);
1030             velec            = _mm_andnot_ps(dummy_mask,velec);
1031             velecsum         = _mm_add_ps(velecsum,velec);
1032
1033             fscal            = felec;
1034
1035             fscal            = _mm_and_ps(fscal,cutoff_mask);
1036
1037             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1038
1039             /* Calculate temporary vectorial force */
1040             tx               = _mm_mul_ps(fscal,dx21);
1041             ty               = _mm_mul_ps(fscal,dy21);
1042             tz               = _mm_mul_ps(fscal,dz21);
1043
1044             /* Update vectorial force */
1045             fix2             = _mm_add_ps(fix2,tx);
1046             fiy2             = _mm_add_ps(fiy2,ty);
1047             fiz2             = _mm_add_ps(fiz2,tz);
1048
1049             fjx1             = _mm_add_ps(fjx1,tx);
1050             fjy1             = _mm_add_ps(fjy1,ty);
1051             fjz1             = _mm_add_ps(fjz1,tz);
1052             
1053             }
1054
1055             /**************************
1056              * CALCULATE INTERACTIONS *
1057              **************************/
1058
1059             if (gmx_mm_any_lt(rsq22,rcutoff2))
1060             {
1061
1062             /* REACTION-FIELD ELECTROSTATICS */
1063             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
1064             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1065
1066             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1067
1068             /* Update potential sum for this i atom from the interaction with this j atom. */
1069             velec            = _mm_and_ps(velec,cutoff_mask);
1070             velec            = _mm_andnot_ps(dummy_mask,velec);
1071             velecsum         = _mm_add_ps(velecsum,velec);
1072
1073             fscal            = felec;
1074
1075             fscal            = _mm_and_ps(fscal,cutoff_mask);
1076
1077             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1078
1079             /* Calculate temporary vectorial force */
1080             tx               = _mm_mul_ps(fscal,dx22);
1081             ty               = _mm_mul_ps(fscal,dy22);
1082             tz               = _mm_mul_ps(fscal,dz22);
1083
1084             /* Update vectorial force */
1085             fix2             = _mm_add_ps(fix2,tx);
1086             fiy2             = _mm_add_ps(fiy2,ty);
1087             fiz2             = _mm_add_ps(fiz2,tz);
1088
1089             fjx2             = _mm_add_ps(fjx2,tx);
1090             fjy2             = _mm_add_ps(fjy2,ty);
1091             fjz2             = _mm_add_ps(fjz2,tz);
1092             
1093             }
1094
1095             /**************************
1096              * CALCULATE INTERACTIONS *
1097              **************************/
1098
1099             if (gmx_mm_any_lt(rsq23,rcutoff2))
1100             {
1101
1102             /* REACTION-FIELD ELECTROSTATICS */
1103             velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
1104             felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
1105
1106             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
1107
1108             /* Update potential sum for this i atom from the interaction with this j atom. */
1109             velec            = _mm_and_ps(velec,cutoff_mask);
1110             velec            = _mm_andnot_ps(dummy_mask,velec);
1111             velecsum         = _mm_add_ps(velecsum,velec);
1112
1113             fscal            = felec;
1114
1115             fscal            = _mm_and_ps(fscal,cutoff_mask);
1116
1117             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1118
1119             /* Calculate temporary vectorial force */
1120             tx               = _mm_mul_ps(fscal,dx23);
1121             ty               = _mm_mul_ps(fscal,dy23);
1122             tz               = _mm_mul_ps(fscal,dz23);
1123
1124             /* Update vectorial force */
1125             fix2             = _mm_add_ps(fix2,tx);
1126             fiy2             = _mm_add_ps(fiy2,ty);
1127             fiz2             = _mm_add_ps(fiz2,tz);
1128
1129             fjx3             = _mm_add_ps(fjx3,tx);
1130             fjy3             = _mm_add_ps(fjy3,ty);
1131             fjz3             = _mm_add_ps(fjz3,tz);
1132             
1133             }
1134
1135             /**************************
1136              * CALCULATE INTERACTIONS *
1137              **************************/
1138
1139             if (gmx_mm_any_lt(rsq31,rcutoff2))
1140             {
1141
1142             /* REACTION-FIELD ELECTROSTATICS */
1143             velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
1144             felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
1145
1146             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
1147
1148             /* Update potential sum for this i atom from the interaction with this j atom. */
1149             velec            = _mm_and_ps(velec,cutoff_mask);
1150             velec            = _mm_andnot_ps(dummy_mask,velec);
1151             velecsum         = _mm_add_ps(velecsum,velec);
1152
1153             fscal            = felec;
1154
1155             fscal            = _mm_and_ps(fscal,cutoff_mask);
1156
1157             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1158
1159             /* Calculate temporary vectorial force */
1160             tx               = _mm_mul_ps(fscal,dx31);
1161             ty               = _mm_mul_ps(fscal,dy31);
1162             tz               = _mm_mul_ps(fscal,dz31);
1163
1164             /* Update vectorial force */
1165             fix3             = _mm_add_ps(fix3,tx);
1166             fiy3             = _mm_add_ps(fiy3,ty);
1167             fiz3             = _mm_add_ps(fiz3,tz);
1168
1169             fjx1             = _mm_add_ps(fjx1,tx);
1170             fjy1             = _mm_add_ps(fjy1,ty);
1171             fjz1             = _mm_add_ps(fjz1,tz);
1172             
1173             }
1174
1175             /**************************
1176              * CALCULATE INTERACTIONS *
1177              **************************/
1178
1179             if (gmx_mm_any_lt(rsq32,rcutoff2))
1180             {
1181
1182             /* REACTION-FIELD ELECTROSTATICS */
1183             velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
1184             felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
1185
1186             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
1187
1188             /* Update potential sum for this i atom from the interaction with this j atom. */
1189             velec            = _mm_and_ps(velec,cutoff_mask);
1190             velec            = _mm_andnot_ps(dummy_mask,velec);
1191             velecsum         = _mm_add_ps(velecsum,velec);
1192
1193             fscal            = felec;
1194
1195             fscal            = _mm_and_ps(fscal,cutoff_mask);
1196
1197             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1198
1199             /* Calculate temporary vectorial force */
1200             tx               = _mm_mul_ps(fscal,dx32);
1201             ty               = _mm_mul_ps(fscal,dy32);
1202             tz               = _mm_mul_ps(fscal,dz32);
1203
1204             /* Update vectorial force */
1205             fix3             = _mm_add_ps(fix3,tx);
1206             fiy3             = _mm_add_ps(fiy3,ty);
1207             fiz3             = _mm_add_ps(fiz3,tz);
1208
1209             fjx2             = _mm_add_ps(fjx2,tx);
1210             fjy2             = _mm_add_ps(fjy2,ty);
1211             fjz2             = _mm_add_ps(fjz2,tz);
1212             
1213             }
1214
1215             /**************************
1216              * CALCULATE INTERACTIONS *
1217              **************************/
1218
1219             if (gmx_mm_any_lt(rsq33,rcutoff2))
1220             {
1221
1222             /* REACTION-FIELD ELECTROSTATICS */
1223             velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
1224             felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
1225
1226             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
1227
1228             /* Update potential sum for this i atom from the interaction with this j atom. */
1229             velec            = _mm_and_ps(velec,cutoff_mask);
1230             velec            = _mm_andnot_ps(dummy_mask,velec);
1231             velecsum         = _mm_add_ps(velecsum,velec);
1232
1233             fscal            = felec;
1234
1235             fscal            = _mm_and_ps(fscal,cutoff_mask);
1236
1237             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1238
1239             /* Calculate temporary vectorial force */
1240             tx               = _mm_mul_ps(fscal,dx33);
1241             ty               = _mm_mul_ps(fscal,dy33);
1242             tz               = _mm_mul_ps(fscal,dz33);
1243
1244             /* Update vectorial force */
1245             fix3             = _mm_add_ps(fix3,tx);
1246             fiy3             = _mm_add_ps(fiy3,ty);
1247             fiz3             = _mm_add_ps(fiz3,tz);
1248
1249             fjx3             = _mm_add_ps(fjx3,tx);
1250             fjy3             = _mm_add_ps(fjy3,ty);
1251             fjz3             = _mm_add_ps(fjz3,tz);
1252             
1253             }
1254
1255             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1256             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1257             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1258             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1259
1260             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1261                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1262                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1263
1264             /* Inner loop uses 387 flops */
1265         }
1266
1267         /* End of innermost loop */
1268
1269         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1270                                               f+i_coord_offset,fshift+i_shift_offset);
1271
1272         ggid                        = gid[iidx];
1273         /* Update potential energies */
1274         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1275         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1276
1277         /* Increment number of inner iterations */
1278         inneriter                  += j_index_end - j_index_start;
1279
1280         /* Outer loop uses 26 flops */
1281     }
1282
1283     /* Increment number of outer iterations */
1284     outeriter        += nri;
1285
1286     /* Update outer/inner flops */
1287
1288     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*387);
1289 }
1290 /*
1291  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse2_single
1292  * Electrostatics interaction: ReactionField
1293  * VdW interaction:            LennardJones
1294  * Geometry:                   Water4-Water4
1295  * Calculate force/pot:        Force
1296  */
1297 void
1298 nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse2_single
1299                     (t_nblist                    * gmx_restrict       nlist,
1300                      rvec                        * gmx_restrict          xx,
1301                      rvec                        * gmx_restrict          ff,
1302                      struct t_forcerec           * gmx_restrict          fr,
1303                      t_mdatoms                   * gmx_restrict     mdatoms,
1304                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1305                      t_nrnb                      * gmx_restrict        nrnb)
1306 {
1307     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1308      * just 0 for non-waters.
1309      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1310      * jnr indices corresponding to data put in the four positions in the SIMD register.
1311      */
1312     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1313     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1314     int              jnrA,jnrB,jnrC,jnrD;
1315     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1316     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1317     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1318     real             rcutoff_scalar;
1319     real             *shiftvec,*fshift,*x,*f;
1320     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1321     real             scratch[4*DIM];
1322     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1323     int              vdwioffset0;
1324     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1325     int              vdwioffset1;
1326     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1327     int              vdwioffset2;
1328     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1329     int              vdwioffset3;
1330     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1331     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1332     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1333     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1334     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1335     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1336     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1337     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1338     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1339     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1340     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1341     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1342     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1343     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1344     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1345     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1346     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1347     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1348     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1349     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1350     real             *charge;
1351     int              nvdwtype;
1352     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1353     int              *vdwtype;
1354     real             *vdwparam;
1355     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1356     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1357     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1358     real             rswitch_scalar,d_scalar;
1359     __m128           dummy_mask,cutoff_mask;
1360     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1361     __m128           one     = _mm_set1_ps(1.0);
1362     __m128           two     = _mm_set1_ps(2.0);
1363     x                = xx[0];
1364     f                = ff[0];
1365
1366     nri              = nlist->nri;
1367     iinr             = nlist->iinr;
1368     jindex           = nlist->jindex;
1369     jjnr             = nlist->jjnr;
1370     shiftidx         = nlist->shift;
1371     gid              = nlist->gid;
1372     shiftvec         = fr->shift_vec[0];
1373     fshift           = fr->fshift[0];
1374     facel            = _mm_set1_ps(fr->ic->epsfac);
1375     charge           = mdatoms->chargeA;
1376     krf              = _mm_set1_ps(fr->ic->k_rf);
1377     krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
1378     crf              = _mm_set1_ps(fr->ic->c_rf);
1379     nvdwtype         = fr->ntype;
1380     vdwparam         = fr->nbfp;
1381     vdwtype          = mdatoms->typeA;
1382
1383     /* Setup water-specific parameters */
1384     inr              = nlist->iinr[0];
1385     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1386     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1387     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1388     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1389
1390     jq1              = _mm_set1_ps(charge[inr+1]);
1391     jq2              = _mm_set1_ps(charge[inr+2]);
1392     jq3              = _mm_set1_ps(charge[inr+3]);
1393     vdwjidx0A        = 2*vdwtype[inr+0];
1394     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1395     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1396     qq11             = _mm_mul_ps(iq1,jq1);
1397     qq12             = _mm_mul_ps(iq1,jq2);
1398     qq13             = _mm_mul_ps(iq1,jq3);
1399     qq21             = _mm_mul_ps(iq2,jq1);
1400     qq22             = _mm_mul_ps(iq2,jq2);
1401     qq23             = _mm_mul_ps(iq2,jq3);
1402     qq31             = _mm_mul_ps(iq3,jq1);
1403     qq32             = _mm_mul_ps(iq3,jq2);
1404     qq33             = _mm_mul_ps(iq3,jq3);
1405
1406     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1407     rcutoff_scalar   = fr->ic->rcoulomb;
1408     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1409     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1410
1411     rswitch_scalar   = fr->ic->rvdw_switch;
1412     rswitch          = _mm_set1_ps(rswitch_scalar);
1413     /* Setup switch parameters */
1414     d_scalar         = rcutoff_scalar-rswitch_scalar;
1415     d                = _mm_set1_ps(d_scalar);
1416     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1417     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1418     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1419     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1420     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1421     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1422
1423     /* Avoid stupid compiler warnings */
1424     jnrA = jnrB = jnrC = jnrD = 0;
1425     j_coord_offsetA = 0;
1426     j_coord_offsetB = 0;
1427     j_coord_offsetC = 0;
1428     j_coord_offsetD = 0;
1429
1430     outeriter        = 0;
1431     inneriter        = 0;
1432
1433     for(iidx=0;iidx<4*DIM;iidx++)
1434     {
1435         scratch[iidx] = 0.0;
1436     }  
1437
1438     /* Start outer loop over neighborlists */
1439     for(iidx=0; iidx<nri; iidx++)
1440     {
1441         /* Load shift vector for this list */
1442         i_shift_offset   = DIM*shiftidx[iidx];
1443
1444         /* Load limits for loop over neighbors */
1445         j_index_start    = jindex[iidx];
1446         j_index_end      = jindex[iidx+1];
1447
1448         /* Get outer coordinate index */
1449         inr              = iinr[iidx];
1450         i_coord_offset   = DIM*inr;
1451
1452         /* Load i particle coords and add shift vector */
1453         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1454                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1455         
1456         fix0             = _mm_setzero_ps();
1457         fiy0             = _mm_setzero_ps();
1458         fiz0             = _mm_setzero_ps();
1459         fix1             = _mm_setzero_ps();
1460         fiy1             = _mm_setzero_ps();
1461         fiz1             = _mm_setzero_ps();
1462         fix2             = _mm_setzero_ps();
1463         fiy2             = _mm_setzero_ps();
1464         fiz2             = _mm_setzero_ps();
1465         fix3             = _mm_setzero_ps();
1466         fiy3             = _mm_setzero_ps();
1467         fiz3             = _mm_setzero_ps();
1468
1469         /* Start inner kernel loop */
1470         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1471         {
1472
1473             /* Get j neighbor index, and coordinate index */
1474             jnrA             = jjnr[jidx];
1475             jnrB             = jjnr[jidx+1];
1476             jnrC             = jjnr[jidx+2];
1477             jnrD             = jjnr[jidx+3];
1478             j_coord_offsetA  = DIM*jnrA;
1479             j_coord_offsetB  = DIM*jnrB;
1480             j_coord_offsetC  = DIM*jnrC;
1481             j_coord_offsetD  = DIM*jnrD;
1482
1483             /* load j atom coordinates */
1484             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1485                                               x+j_coord_offsetC,x+j_coord_offsetD,
1486                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1487                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1488
1489             /* Calculate displacement vector */
1490             dx00             = _mm_sub_ps(ix0,jx0);
1491             dy00             = _mm_sub_ps(iy0,jy0);
1492             dz00             = _mm_sub_ps(iz0,jz0);
1493             dx11             = _mm_sub_ps(ix1,jx1);
1494             dy11             = _mm_sub_ps(iy1,jy1);
1495             dz11             = _mm_sub_ps(iz1,jz1);
1496             dx12             = _mm_sub_ps(ix1,jx2);
1497             dy12             = _mm_sub_ps(iy1,jy2);
1498             dz12             = _mm_sub_ps(iz1,jz2);
1499             dx13             = _mm_sub_ps(ix1,jx3);
1500             dy13             = _mm_sub_ps(iy1,jy3);
1501             dz13             = _mm_sub_ps(iz1,jz3);
1502             dx21             = _mm_sub_ps(ix2,jx1);
1503             dy21             = _mm_sub_ps(iy2,jy1);
1504             dz21             = _mm_sub_ps(iz2,jz1);
1505             dx22             = _mm_sub_ps(ix2,jx2);
1506             dy22             = _mm_sub_ps(iy2,jy2);
1507             dz22             = _mm_sub_ps(iz2,jz2);
1508             dx23             = _mm_sub_ps(ix2,jx3);
1509             dy23             = _mm_sub_ps(iy2,jy3);
1510             dz23             = _mm_sub_ps(iz2,jz3);
1511             dx31             = _mm_sub_ps(ix3,jx1);
1512             dy31             = _mm_sub_ps(iy3,jy1);
1513             dz31             = _mm_sub_ps(iz3,jz1);
1514             dx32             = _mm_sub_ps(ix3,jx2);
1515             dy32             = _mm_sub_ps(iy3,jy2);
1516             dz32             = _mm_sub_ps(iz3,jz2);
1517             dx33             = _mm_sub_ps(ix3,jx3);
1518             dy33             = _mm_sub_ps(iy3,jy3);
1519             dz33             = _mm_sub_ps(iz3,jz3);
1520
1521             /* Calculate squared distance and things based on it */
1522             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1523             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1524             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1525             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1526             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1527             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1528             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1529             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1530             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1531             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1532
1533             rinv00           = sse2_invsqrt_f(rsq00);
1534             rinv11           = sse2_invsqrt_f(rsq11);
1535             rinv12           = sse2_invsqrt_f(rsq12);
1536             rinv13           = sse2_invsqrt_f(rsq13);
1537             rinv21           = sse2_invsqrt_f(rsq21);
1538             rinv22           = sse2_invsqrt_f(rsq22);
1539             rinv23           = sse2_invsqrt_f(rsq23);
1540             rinv31           = sse2_invsqrt_f(rsq31);
1541             rinv32           = sse2_invsqrt_f(rsq32);
1542             rinv33           = sse2_invsqrt_f(rsq33);
1543
1544             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1545             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1546             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1547             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1548             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1549             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1550             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1551             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1552             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1553             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1554
1555             fjx0             = _mm_setzero_ps();
1556             fjy0             = _mm_setzero_ps();
1557             fjz0             = _mm_setzero_ps();
1558             fjx1             = _mm_setzero_ps();
1559             fjy1             = _mm_setzero_ps();
1560             fjz1             = _mm_setzero_ps();
1561             fjx2             = _mm_setzero_ps();
1562             fjy2             = _mm_setzero_ps();
1563             fjz2             = _mm_setzero_ps();
1564             fjx3             = _mm_setzero_ps();
1565             fjy3             = _mm_setzero_ps();
1566             fjz3             = _mm_setzero_ps();
1567
1568             /**************************
1569              * CALCULATE INTERACTIONS *
1570              **************************/
1571
1572             if (gmx_mm_any_lt(rsq00,rcutoff2))
1573             {
1574
1575             r00              = _mm_mul_ps(rsq00,rinv00);
1576
1577             /* LENNARD-JONES DISPERSION/REPULSION */
1578
1579             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1580             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
1581             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1582             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1583             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1584
1585             d                = _mm_sub_ps(r00,rswitch);
1586             d                = _mm_max_ps(d,_mm_setzero_ps());
1587             d2               = _mm_mul_ps(d,d);
1588             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1589
1590             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1591
1592             /* Evaluate switch function */
1593             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1594             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1595             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1596
1597             fscal            = fvdw;
1598
1599             fscal            = _mm_and_ps(fscal,cutoff_mask);
1600
1601             /* Calculate temporary vectorial force */
1602             tx               = _mm_mul_ps(fscal,dx00);
1603             ty               = _mm_mul_ps(fscal,dy00);
1604             tz               = _mm_mul_ps(fscal,dz00);
1605
1606             /* Update vectorial force */
1607             fix0             = _mm_add_ps(fix0,tx);
1608             fiy0             = _mm_add_ps(fiy0,ty);
1609             fiz0             = _mm_add_ps(fiz0,tz);
1610
1611             fjx0             = _mm_add_ps(fjx0,tx);
1612             fjy0             = _mm_add_ps(fjy0,ty);
1613             fjz0             = _mm_add_ps(fjz0,tz);
1614             
1615             }
1616
1617             /**************************
1618              * CALCULATE INTERACTIONS *
1619              **************************/
1620
1621             if (gmx_mm_any_lt(rsq11,rcutoff2))
1622             {
1623
1624             /* REACTION-FIELD ELECTROSTATICS */
1625             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1626
1627             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1628
1629             fscal            = felec;
1630
1631             fscal            = _mm_and_ps(fscal,cutoff_mask);
1632
1633             /* Calculate temporary vectorial force */
1634             tx               = _mm_mul_ps(fscal,dx11);
1635             ty               = _mm_mul_ps(fscal,dy11);
1636             tz               = _mm_mul_ps(fscal,dz11);
1637
1638             /* Update vectorial force */
1639             fix1             = _mm_add_ps(fix1,tx);
1640             fiy1             = _mm_add_ps(fiy1,ty);
1641             fiz1             = _mm_add_ps(fiz1,tz);
1642
1643             fjx1             = _mm_add_ps(fjx1,tx);
1644             fjy1             = _mm_add_ps(fjy1,ty);
1645             fjz1             = _mm_add_ps(fjz1,tz);
1646             
1647             }
1648
1649             /**************************
1650              * CALCULATE INTERACTIONS *
1651              **************************/
1652
1653             if (gmx_mm_any_lt(rsq12,rcutoff2))
1654             {
1655
1656             /* REACTION-FIELD ELECTROSTATICS */
1657             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1658
1659             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1660
1661             fscal            = felec;
1662
1663             fscal            = _mm_and_ps(fscal,cutoff_mask);
1664
1665             /* Calculate temporary vectorial force */
1666             tx               = _mm_mul_ps(fscal,dx12);
1667             ty               = _mm_mul_ps(fscal,dy12);
1668             tz               = _mm_mul_ps(fscal,dz12);
1669
1670             /* Update vectorial force */
1671             fix1             = _mm_add_ps(fix1,tx);
1672             fiy1             = _mm_add_ps(fiy1,ty);
1673             fiz1             = _mm_add_ps(fiz1,tz);
1674
1675             fjx2             = _mm_add_ps(fjx2,tx);
1676             fjy2             = _mm_add_ps(fjy2,ty);
1677             fjz2             = _mm_add_ps(fjz2,tz);
1678             
1679             }
1680
1681             /**************************
1682              * CALCULATE INTERACTIONS *
1683              **************************/
1684
1685             if (gmx_mm_any_lt(rsq13,rcutoff2))
1686             {
1687
1688             /* REACTION-FIELD ELECTROSTATICS */
1689             felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
1690
1691             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
1692
1693             fscal            = felec;
1694
1695             fscal            = _mm_and_ps(fscal,cutoff_mask);
1696
1697             /* Calculate temporary vectorial force */
1698             tx               = _mm_mul_ps(fscal,dx13);
1699             ty               = _mm_mul_ps(fscal,dy13);
1700             tz               = _mm_mul_ps(fscal,dz13);
1701
1702             /* Update vectorial force */
1703             fix1             = _mm_add_ps(fix1,tx);
1704             fiy1             = _mm_add_ps(fiy1,ty);
1705             fiz1             = _mm_add_ps(fiz1,tz);
1706
1707             fjx3             = _mm_add_ps(fjx3,tx);
1708             fjy3             = _mm_add_ps(fjy3,ty);
1709             fjz3             = _mm_add_ps(fjz3,tz);
1710             
1711             }
1712
1713             /**************************
1714              * CALCULATE INTERACTIONS *
1715              **************************/
1716
1717             if (gmx_mm_any_lt(rsq21,rcutoff2))
1718             {
1719
1720             /* REACTION-FIELD ELECTROSTATICS */
1721             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1722
1723             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1724
1725             fscal            = felec;
1726
1727             fscal            = _mm_and_ps(fscal,cutoff_mask);
1728
1729             /* Calculate temporary vectorial force */
1730             tx               = _mm_mul_ps(fscal,dx21);
1731             ty               = _mm_mul_ps(fscal,dy21);
1732             tz               = _mm_mul_ps(fscal,dz21);
1733
1734             /* Update vectorial force */
1735             fix2             = _mm_add_ps(fix2,tx);
1736             fiy2             = _mm_add_ps(fiy2,ty);
1737             fiz2             = _mm_add_ps(fiz2,tz);
1738
1739             fjx1             = _mm_add_ps(fjx1,tx);
1740             fjy1             = _mm_add_ps(fjy1,ty);
1741             fjz1             = _mm_add_ps(fjz1,tz);
1742             
1743             }
1744
1745             /**************************
1746              * CALCULATE INTERACTIONS *
1747              **************************/
1748
1749             if (gmx_mm_any_lt(rsq22,rcutoff2))
1750             {
1751
1752             /* REACTION-FIELD ELECTROSTATICS */
1753             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1754
1755             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1756
1757             fscal            = felec;
1758
1759             fscal            = _mm_and_ps(fscal,cutoff_mask);
1760
1761             /* Calculate temporary vectorial force */
1762             tx               = _mm_mul_ps(fscal,dx22);
1763             ty               = _mm_mul_ps(fscal,dy22);
1764             tz               = _mm_mul_ps(fscal,dz22);
1765
1766             /* Update vectorial force */
1767             fix2             = _mm_add_ps(fix2,tx);
1768             fiy2             = _mm_add_ps(fiy2,ty);
1769             fiz2             = _mm_add_ps(fiz2,tz);
1770
1771             fjx2             = _mm_add_ps(fjx2,tx);
1772             fjy2             = _mm_add_ps(fjy2,ty);
1773             fjz2             = _mm_add_ps(fjz2,tz);
1774             
1775             }
1776
1777             /**************************
1778              * CALCULATE INTERACTIONS *
1779              **************************/
1780
1781             if (gmx_mm_any_lt(rsq23,rcutoff2))
1782             {
1783
1784             /* REACTION-FIELD ELECTROSTATICS */
1785             felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
1786
1787             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
1788
1789             fscal            = felec;
1790
1791             fscal            = _mm_and_ps(fscal,cutoff_mask);
1792
1793             /* Calculate temporary vectorial force */
1794             tx               = _mm_mul_ps(fscal,dx23);
1795             ty               = _mm_mul_ps(fscal,dy23);
1796             tz               = _mm_mul_ps(fscal,dz23);
1797
1798             /* Update vectorial force */
1799             fix2             = _mm_add_ps(fix2,tx);
1800             fiy2             = _mm_add_ps(fiy2,ty);
1801             fiz2             = _mm_add_ps(fiz2,tz);
1802
1803             fjx3             = _mm_add_ps(fjx3,tx);
1804             fjy3             = _mm_add_ps(fjy3,ty);
1805             fjz3             = _mm_add_ps(fjz3,tz);
1806             
1807             }
1808
1809             /**************************
1810              * CALCULATE INTERACTIONS *
1811              **************************/
1812
1813             if (gmx_mm_any_lt(rsq31,rcutoff2))
1814             {
1815
1816             /* REACTION-FIELD ELECTROSTATICS */
1817             felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
1818
1819             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
1820
1821             fscal            = felec;
1822
1823             fscal            = _mm_and_ps(fscal,cutoff_mask);
1824
1825             /* Calculate temporary vectorial force */
1826             tx               = _mm_mul_ps(fscal,dx31);
1827             ty               = _mm_mul_ps(fscal,dy31);
1828             tz               = _mm_mul_ps(fscal,dz31);
1829
1830             /* Update vectorial force */
1831             fix3             = _mm_add_ps(fix3,tx);
1832             fiy3             = _mm_add_ps(fiy3,ty);
1833             fiz3             = _mm_add_ps(fiz3,tz);
1834
1835             fjx1             = _mm_add_ps(fjx1,tx);
1836             fjy1             = _mm_add_ps(fjy1,ty);
1837             fjz1             = _mm_add_ps(fjz1,tz);
1838             
1839             }
1840
1841             /**************************
1842              * CALCULATE INTERACTIONS *
1843              **************************/
1844
1845             if (gmx_mm_any_lt(rsq32,rcutoff2))
1846             {
1847
1848             /* REACTION-FIELD ELECTROSTATICS */
1849             felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
1850
1851             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
1852
1853             fscal            = felec;
1854
1855             fscal            = _mm_and_ps(fscal,cutoff_mask);
1856
1857             /* Calculate temporary vectorial force */
1858             tx               = _mm_mul_ps(fscal,dx32);
1859             ty               = _mm_mul_ps(fscal,dy32);
1860             tz               = _mm_mul_ps(fscal,dz32);
1861
1862             /* Update vectorial force */
1863             fix3             = _mm_add_ps(fix3,tx);
1864             fiy3             = _mm_add_ps(fiy3,ty);
1865             fiz3             = _mm_add_ps(fiz3,tz);
1866
1867             fjx2             = _mm_add_ps(fjx2,tx);
1868             fjy2             = _mm_add_ps(fjy2,ty);
1869             fjz2             = _mm_add_ps(fjz2,tz);
1870             
1871             }
1872
1873             /**************************
1874              * CALCULATE INTERACTIONS *
1875              **************************/
1876
1877             if (gmx_mm_any_lt(rsq33,rcutoff2))
1878             {
1879
1880             /* REACTION-FIELD ELECTROSTATICS */
1881             felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
1882
1883             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
1884
1885             fscal            = felec;
1886
1887             fscal            = _mm_and_ps(fscal,cutoff_mask);
1888
1889             /* Calculate temporary vectorial force */
1890             tx               = _mm_mul_ps(fscal,dx33);
1891             ty               = _mm_mul_ps(fscal,dy33);
1892             tz               = _mm_mul_ps(fscal,dz33);
1893
1894             /* Update vectorial force */
1895             fix3             = _mm_add_ps(fix3,tx);
1896             fiy3             = _mm_add_ps(fiy3,ty);
1897             fiz3             = _mm_add_ps(fiz3,tz);
1898
1899             fjx3             = _mm_add_ps(fjx3,tx);
1900             fjy3             = _mm_add_ps(fjy3,ty);
1901             fjz3             = _mm_add_ps(fjz3,tz);
1902             
1903             }
1904
1905             fjptrA             = f+j_coord_offsetA;
1906             fjptrB             = f+j_coord_offsetB;
1907             fjptrC             = f+j_coord_offsetC;
1908             fjptrD             = f+j_coord_offsetD;
1909
1910             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1911                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1912                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1913
1914             /* Inner loop uses 329 flops */
1915         }
1916
1917         if(jidx<j_index_end)
1918         {
1919
1920             /* Get j neighbor index, and coordinate index */
1921             jnrlistA         = jjnr[jidx];
1922             jnrlistB         = jjnr[jidx+1];
1923             jnrlistC         = jjnr[jidx+2];
1924             jnrlistD         = jjnr[jidx+3];
1925             /* Sign of each element will be negative for non-real atoms.
1926              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1927              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1928              */
1929             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1930             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1931             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1932             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1933             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1934             j_coord_offsetA  = DIM*jnrA;
1935             j_coord_offsetB  = DIM*jnrB;
1936             j_coord_offsetC  = DIM*jnrC;
1937             j_coord_offsetD  = DIM*jnrD;
1938
1939             /* load j atom coordinates */
1940             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1941                                               x+j_coord_offsetC,x+j_coord_offsetD,
1942                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1943                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1944
1945             /* Calculate displacement vector */
1946             dx00             = _mm_sub_ps(ix0,jx0);
1947             dy00             = _mm_sub_ps(iy0,jy0);
1948             dz00             = _mm_sub_ps(iz0,jz0);
1949             dx11             = _mm_sub_ps(ix1,jx1);
1950             dy11             = _mm_sub_ps(iy1,jy1);
1951             dz11             = _mm_sub_ps(iz1,jz1);
1952             dx12             = _mm_sub_ps(ix1,jx2);
1953             dy12             = _mm_sub_ps(iy1,jy2);
1954             dz12             = _mm_sub_ps(iz1,jz2);
1955             dx13             = _mm_sub_ps(ix1,jx3);
1956             dy13             = _mm_sub_ps(iy1,jy3);
1957             dz13             = _mm_sub_ps(iz1,jz3);
1958             dx21             = _mm_sub_ps(ix2,jx1);
1959             dy21             = _mm_sub_ps(iy2,jy1);
1960             dz21             = _mm_sub_ps(iz2,jz1);
1961             dx22             = _mm_sub_ps(ix2,jx2);
1962             dy22             = _mm_sub_ps(iy2,jy2);
1963             dz22             = _mm_sub_ps(iz2,jz2);
1964             dx23             = _mm_sub_ps(ix2,jx3);
1965             dy23             = _mm_sub_ps(iy2,jy3);
1966             dz23             = _mm_sub_ps(iz2,jz3);
1967             dx31             = _mm_sub_ps(ix3,jx1);
1968             dy31             = _mm_sub_ps(iy3,jy1);
1969             dz31             = _mm_sub_ps(iz3,jz1);
1970             dx32             = _mm_sub_ps(ix3,jx2);
1971             dy32             = _mm_sub_ps(iy3,jy2);
1972             dz32             = _mm_sub_ps(iz3,jz2);
1973             dx33             = _mm_sub_ps(ix3,jx3);
1974             dy33             = _mm_sub_ps(iy3,jy3);
1975             dz33             = _mm_sub_ps(iz3,jz3);
1976
1977             /* Calculate squared distance and things based on it */
1978             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1979             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1980             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1981             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1982             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1983             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1984             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1985             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1986             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1987             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1988
1989             rinv00           = sse2_invsqrt_f(rsq00);
1990             rinv11           = sse2_invsqrt_f(rsq11);
1991             rinv12           = sse2_invsqrt_f(rsq12);
1992             rinv13           = sse2_invsqrt_f(rsq13);
1993             rinv21           = sse2_invsqrt_f(rsq21);
1994             rinv22           = sse2_invsqrt_f(rsq22);
1995             rinv23           = sse2_invsqrt_f(rsq23);
1996             rinv31           = sse2_invsqrt_f(rsq31);
1997             rinv32           = sse2_invsqrt_f(rsq32);
1998             rinv33           = sse2_invsqrt_f(rsq33);
1999
2000             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
2001             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
2002             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
2003             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
2004             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
2005             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
2006             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
2007             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
2008             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
2009             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
2010
2011             fjx0             = _mm_setzero_ps();
2012             fjy0             = _mm_setzero_ps();
2013             fjz0             = _mm_setzero_ps();
2014             fjx1             = _mm_setzero_ps();
2015             fjy1             = _mm_setzero_ps();
2016             fjz1             = _mm_setzero_ps();
2017             fjx2             = _mm_setzero_ps();
2018             fjy2             = _mm_setzero_ps();
2019             fjz2             = _mm_setzero_ps();
2020             fjx3             = _mm_setzero_ps();
2021             fjy3             = _mm_setzero_ps();
2022             fjz3             = _mm_setzero_ps();
2023
2024             /**************************
2025              * CALCULATE INTERACTIONS *
2026              **************************/
2027
2028             if (gmx_mm_any_lt(rsq00,rcutoff2))
2029             {
2030
2031             r00              = _mm_mul_ps(rsq00,rinv00);
2032             r00              = _mm_andnot_ps(dummy_mask,r00);
2033
2034             /* LENNARD-JONES DISPERSION/REPULSION */
2035
2036             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2037             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
2038             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2039             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
2040             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2041
2042             d                = _mm_sub_ps(r00,rswitch);
2043             d                = _mm_max_ps(d,_mm_setzero_ps());
2044             d2               = _mm_mul_ps(d,d);
2045             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2046
2047             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2048
2049             /* Evaluate switch function */
2050             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2051             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2052             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
2053
2054             fscal            = fvdw;
2055
2056             fscal            = _mm_and_ps(fscal,cutoff_mask);
2057
2058             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2059
2060             /* Calculate temporary vectorial force */
2061             tx               = _mm_mul_ps(fscal,dx00);
2062             ty               = _mm_mul_ps(fscal,dy00);
2063             tz               = _mm_mul_ps(fscal,dz00);
2064
2065             /* Update vectorial force */
2066             fix0             = _mm_add_ps(fix0,tx);
2067             fiy0             = _mm_add_ps(fiy0,ty);
2068             fiz0             = _mm_add_ps(fiz0,tz);
2069
2070             fjx0             = _mm_add_ps(fjx0,tx);
2071             fjy0             = _mm_add_ps(fjy0,ty);
2072             fjz0             = _mm_add_ps(fjz0,tz);
2073             
2074             }
2075
2076             /**************************
2077              * CALCULATE INTERACTIONS *
2078              **************************/
2079
2080             if (gmx_mm_any_lt(rsq11,rcutoff2))
2081             {
2082
2083             /* REACTION-FIELD ELECTROSTATICS */
2084             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
2085
2086             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2087
2088             fscal            = felec;
2089
2090             fscal            = _mm_and_ps(fscal,cutoff_mask);
2091
2092             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2093
2094             /* Calculate temporary vectorial force */
2095             tx               = _mm_mul_ps(fscal,dx11);
2096             ty               = _mm_mul_ps(fscal,dy11);
2097             tz               = _mm_mul_ps(fscal,dz11);
2098
2099             /* Update vectorial force */
2100             fix1             = _mm_add_ps(fix1,tx);
2101             fiy1             = _mm_add_ps(fiy1,ty);
2102             fiz1             = _mm_add_ps(fiz1,tz);
2103
2104             fjx1             = _mm_add_ps(fjx1,tx);
2105             fjy1             = _mm_add_ps(fjy1,ty);
2106             fjz1             = _mm_add_ps(fjz1,tz);
2107             
2108             }
2109
2110             /**************************
2111              * CALCULATE INTERACTIONS *
2112              **************************/
2113
2114             if (gmx_mm_any_lt(rsq12,rcutoff2))
2115             {
2116
2117             /* REACTION-FIELD ELECTROSTATICS */
2118             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
2119
2120             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2121
2122             fscal            = felec;
2123
2124             fscal            = _mm_and_ps(fscal,cutoff_mask);
2125
2126             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2127
2128             /* Calculate temporary vectorial force */
2129             tx               = _mm_mul_ps(fscal,dx12);
2130             ty               = _mm_mul_ps(fscal,dy12);
2131             tz               = _mm_mul_ps(fscal,dz12);
2132
2133             /* Update vectorial force */
2134             fix1             = _mm_add_ps(fix1,tx);
2135             fiy1             = _mm_add_ps(fiy1,ty);
2136             fiz1             = _mm_add_ps(fiz1,tz);
2137
2138             fjx2             = _mm_add_ps(fjx2,tx);
2139             fjy2             = _mm_add_ps(fjy2,ty);
2140             fjz2             = _mm_add_ps(fjz2,tz);
2141             
2142             }
2143
2144             /**************************
2145              * CALCULATE INTERACTIONS *
2146              **************************/
2147
2148             if (gmx_mm_any_lt(rsq13,rcutoff2))
2149             {
2150
2151             /* REACTION-FIELD ELECTROSTATICS */
2152             felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
2153
2154             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
2155
2156             fscal            = felec;
2157
2158             fscal            = _mm_and_ps(fscal,cutoff_mask);
2159
2160             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2161
2162             /* Calculate temporary vectorial force */
2163             tx               = _mm_mul_ps(fscal,dx13);
2164             ty               = _mm_mul_ps(fscal,dy13);
2165             tz               = _mm_mul_ps(fscal,dz13);
2166
2167             /* Update vectorial force */
2168             fix1             = _mm_add_ps(fix1,tx);
2169             fiy1             = _mm_add_ps(fiy1,ty);
2170             fiz1             = _mm_add_ps(fiz1,tz);
2171
2172             fjx3             = _mm_add_ps(fjx3,tx);
2173             fjy3             = _mm_add_ps(fjy3,ty);
2174             fjz3             = _mm_add_ps(fjz3,tz);
2175             
2176             }
2177
2178             /**************************
2179              * CALCULATE INTERACTIONS *
2180              **************************/
2181
2182             if (gmx_mm_any_lt(rsq21,rcutoff2))
2183             {
2184
2185             /* REACTION-FIELD ELECTROSTATICS */
2186             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
2187
2188             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2189
2190             fscal            = felec;
2191
2192             fscal            = _mm_and_ps(fscal,cutoff_mask);
2193
2194             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2195
2196             /* Calculate temporary vectorial force */
2197             tx               = _mm_mul_ps(fscal,dx21);
2198             ty               = _mm_mul_ps(fscal,dy21);
2199             tz               = _mm_mul_ps(fscal,dz21);
2200
2201             /* Update vectorial force */
2202             fix2             = _mm_add_ps(fix2,tx);
2203             fiy2             = _mm_add_ps(fiy2,ty);
2204             fiz2             = _mm_add_ps(fiz2,tz);
2205
2206             fjx1             = _mm_add_ps(fjx1,tx);
2207             fjy1             = _mm_add_ps(fjy1,ty);
2208             fjz1             = _mm_add_ps(fjz1,tz);
2209             
2210             }
2211
2212             /**************************
2213              * CALCULATE INTERACTIONS *
2214              **************************/
2215
2216             if (gmx_mm_any_lt(rsq22,rcutoff2))
2217             {
2218
2219             /* REACTION-FIELD ELECTROSTATICS */
2220             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
2221
2222             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2223
2224             fscal            = felec;
2225
2226             fscal            = _mm_and_ps(fscal,cutoff_mask);
2227
2228             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2229
2230             /* Calculate temporary vectorial force */
2231             tx               = _mm_mul_ps(fscal,dx22);
2232             ty               = _mm_mul_ps(fscal,dy22);
2233             tz               = _mm_mul_ps(fscal,dz22);
2234
2235             /* Update vectorial force */
2236             fix2             = _mm_add_ps(fix2,tx);
2237             fiy2             = _mm_add_ps(fiy2,ty);
2238             fiz2             = _mm_add_ps(fiz2,tz);
2239
2240             fjx2             = _mm_add_ps(fjx2,tx);
2241             fjy2             = _mm_add_ps(fjy2,ty);
2242             fjz2             = _mm_add_ps(fjz2,tz);
2243             
2244             }
2245
2246             /**************************
2247              * CALCULATE INTERACTIONS *
2248              **************************/
2249
2250             if (gmx_mm_any_lt(rsq23,rcutoff2))
2251             {
2252
2253             /* REACTION-FIELD ELECTROSTATICS */
2254             felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
2255
2256             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
2257
2258             fscal            = felec;
2259
2260             fscal            = _mm_and_ps(fscal,cutoff_mask);
2261
2262             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2263
2264             /* Calculate temporary vectorial force */
2265             tx               = _mm_mul_ps(fscal,dx23);
2266             ty               = _mm_mul_ps(fscal,dy23);
2267             tz               = _mm_mul_ps(fscal,dz23);
2268
2269             /* Update vectorial force */
2270             fix2             = _mm_add_ps(fix2,tx);
2271             fiy2             = _mm_add_ps(fiy2,ty);
2272             fiz2             = _mm_add_ps(fiz2,tz);
2273
2274             fjx3             = _mm_add_ps(fjx3,tx);
2275             fjy3             = _mm_add_ps(fjy3,ty);
2276             fjz3             = _mm_add_ps(fjz3,tz);
2277             
2278             }
2279
2280             /**************************
2281              * CALCULATE INTERACTIONS *
2282              **************************/
2283
2284             if (gmx_mm_any_lt(rsq31,rcutoff2))
2285             {
2286
2287             /* REACTION-FIELD ELECTROSTATICS */
2288             felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
2289
2290             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
2291
2292             fscal            = felec;
2293
2294             fscal            = _mm_and_ps(fscal,cutoff_mask);
2295
2296             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2297
2298             /* Calculate temporary vectorial force */
2299             tx               = _mm_mul_ps(fscal,dx31);
2300             ty               = _mm_mul_ps(fscal,dy31);
2301             tz               = _mm_mul_ps(fscal,dz31);
2302
2303             /* Update vectorial force */
2304             fix3             = _mm_add_ps(fix3,tx);
2305             fiy3             = _mm_add_ps(fiy3,ty);
2306             fiz3             = _mm_add_ps(fiz3,tz);
2307
2308             fjx1             = _mm_add_ps(fjx1,tx);
2309             fjy1             = _mm_add_ps(fjy1,ty);
2310             fjz1             = _mm_add_ps(fjz1,tz);
2311             
2312             }
2313
2314             /**************************
2315              * CALCULATE INTERACTIONS *
2316              **************************/
2317
2318             if (gmx_mm_any_lt(rsq32,rcutoff2))
2319             {
2320
2321             /* REACTION-FIELD ELECTROSTATICS */
2322             felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
2323
2324             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
2325
2326             fscal            = felec;
2327
2328             fscal            = _mm_and_ps(fscal,cutoff_mask);
2329
2330             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2331
2332             /* Calculate temporary vectorial force */
2333             tx               = _mm_mul_ps(fscal,dx32);
2334             ty               = _mm_mul_ps(fscal,dy32);
2335             tz               = _mm_mul_ps(fscal,dz32);
2336
2337             /* Update vectorial force */
2338             fix3             = _mm_add_ps(fix3,tx);
2339             fiy3             = _mm_add_ps(fiy3,ty);
2340             fiz3             = _mm_add_ps(fiz3,tz);
2341
2342             fjx2             = _mm_add_ps(fjx2,tx);
2343             fjy2             = _mm_add_ps(fjy2,ty);
2344             fjz2             = _mm_add_ps(fjz2,tz);
2345             
2346             }
2347
2348             /**************************
2349              * CALCULATE INTERACTIONS *
2350              **************************/
2351
2352             if (gmx_mm_any_lt(rsq33,rcutoff2))
2353             {
2354
2355             /* REACTION-FIELD ELECTROSTATICS */
2356             felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
2357
2358             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
2359
2360             fscal            = felec;
2361
2362             fscal            = _mm_and_ps(fscal,cutoff_mask);
2363
2364             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2365
2366             /* Calculate temporary vectorial force */
2367             tx               = _mm_mul_ps(fscal,dx33);
2368             ty               = _mm_mul_ps(fscal,dy33);
2369             tz               = _mm_mul_ps(fscal,dz33);
2370
2371             /* Update vectorial force */
2372             fix3             = _mm_add_ps(fix3,tx);
2373             fiy3             = _mm_add_ps(fiy3,ty);
2374             fiz3             = _mm_add_ps(fiz3,tz);
2375
2376             fjx3             = _mm_add_ps(fjx3,tx);
2377             fjy3             = _mm_add_ps(fjy3,ty);
2378             fjz3             = _mm_add_ps(fjz3,tz);
2379             
2380             }
2381
2382             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2383             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2384             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2385             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2386
2387             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2388                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2389                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2390
2391             /* Inner loop uses 330 flops */
2392         }
2393
2394         /* End of innermost loop */
2395
2396         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2397                                               f+i_coord_offset,fshift+i_shift_offset);
2398
2399         /* Increment number of inner iterations */
2400         inneriter                  += j_index_end - j_index_start;
2401
2402         /* Outer loop uses 24 flops */
2403     }
2404
2405     /* Increment number of outer iterations */
2406     outeriter        += nri;
2407
2408     /* Update outer/inner flops */
2409
2410     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*330);
2411 }