Rename remaining GMX_ACCELERATION to GMX_CPU_ACCELERATION
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_sse2_single.c
1 /*
2  * Note: this file was generated by the Gromacs sse2_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse2_single
38  * Electrostatics interaction: ReactionField
39  * VdW interaction:            LennardJones
40  * Geometry:                   Water4-Water4
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse2_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
62     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
63     real             shX,shY,shZ,rcutoff_scalar;
64     real             *shiftvec,*fshift,*x,*f;
65     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
66     int              vdwioffset0;
67     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
68     int              vdwioffset1;
69     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
70     int              vdwioffset2;
71     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72     int              vdwioffset3;
73     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
74     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
75     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
76     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
77     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
78     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
79     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
80     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
81     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
82     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
84     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
85     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
86     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
87     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
88     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
89     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
90     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
91     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
92     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
93     real             *charge;
94     int              nvdwtype;
95     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
96     int              *vdwtype;
97     real             *vdwparam;
98     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
99     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
100     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
101     real             rswitch_scalar,d_scalar;
102     __m128           dummy_mask,cutoff_mask;
103     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
104     __m128           one     = _mm_set1_ps(1.0);
105     __m128           two     = _mm_set1_ps(2.0);
106     x                = xx[0];
107     f                = ff[0];
108
109     nri              = nlist->nri;
110     iinr             = nlist->iinr;
111     jindex           = nlist->jindex;
112     jjnr             = nlist->jjnr;
113     shiftidx         = nlist->shift;
114     gid              = nlist->gid;
115     shiftvec         = fr->shift_vec[0];
116     fshift           = fr->fshift[0];
117     facel            = _mm_set1_ps(fr->epsfac);
118     charge           = mdatoms->chargeA;
119     krf              = _mm_set1_ps(fr->ic->k_rf);
120     krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
121     crf              = _mm_set1_ps(fr->ic->c_rf);
122     nvdwtype         = fr->ntype;
123     vdwparam         = fr->nbfp;
124     vdwtype          = mdatoms->typeA;
125
126     /* Setup water-specific parameters */
127     inr              = nlist->iinr[0];
128     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
129     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
130     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
131     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
132
133     jq1              = _mm_set1_ps(charge[inr+1]);
134     jq2              = _mm_set1_ps(charge[inr+2]);
135     jq3              = _mm_set1_ps(charge[inr+3]);
136     vdwjidx0A        = 2*vdwtype[inr+0];
137     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
138     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
139     qq11             = _mm_mul_ps(iq1,jq1);
140     qq12             = _mm_mul_ps(iq1,jq2);
141     qq13             = _mm_mul_ps(iq1,jq3);
142     qq21             = _mm_mul_ps(iq2,jq1);
143     qq22             = _mm_mul_ps(iq2,jq2);
144     qq23             = _mm_mul_ps(iq2,jq3);
145     qq31             = _mm_mul_ps(iq3,jq1);
146     qq32             = _mm_mul_ps(iq3,jq2);
147     qq33             = _mm_mul_ps(iq3,jq3);
148
149     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
150     rcutoff_scalar   = fr->rcoulomb;
151     rcutoff          = _mm_set1_ps(rcutoff_scalar);
152     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
153
154     rswitch_scalar   = fr->rvdw_switch;
155     rswitch          = _mm_set1_ps(rswitch_scalar);
156     /* Setup switch parameters */
157     d_scalar         = rcutoff_scalar-rswitch_scalar;
158     d                = _mm_set1_ps(d_scalar);
159     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
160     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
161     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
162     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
163     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
164     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
165
166     /* Avoid stupid compiler warnings */
167     jnrA = jnrB = jnrC = jnrD = 0;
168     j_coord_offsetA = 0;
169     j_coord_offsetB = 0;
170     j_coord_offsetC = 0;
171     j_coord_offsetD = 0;
172
173     outeriter        = 0;
174     inneriter        = 0;
175
176     /* Start outer loop over neighborlists */
177     for(iidx=0; iidx<nri; iidx++)
178     {
179         /* Load shift vector for this list */
180         i_shift_offset   = DIM*shiftidx[iidx];
181         shX              = shiftvec[i_shift_offset+XX];
182         shY              = shiftvec[i_shift_offset+YY];
183         shZ              = shiftvec[i_shift_offset+ZZ];
184
185         /* Load limits for loop over neighbors */
186         j_index_start    = jindex[iidx];
187         j_index_end      = jindex[iidx+1];
188
189         /* Get outer coordinate index */
190         inr              = iinr[iidx];
191         i_coord_offset   = DIM*inr;
192
193         /* Load i particle coords and add shift vector */
194         ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
195         iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
196         iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
197         ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
198         iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
199         iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
200         ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
201         iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
202         iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
203         ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
204         iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
205         iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
206
207         fix0             = _mm_setzero_ps();
208         fiy0             = _mm_setzero_ps();
209         fiz0             = _mm_setzero_ps();
210         fix1             = _mm_setzero_ps();
211         fiy1             = _mm_setzero_ps();
212         fiz1             = _mm_setzero_ps();
213         fix2             = _mm_setzero_ps();
214         fiy2             = _mm_setzero_ps();
215         fiz2             = _mm_setzero_ps();
216         fix3             = _mm_setzero_ps();
217         fiy3             = _mm_setzero_ps();
218         fiz3             = _mm_setzero_ps();
219
220         /* Reset potential sums */
221         velecsum         = _mm_setzero_ps();
222         vvdwsum          = _mm_setzero_ps();
223
224         /* Start inner kernel loop */
225         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
226         {
227
228             /* Get j neighbor index, and coordinate index */
229             jnrA             = jjnr[jidx];
230             jnrB             = jjnr[jidx+1];
231             jnrC             = jjnr[jidx+2];
232             jnrD             = jjnr[jidx+3];
233
234             j_coord_offsetA  = DIM*jnrA;
235             j_coord_offsetB  = DIM*jnrB;
236             j_coord_offsetC  = DIM*jnrC;
237             j_coord_offsetD  = DIM*jnrD;
238
239             /* load j atom coordinates */
240             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
241                                               x+j_coord_offsetC,x+j_coord_offsetD,
242                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
243                                               &jy2,&jz2,&jx3,&jy3,&jz3);
244
245             /* Calculate displacement vector */
246             dx00             = _mm_sub_ps(ix0,jx0);
247             dy00             = _mm_sub_ps(iy0,jy0);
248             dz00             = _mm_sub_ps(iz0,jz0);
249             dx11             = _mm_sub_ps(ix1,jx1);
250             dy11             = _mm_sub_ps(iy1,jy1);
251             dz11             = _mm_sub_ps(iz1,jz1);
252             dx12             = _mm_sub_ps(ix1,jx2);
253             dy12             = _mm_sub_ps(iy1,jy2);
254             dz12             = _mm_sub_ps(iz1,jz2);
255             dx13             = _mm_sub_ps(ix1,jx3);
256             dy13             = _mm_sub_ps(iy1,jy3);
257             dz13             = _mm_sub_ps(iz1,jz3);
258             dx21             = _mm_sub_ps(ix2,jx1);
259             dy21             = _mm_sub_ps(iy2,jy1);
260             dz21             = _mm_sub_ps(iz2,jz1);
261             dx22             = _mm_sub_ps(ix2,jx2);
262             dy22             = _mm_sub_ps(iy2,jy2);
263             dz22             = _mm_sub_ps(iz2,jz2);
264             dx23             = _mm_sub_ps(ix2,jx3);
265             dy23             = _mm_sub_ps(iy2,jy3);
266             dz23             = _mm_sub_ps(iz2,jz3);
267             dx31             = _mm_sub_ps(ix3,jx1);
268             dy31             = _mm_sub_ps(iy3,jy1);
269             dz31             = _mm_sub_ps(iz3,jz1);
270             dx32             = _mm_sub_ps(ix3,jx2);
271             dy32             = _mm_sub_ps(iy3,jy2);
272             dz32             = _mm_sub_ps(iz3,jz2);
273             dx33             = _mm_sub_ps(ix3,jx3);
274             dy33             = _mm_sub_ps(iy3,jy3);
275             dz33             = _mm_sub_ps(iz3,jz3);
276
277             /* Calculate squared distance and things based on it */
278             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
279             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
280             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
281             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
282             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
283             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
284             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
285             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
286             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
287             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
288
289             rinv00           = gmx_mm_invsqrt_ps(rsq00);
290             rinv11           = gmx_mm_invsqrt_ps(rsq11);
291             rinv12           = gmx_mm_invsqrt_ps(rsq12);
292             rinv13           = gmx_mm_invsqrt_ps(rsq13);
293             rinv21           = gmx_mm_invsqrt_ps(rsq21);
294             rinv22           = gmx_mm_invsqrt_ps(rsq22);
295             rinv23           = gmx_mm_invsqrt_ps(rsq23);
296             rinv31           = gmx_mm_invsqrt_ps(rsq31);
297             rinv32           = gmx_mm_invsqrt_ps(rsq32);
298             rinv33           = gmx_mm_invsqrt_ps(rsq33);
299
300             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
301             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
302             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
303             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
304             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
305             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
306             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
307             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
308             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
309             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
310
311             fjx0             = _mm_setzero_ps();
312             fjy0             = _mm_setzero_ps();
313             fjz0             = _mm_setzero_ps();
314             fjx1             = _mm_setzero_ps();
315             fjy1             = _mm_setzero_ps();
316             fjz1             = _mm_setzero_ps();
317             fjx2             = _mm_setzero_ps();
318             fjy2             = _mm_setzero_ps();
319             fjz2             = _mm_setzero_ps();
320             fjx3             = _mm_setzero_ps();
321             fjy3             = _mm_setzero_ps();
322             fjz3             = _mm_setzero_ps();
323
324             /**************************
325              * CALCULATE INTERACTIONS *
326              **************************/
327
328             if (gmx_mm_any_lt(rsq00,rcutoff2))
329             {
330
331             r00              = _mm_mul_ps(rsq00,rinv00);
332
333             /* LENNARD-JONES DISPERSION/REPULSION */
334
335             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
336             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
337             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
338             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
339             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
340
341             d                = _mm_sub_ps(r00,rswitch);
342             d                = _mm_max_ps(d,_mm_setzero_ps());
343             d2               = _mm_mul_ps(d,d);
344             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
345
346             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
347
348             /* Evaluate switch function */
349             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
350             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
351             vvdw             = _mm_mul_ps(vvdw,sw);
352             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
353
354             /* Update potential sum for this i atom from the interaction with this j atom. */
355             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
356             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
357
358             fscal            = fvdw;
359
360             fscal            = _mm_and_ps(fscal,cutoff_mask);
361
362             /* Calculate temporary vectorial force */
363             tx               = _mm_mul_ps(fscal,dx00);
364             ty               = _mm_mul_ps(fscal,dy00);
365             tz               = _mm_mul_ps(fscal,dz00);
366
367             /* Update vectorial force */
368             fix0             = _mm_add_ps(fix0,tx);
369             fiy0             = _mm_add_ps(fiy0,ty);
370             fiz0             = _mm_add_ps(fiz0,tz);
371
372             fjx0             = _mm_add_ps(fjx0,tx);
373             fjy0             = _mm_add_ps(fjy0,ty);
374             fjz0             = _mm_add_ps(fjz0,tz);
375
376             }
377
378             /**************************
379              * CALCULATE INTERACTIONS *
380              **************************/
381
382             if (gmx_mm_any_lt(rsq11,rcutoff2))
383             {
384
385             /* REACTION-FIELD ELECTROSTATICS */
386             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
387             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
388
389             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
390
391             /* Update potential sum for this i atom from the interaction with this j atom. */
392             velec            = _mm_and_ps(velec,cutoff_mask);
393             velecsum         = _mm_add_ps(velecsum,velec);
394
395             fscal            = felec;
396
397             fscal            = _mm_and_ps(fscal,cutoff_mask);
398
399             /* Calculate temporary vectorial force */
400             tx               = _mm_mul_ps(fscal,dx11);
401             ty               = _mm_mul_ps(fscal,dy11);
402             tz               = _mm_mul_ps(fscal,dz11);
403
404             /* Update vectorial force */
405             fix1             = _mm_add_ps(fix1,tx);
406             fiy1             = _mm_add_ps(fiy1,ty);
407             fiz1             = _mm_add_ps(fiz1,tz);
408
409             fjx1             = _mm_add_ps(fjx1,tx);
410             fjy1             = _mm_add_ps(fjy1,ty);
411             fjz1             = _mm_add_ps(fjz1,tz);
412
413             }
414
415             /**************************
416              * CALCULATE INTERACTIONS *
417              **************************/
418
419             if (gmx_mm_any_lt(rsq12,rcutoff2))
420             {
421
422             /* REACTION-FIELD ELECTROSTATICS */
423             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
424             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
425
426             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
427
428             /* Update potential sum for this i atom from the interaction with this j atom. */
429             velec            = _mm_and_ps(velec,cutoff_mask);
430             velecsum         = _mm_add_ps(velecsum,velec);
431
432             fscal            = felec;
433
434             fscal            = _mm_and_ps(fscal,cutoff_mask);
435
436             /* Calculate temporary vectorial force */
437             tx               = _mm_mul_ps(fscal,dx12);
438             ty               = _mm_mul_ps(fscal,dy12);
439             tz               = _mm_mul_ps(fscal,dz12);
440
441             /* Update vectorial force */
442             fix1             = _mm_add_ps(fix1,tx);
443             fiy1             = _mm_add_ps(fiy1,ty);
444             fiz1             = _mm_add_ps(fiz1,tz);
445
446             fjx2             = _mm_add_ps(fjx2,tx);
447             fjy2             = _mm_add_ps(fjy2,ty);
448             fjz2             = _mm_add_ps(fjz2,tz);
449
450             }
451
452             /**************************
453              * CALCULATE INTERACTIONS *
454              **************************/
455
456             if (gmx_mm_any_lt(rsq13,rcutoff2))
457             {
458
459             /* REACTION-FIELD ELECTROSTATICS */
460             velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
461             felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
462
463             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
464
465             /* Update potential sum for this i atom from the interaction with this j atom. */
466             velec            = _mm_and_ps(velec,cutoff_mask);
467             velecsum         = _mm_add_ps(velecsum,velec);
468
469             fscal            = felec;
470
471             fscal            = _mm_and_ps(fscal,cutoff_mask);
472
473             /* Calculate temporary vectorial force */
474             tx               = _mm_mul_ps(fscal,dx13);
475             ty               = _mm_mul_ps(fscal,dy13);
476             tz               = _mm_mul_ps(fscal,dz13);
477
478             /* Update vectorial force */
479             fix1             = _mm_add_ps(fix1,tx);
480             fiy1             = _mm_add_ps(fiy1,ty);
481             fiz1             = _mm_add_ps(fiz1,tz);
482
483             fjx3             = _mm_add_ps(fjx3,tx);
484             fjy3             = _mm_add_ps(fjy3,ty);
485             fjz3             = _mm_add_ps(fjz3,tz);
486
487             }
488
489             /**************************
490              * CALCULATE INTERACTIONS *
491              **************************/
492
493             if (gmx_mm_any_lt(rsq21,rcutoff2))
494             {
495
496             /* REACTION-FIELD ELECTROSTATICS */
497             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
498             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
499
500             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
501
502             /* Update potential sum for this i atom from the interaction with this j atom. */
503             velec            = _mm_and_ps(velec,cutoff_mask);
504             velecsum         = _mm_add_ps(velecsum,velec);
505
506             fscal            = felec;
507
508             fscal            = _mm_and_ps(fscal,cutoff_mask);
509
510             /* Calculate temporary vectorial force */
511             tx               = _mm_mul_ps(fscal,dx21);
512             ty               = _mm_mul_ps(fscal,dy21);
513             tz               = _mm_mul_ps(fscal,dz21);
514
515             /* Update vectorial force */
516             fix2             = _mm_add_ps(fix2,tx);
517             fiy2             = _mm_add_ps(fiy2,ty);
518             fiz2             = _mm_add_ps(fiz2,tz);
519
520             fjx1             = _mm_add_ps(fjx1,tx);
521             fjy1             = _mm_add_ps(fjy1,ty);
522             fjz1             = _mm_add_ps(fjz1,tz);
523
524             }
525
526             /**************************
527              * CALCULATE INTERACTIONS *
528              **************************/
529
530             if (gmx_mm_any_lt(rsq22,rcutoff2))
531             {
532
533             /* REACTION-FIELD ELECTROSTATICS */
534             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
535             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
536
537             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
538
539             /* Update potential sum for this i atom from the interaction with this j atom. */
540             velec            = _mm_and_ps(velec,cutoff_mask);
541             velecsum         = _mm_add_ps(velecsum,velec);
542
543             fscal            = felec;
544
545             fscal            = _mm_and_ps(fscal,cutoff_mask);
546
547             /* Calculate temporary vectorial force */
548             tx               = _mm_mul_ps(fscal,dx22);
549             ty               = _mm_mul_ps(fscal,dy22);
550             tz               = _mm_mul_ps(fscal,dz22);
551
552             /* Update vectorial force */
553             fix2             = _mm_add_ps(fix2,tx);
554             fiy2             = _mm_add_ps(fiy2,ty);
555             fiz2             = _mm_add_ps(fiz2,tz);
556
557             fjx2             = _mm_add_ps(fjx2,tx);
558             fjy2             = _mm_add_ps(fjy2,ty);
559             fjz2             = _mm_add_ps(fjz2,tz);
560
561             }
562
563             /**************************
564              * CALCULATE INTERACTIONS *
565              **************************/
566
567             if (gmx_mm_any_lt(rsq23,rcutoff2))
568             {
569
570             /* REACTION-FIELD ELECTROSTATICS */
571             velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
572             felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
573
574             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
575
576             /* Update potential sum for this i atom from the interaction with this j atom. */
577             velec            = _mm_and_ps(velec,cutoff_mask);
578             velecsum         = _mm_add_ps(velecsum,velec);
579
580             fscal            = felec;
581
582             fscal            = _mm_and_ps(fscal,cutoff_mask);
583
584             /* Calculate temporary vectorial force */
585             tx               = _mm_mul_ps(fscal,dx23);
586             ty               = _mm_mul_ps(fscal,dy23);
587             tz               = _mm_mul_ps(fscal,dz23);
588
589             /* Update vectorial force */
590             fix2             = _mm_add_ps(fix2,tx);
591             fiy2             = _mm_add_ps(fiy2,ty);
592             fiz2             = _mm_add_ps(fiz2,tz);
593
594             fjx3             = _mm_add_ps(fjx3,tx);
595             fjy3             = _mm_add_ps(fjy3,ty);
596             fjz3             = _mm_add_ps(fjz3,tz);
597
598             }
599
600             /**************************
601              * CALCULATE INTERACTIONS *
602              **************************/
603
604             if (gmx_mm_any_lt(rsq31,rcutoff2))
605             {
606
607             /* REACTION-FIELD ELECTROSTATICS */
608             velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
609             felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
610
611             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
612
613             /* Update potential sum for this i atom from the interaction with this j atom. */
614             velec            = _mm_and_ps(velec,cutoff_mask);
615             velecsum         = _mm_add_ps(velecsum,velec);
616
617             fscal            = felec;
618
619             fscal            = _mm_and_ps(fscal,cutoff_mask);
620
621             /* Calculate temporary vectorial force */
622             tx               = _mm_mul_ps(fscal,dx31);
623             ty               = _mm_mul_ps(fscal,dy31);
624             tz               = _mm_mul_ps(fscal,dz31);
625
626             /* Update vectorial force */
627             fix3             = _mm_add_ps(fix3,tx);
628             fiy3             = _mm_add_ps(fiy3,ty);
629             fiz3             = _mm_add_ps(fiz3,tz);
630
631             fjx1             = _mm_add_ps(fjx1,tx);
632             fjy1             = _mm_add_ps(fjy1,ty);
633             fjz1             = _mm_add_ps(fjz1,tz);
634
635             }
636
637             /**************************
638              * CALCULATE INTERACTIONS *
639              **************************/
640
641             if (gmx_mm_any_lt(rsq32,rcutoff2))
642             {
643
644             /* REACTION-FIELD ELECTROSTATICS */
645             velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
646             felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
647
648             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
649
650             /* Update potential sum for this i atom from the interaction with this j atom. */
651             velec            = _mm_and_ps(velec,cutoff_mask);
652             velecsum         = _mm_add_ps(velecsum,velec);
653
654             fscal            = felec;
655
656             fscal            = _mm_and_ps(fscal,cutoff_mask);
657
658             /* Calculate temporary vectorial force */
659             tx               = _mm_mul_ps(fscal,dx32);
660             ty               = _mm_mul_ps(fscal,dy32);
661             tz               = _mm_mul_ps(fscal,dz32);
662
663             /* Update vectorial force */
664             fix3             = _mm_add_ps(fix3,tx);
665             fiy3             = _mm_add_ps(fiy3,ty);
666             fiz3             = _mm_add_ps(fiz3,tz);
667
668             fjx2             = _mm_add_ps(fjx2,tx);
669             fjy2             = _mm_add_ps(fjy2,ty);
670             fjz2             = _mm_add_ps(fjz2,tz);
671
672             }
673
674             /**************************
675              * CALCULATE INTERACTIONS *
676              **************************/
677
678             if (gmx_mm_any_lt(rsq33,rcutoff2))
679             {
680
681             /* REACTION-FIELD ELECTROSTATICS */
682             velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
683             felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
684
685             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
686
687             /* Update potential sum for this i atom from the interaction with this j atom. */
688             velec            = _mm_and_ps(velec,cutoff_mask);
689             velecsum         = _mm_add_ps(velecsum,velec);
690
691             fscal            = felec;
692
693             fscal            = _mm_and_ps(fscal,cutoff_mask);
694
695             /* Calculate temporary vectorial force */
696             tx               = _mm_mul_ps(fscal,dx33);
697             ty               = _mm_mul_ps(fscal,dy33);
698             tz               = _mm_mul_ps(fscal,dz33);
699
700             /* Update vectorial force */
701             fix3             = _mm_add_ps(fix3,tx);
702             fiy3             = _mm_add_ps(fiy3,ty);
703             fiz3             = _mm_add_ps(fiz3,tz);
704
705             fjx3             = _mm_add_ps(fjx3,tx);
706             fjy3             = _mm_add_ps(fjy3,ty);
707             fjz3             = _mm_add_ps(fjz3,tz);
708
709             }
710
711             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
712                                                    f+j_coord_offsetC,f+j_coord_offsetD,
713                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
714                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
715
716             /* Inner loop uses 386 flops */
717         }
718
719         if(jidx<j_index_end)
720         {
721
722             /* Get j neighbor index, and coordinate index */
723             jnrA             = jjnr[jidx];
724             jnrB             = jjnr[jidx+1];
725             jnrC             = jjnr[jidx+2];
726             jnrD             = jjnr[jidx+3];
727
728             /* Sign of each element will be negative for non-real atoms.
729              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
730              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
731              */
732             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
733             jnrA       = (jnrA>=0) ? jnrA : 0;
734             jnrB       = (jnrB>=0) ? jnrB : 0;
735             jnrC       = (jnrC>=0) ? jnrC : 0;
736             jnrD       = (jnrD>=0) ? jnrD : 0;
737
738             j_coord_offsetA  = DIM*jnrA;
739             j_coord_offsetB  = DIM*jnrB;
740             j_coord_offsetC  = DIM*jnrC;
741             j_coord_offsetD  = DIM*jnrD;
742
743             /* load j atom coordinates */
744             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
745                                               x+j_coord_offsetC,x+j_coord_offsetD,
746                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
747                                               &jy2,&jz2,&jx3,&jy3,&jz3);
748
749             /* Calculate displacement vector */
750             dx00             = _mm_sub_ps(ix0,jx0);
751             dy00             = _mm_sub_ps(iy0,jy0);
752             dz00             = _mm_sub_ps(iz0,jz0);
753             dx11             = _mm_sub_ps(ix1,jx1);
754             dy11             = _mm_sub_ps(iy1,jy1);
755             dz11             = _mm_sub_ps(iz1,jz1);
756             dx12             = _mm_sub_ps(ix1,jx2);
757             dy12             = _mm_sub_ps(iy1,jy2);
758             dz12             = _mm_sub_ps(iz1,jz2);
759             dx13             = _mm_sub_ps(ix1,jx3);
760             dy13             = _mm_sub_ps(iy1,jy3);
761             dz13             = _mm_sub_ps(iz1,jz3);
762             dx21             = _mm_sub_ps(ix2,jx1);
763             dy21             = _mm_sub_ps(iy2,jy1);
764             dz21             = _mm_sub_ps(iz2,jz1);
765             dx22             = _mm_sub_ps(ix2,jx2);
766             dy22             = _mm_sub_ps(iy2,jy2);
767             dz22             = _mm_sub_ps(iz2,jz2);
768             dx23             = _mm_sub_ps(ix2,jx3);
769             dy23             = _mm_sub_ps(iy2,jy3);
770             dz23             = _mm_sub_ps(iz2,jz3);
771             dx31             = _mm_sub_ps(ix3,jx1);
772             dy31             = _mm_sub_ps(iy3,jy1);
773             dz31             = _mm_sub_ps(iz3,jz1);
774             dx32             = _mm_sub_ps(ix3,jx2);
775             dy32             = _mm_sub_ps(iy3,jy2);
776             dz32             = _mm_sub_ps(iz3,jz2);
777             dx33             = _mm_sub_ps(ix3,jx3);
778             dy33             = _mm_sub_ps(iy3,jy3);
779             dz33             = _mm_sub_ps(iz3,jz3);
780
781             /* Calculate squared distance and things based on it */
782             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
783             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
784             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
785             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
786             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
787             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
788             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
789             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
790             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
791             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
792
793             rinv00           = gmx_mm_invsqrt_ps(rsq00);
794             rinv11           = gmx_mm_invsqrt_ps(rsq11);
795             rinv12           = gmx_mm_invsqrt_ps(rsq12);
796             rinv13           = gmx_mm_invsqrt_ps(rsq13);
797             rinv21           = gmx_mm_invsqrt_ps(rsq21);
798             rinv22           = gmx_mm_invsqrt_ps(rsq22);
799             rinv23           = gmx_mm_invsqrt_ps(rsq23);
800             rinv31           = gmx_mm_invsqrt_ps(rsq31);
801             rinv32           = gmx_mm_invsqrt_ps(rsq32);
802             rinv33           = gmx_mm_invsqrt_ps(rsq33);
803
804             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
805             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
806             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
807             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
808             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
809             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
810             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
811             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
812             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
813             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
814
815             fjx0             = _mm_setzero_ps();
816             fjy0             = _mm_setzero_ps();
817             fjz0             = _mm_setzero_ps();
818             fjx1             = _mm_setzero_ps();
819             fjy1             = _mm_setzero_ps();
820             fjz1             = _mm_setzero_ps();
821             fjx2             = _mm_setzero_ps();
822             fjy2             = _mm_setzero_ps();
823             fjz2             = _mm_setzero_ps();
824             fjx3             = _mm_setzero_ps();
825             fjy3             = _mm_setzero_ps();
826             fjz3             = _mm_setzero_ps();
827
828             /**************************
829              * CALCULATE INTERACTIONS *
830              **************************/
831
832             if (gmx_mm_any_lt(rsq00,rcutoff2))
833             {
834
835             r00              = _mm_mul_ps(rsq00,rinv00);
836             r00              = _mm_andnot_ps(dummy_mask,r00);
837
838             /* LENNARD-JONES DISPERSION/REPULSION */
839
840             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
841             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
842             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
843             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
844             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
845
846             d                = _mm_sub_ps(r00,rswitch);
847             d                = _mm_max_ps(d,_mm_setzero_ps());
848             d2               = _mm_mul_ps(d,d);
849             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
850
851             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
852
853             /* Evaluate switch function */
854             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
855             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
856             vvdw             = _mm_mul_ps(vvdw,sw);
857             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
858
859             /* Update potential sum for this i atom from the interaction with this j atom. */
860             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
861             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
862             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
863
864             fscal            = fvdw;
865
866             fscal            = _mm_and_ps(fscal,cutoff_mask);
867
868             fscal            = _mm_andnot_ps(dummy_mask,fscal);
869
870             /* Calculate temporary vectorial force */
871             tx               = _mm_mul_ps(fscal,dx00);
872             ty               = _mm_mul_ps(fscal,dy00);
873             tz               = _mm_mul_ps(fscal,dz00);
874
875             /* Update vectorial force */
876             fix0             = _mm_add_ps(fix0,tx);
877             fiy0             = _mm_add_ps(fiy0,ty);
878             fiz0             = _mm_add_ps(fiz0,tz);
879
880             fjx0             = _mm_add_ps(fjx0,tx);
881             fjy0             = _mm_add_ps(fjy0,ty);
882             fjz0             = _mm_add_ps(fjz0,tz);
883
884             }
885
886             /**************************
887              * CALCULATE INTERACTIONS *
888              **************************/
889
890             if (gmx_mm_any_lt(rsq11,rcutoff2))
891             {
892
893             /* REACTION-FIELD ELECTROSTATICS */
894             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
895             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
896
897             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
898
899             /* Update potential sum for this i atom from the interaction with this j atom. */
900             velec            = _mm_and_ps(velec,cutoff_mask);
901             velec            = _mm_andnot_ps(dummy_mask,velec);
902             velecsum         = _mm_add_ps(velecsum,velec);
903
904             fscal            = felec;
905
906             fscal            = _mm_and_ps(fscal,cutoff_mask);
907
908             fscal            = _mm_andnot_ps(dummy_mask,fscal);
909
910             /* Calculate temporary vectorial force */
911             tx               = _mm_mul_ps(fscal,dx11);
912             ty               = _mm_mul_ps(fscal,dy11);
913             tz               = _mm_mul_ps(fscal,dz11);
914
915             /* Update vectorial force */
916             fix1             = _mm_add_ps(fix1,tx);
917             fiy1             = _mm_add_ps(fiy1,ty);
918             fiz1             = _mm_add_ps(fiz1,tz);
919
920             fjx1             = _mm_add_ps(fjx1,tx);
921             fjy1             = _mm_add_ps(fjy1,ty);
922             fjz1             = _mm_add_ps(fjz1,tz);
923
924             }
925
926             /**************************
927              * CALCULATE INTERACTIONS *
928              **************************/
929
930             if (gmx_mm_any_lt(rsq12,rcutoff2))
931             {
932
933             /* REACTION-FIELD ELECTROSTATICS */
934             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
935             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
936
937             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
938
939             /* Update potential sum for this i atom from the interaction with this j atom. */
940             velec            = _mm_and_ps(velec,cutoff_mask);
941             velec            = _mm_andnot_ps(dummy_mask,velec);
942             velecsum         = _mm_add_ps(velecsum,velec);
943
944             fscal            = felec;
945
946             fscal            = _mm_and_ps(fscal,cutoff_mask);
947
948             fscal            = _mm_andnot_ps(dummy_mask,fscal);
949
950             /* Calculate temporary vectorial force */
951             tx               = _mm_mul_ps(fscal,dx12);
952             ty               = _mm_mul_ps(fscal,dy12);
953             tz               = _mm_mul_ps(fscal,dz12);
954
955             /* Update vectorial force */
956             fix1             = _mm_add_ps(fix1,tx);
957             fiy1             = _mm_add_ps(fiy1,ty);
958             fiz1             = _mm_add_ps(fiz1,tz);
959
960             fjx2             = _mm_add_ps(fjx2,tx);
961             fjy2             = _mm_add_ps(fjy2,ty);
962             fjz2             = _mm_add_ps(fjz2,tz);
963
964             }
965
966             /**************************
967              * CALCULATE INTERACTIONS *
968              **************************/
969
970             if (gmx_mm_any_lt(rsq13,rcutoff2))
971             {
972
973             /* REACTION-FIELD ELECTROSTATICS */
974             velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
975             felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
976
977             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
978
979             /* Update potential sum for this i atom from the interaction with this j atom. */
980             velec            = _mm_and_ps(velec,cutoff_mask);
981             velec            = _mm_andnot_ps(dummy_mask,velec);
982             velecsum         = _mm_add_ps(velecsum,velec);
983
984             fscal            = felec;
985
986             fscal            = _mm_and_ps(fscal,cutoff_mask);
987
988             fscal            = _mm_andnot_ps(dummy_mask,fscal);
989
990             /* Calculate temporary vectorial force */
991             tx               = _mm_mul_ps(fscal,dx13);
992             ty               = _mm_mul_ps(fscal,dy13);
993             tz               = _mm_mul_ps(fscal,dz13);
994
995             /* Update vectorial force */
996             fix1             = _mm_add_ps(fix1,tx);
997             fiy1             = _mm_add_ps(fiy1,ty);
998             fiz1             = _mm_add_ps(fiz1,tz);
999
1000             fjx3             = _mm_add_ps(fjx3,tx);
1001             fjy3             = _mm_add_ps(fjy3,ty);
1002             fjz3             = _mm_add_ps(fjz3,tz);
1003
1004             }
1005
1006             /**************************
1007              * CALCULATE INTERACTIONS *
1008              **************************/
1009
1010             if (gmx_mm_any_lt(rsq21,rcutoff2))
1011             {
1012
1013             /* REACTION-FIELD ELECTROSTATICS */
1014             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
1015             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1016
1017             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1018
1019             /* Update potential sum for this i atom from the interaction with this j atom. */
1020             velec            = _mm_and_ps(velec,cutoff_mask);
1021             velec            = _mm_andnot_ps(dummy_mask,velec);
1022             velecsum         = _mm_add_ps(velecsum,velec);
1023
1024             fscal            = felec;
1025
1026             fscal            = _mm_and_ps(fscal,cutoff_mask);
1027
1028             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1029
1030             /* Calculate temporary vectorial force */
1031             tx               = _mm_mul_ps(fscal,dx21);
1032             ty               = _mm_mul_ps(fscal,dy21);
1033             tz               = _mm_mul_ps(fscal,dz21);
1034
1035             /* Update vectorial force */
1036             fix2             = _mm_add_ps(fix2,tx);
1037             fiy2             = _mm_add_ps(fiy2,ty);
1038             fiz2             = _mm_add_ps(fiz2,tz);
1039
1040             fjx1             = _mm_add_ps(fjx1,tx);
1041             fjy1             = _mm_add_ps(fjy1,ty);
1042             fjz1             = _mm_add_ps(fjz1,tz);
1043
1044             }
1045
1046             /**************************
1047              * CALCULATE INTERACTIONS *
1048              **************************/
1049
1050             if (gmx_mm_any_lt(rsq22,rcutoff2))
1051             {
1052
1053             /* REACTION-FIELD ELECTROSTATICS */
1054             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
1055             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1056
1057             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1058
1059             /* Update potential sum for this i atom from the interaction with this j atom. */
1060             velec            = _mm_and_ps(velec,cutoff_mask);
1061             velec            = _mm_andnot_ps(dummy_mask,velec);
1062             velecsum         = _mm_add_ps(velecsum,velec);
1063
1064             fscal            = felec;
1065
1066             fscal            = _mm_and_ps(fscal,cutoff_mask);
1067
1068             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1069
1070             /* Calculate temporary vectorial force */
1071             tx               = _mm_mul_ps(fscal,dx22);
1072             ty               = _mm_mul_ps(fscal,dy22);
1073             tz               = _mm_mul_ps(fscal,dz22);
1074
1075             /* Update vectorial force */
1076             fix2             = _mm_add_ps(fix2,tx);
1077             fiy2             = _mm_add_ps(fiy2,ty);
1078             fiz2             = _mm_add_ps(fiz2,tz);
1079
1080             fjx2             = _mm_add_ps(fjx2,tx);
1081             fjy2             = _mm_add_ps(fjy2,ty);
1082             fjz2             = _mm_add_ps(fjz2,tz);
1083
1084             }
1085
1086             /**************************
1087              * CALCULATE INTERACTIONS *
1088              **************************/
1089
1090             if (gmx_mm_any_lt(rsq23,rcutoff2))
1091             {
1092
1093             /* REACTION-FIELD ELECTROSTATICS */
1094             velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
1095             felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
1096
1097             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
1098
1099             /* Update potential sum for this i atom from the interaction with this j atom. */
1100             velec            = _mm_and_ps(velec,cutoff_mask);
1101             velec            = _mm_andnot_ps(dummy_mask,velec);
1102             velecsum         = _mm_add_ps(velecsum,velec);
1103
1104             fscal            = felec;
1105
1106             fscal            = _mm_and_ps(fscal,cutoff_mask);
1107
1108             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1109
1110             /* Calculate temporary vectorial force */
1111             tx               = _mm_mul_ps(fscal,dx23);
1112             ty               = _mm_mul_ps(fscal,dy23);
1113             tz               = _mm_mul_ps(fscal,dz23);
1114
1115             /* Update vectorial force */
1116             fix2             = _mm_add_ps(fix2,tx);
1117             fiy2             = _mm_add_ps(fiy2,ty);
1118             fiz2             = _mm_add_ps(fiz2,tz);
1119
1120             fjx3             = _mm_add_ps(fjx3,tx);
1121             fjy3             = _mm_add_ps(fjy3,ty);
1122             fjz3             = _mm_add_ps(fjz3,tz);
1123
1124             }
1125
1126             /**************************
1127              * CALCULATE INTERACTIONS *
1128              **************************/
1129
1130             if (gmx_mm_any_lt(rsq31,rcutoff2))
1131             {
1132
1133             /* REACTION-FIELD ELECTROSTATICS */
1134             velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
1135             felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
1136
1137             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
1138
1139             /* Update potential sum for this i atom from the interaction with this j atom. */
1140             velec            = _mm_and_ps(velec,cutoff_mask);
1141             velec            = _mm_andnot_ps(dummy_mask,velec);
1142             velecsum         = _mm_add_ps(velecsum,velec);
1143
1144             fscal            = felec;
1145
1146             fscal            = _mm_and_ps(fscal,cutoff_mask);
1147
1148             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1149
1150             /* Calculate temporary vectorial force */
1151             tx               = _mm_mul_ps(fscal,dx31);
1152             ty               = _mm_mul_ps(fscal,dy31);
1153             tz               = _mm_mul_ps(fscal,dz31);
1154
1155             /* Update vectorial force */
1156             fix3             = _mm_add_ps(fix3,tx);
1157             fiy3             = _mm_add_ps(fiy3,ty);
1158             fiz3             = _mm_add_ps(fiz3,tz);
1159
1160             fjx1             = _mm_add_ps(fjx1,tx);
1161             fjy1             = _mm_add_ps(fjy1,ty);
1162             fjz1             = _mm_add_ps(fjz1,tz);
1163
1164             }
1165
1166             /**************************
1167              * CALCULATE INTERACTIONS *
1168              **************************/
1169
1170             if (gmx_mm_any_lt(rsq32,rcutoff2))
1171             {
1172
1173             /* REACTION-FIELD ELECTROSTATICS */
1174             velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
1175             felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
1176
1177             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
1178
1179             /* Update potential sum for this i atom from the interaction with this j atom. */
1180             velec            = _mm_and_ps(velec,cutoff_mask);
1181             velec            = _mm_andnot_ps(dummy_mask,velec);
1182             velecsum         = _mm_add_ps(velecsum,velec);
1183
1184             fscal            = felec;
1185
1186             fscal            = _mm_and_ps(fscal,cutoff_mask);
1187
1188             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1189
1190             /* Calculate temporary vectorial force */
1191             tx               = _mm_mul_ps(fscal,dx32);
1192             ty               = _mm_mul_ps(fscal,dy32);
1193             tz               = _mm_mul_ps(fscal,dz32);
1194
1195             /* Update vectorial force */
1196             fix3             = _mm_add_ps(fix3,tx);
1197             fiy3             = _mm_add_ps(fiy3,ty);
1198             fiz3             = _mm_add_ps(fiz3,tz);
1199
1200             fjx2             = _mm_add_ps(fjx2,tx);
1201             fjy2             = _mm_add_ps(fjy2,ty);
1202             fjz2             = _mm_add_ps(fjz2,tz);
1203
1204             }
1205
1206             /**************************
1207              * CALCULATE INTERACTIONS *
1208              **************************/
1209
1210             if (gmx_mm_any_lt(rsq33,rcutoff2))
1211             {
1212
1213             /* REACTION-FIELD ELECTROSTATICS */
1214             velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
1215             felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
1216
1217             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
1218
1219             /* Update potential sum for this i atom from the interaction with this j atom. */
1220             velec            = _mm_and_ps(velec,cutoff_mask);
1221             velec            = _mm_andnot_ps(dummy_mask,velec);
1222             velecsum         = _mm_add_ps(velecsum,velec);
1223
1224             fscal            = felec;
1225
1226             fscal            = _mm_and_ps(fscal,cutoff_mask);
1227
1228             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1229
1230             /* Calculate temporary vectorial force */
1231             tx               = _mm_mul_ps(fscal,dx33);
1232             ty               = _mm_mul_ps(fscal,dy33);
1233             tz               = _mm_mul_ps(fscal,dz33);
1234
1235             /* Update vectorial force */
1236             fix3             = _mm_add_ps(fix3,tx);
1237             fiy3             = _mm_add_ps(fiy3,ty);
1238             fiz3             = _mm_add_ps(fiz3,tz);
1239
1240             fjx3             = _mm_add_ps(fjx3,tx);
1241             fjy3             = _mm_add_ps(fjy3,ty);
1242             fjz3             = _mm_add_ps(fjz3,tz);
1243
1244             }
1245
1246             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1247                                                    f+j_coord_offsetC,f+j_coord_offsetD,
1248                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1249                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1250
1251             /* Inner loop uses 387 flops */
1252         }
1253
1254         /* End of innermost loop */
1255
1256         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1257                                               f+i_coord_offset,fshift+i_shift_offset);
1258
1259         ggid                        = gid[iidx];
1260         /* Update potential energies */
1261         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1262         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1263
1264         /* Increment number of inner iterations */
1265         inneriter                  += j_index_end - j_index_start;
1266
1267         /* Outer loop uses 38 flops */
1268     }
1269
1270     /* Increment number of outer iterations */
1271     outeriter        += nri;
1272
1273     /* Update outer/inner flops */
1274
1275     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*387);
1276 }
1277 /*
1278  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse2_single
1279  * Electrostatics interaction: ReactionField
1280  * VdW interaction:            LennardJones
1281  * Geometry:                   Water4-Water4
1282  * Calculate force/pot:        Force
1283  */
1284 void
1285 nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse2_single
1286                     (t_nblist * gmx_restrict                nlist,
1287                      rvec * gmx_restrict                    xx,
1288                      rvec * gmx_restrict                    ff,
1289                      t_forcerec * gmx_restrict              fr,
1290                      t_mdatoms * gmx_restrict               mdatoms,
1291                      nb_kernel_data_t * gmx_restrict        kernel_data,
1292                      t_nrnb * gmx_restrict                  nrnb)
1293 {
1294     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1295      * just 0 for non-waters.
1296      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1297      * jnr indices corresponding to data put in the four positions in the SIMD register.
1298      */
1299     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1300     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1301     int              jnrA,jnrB,jnrC,jnrD;
1302     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1303     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1304     real             shX,shY,shZ,rcutoff_scalar;
1305     real             *shiftvec,*fshift,*x,*f;
1306     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1307     int              vdwioffset0;
1308     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1309     int              vdwioffset1;
1310     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1311     int              vdwioffset2;
1312     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1313     int              vdwioffset3;
1314     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1315     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1316     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1317     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1318     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1319     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1320     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1321     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1322     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1323     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1324     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1325     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1326     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1327     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1328     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1329     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1330     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1331     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1332     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1333     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1334     real             *charge;
1335     int              nvdwtype;
1336     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1337     int              *vdwtype;
1338     real             *vdwparam;
1339     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1340     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1341     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1342     real             rswitch_scalar,d_scalar;
1343     __m128           dummy_mask,cutoff_mask;
1344     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1345     __m128           one     = _mm_set1_ps(1.0);
1346     __m128           two     = _mm_set1_ps(2.0);
1347     x                = xx[0];
1348     f                = ff[0];
1349
1350     nri              = nlist->nri;
1351     iinr             = nlist->iinr;
1352     jindex           = nlist->jindex;
1353     jjnr             = nlist->jjnr;
1354     shiftidx         = nlist->shift;
1355     gid              = nlist->gid;
1356     shiftvec         = fr->shift_vec[0];
1357     fshift           = fr->fshift[0];
1358     facel            = _mm_set1_ps(fr->epsfac);
1359     charge           = mdatoms->chargeA;
1360     krf              = _mm_set1_ps(fr->ic->k_rf);
1361     krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
1362     crf              = _mm_set1_ps(fr->ic->c_rf);
1363     nvdwtype         = fr->ntype;
1364     vdwparam         = fr->nbfp;
1365     vdwtype          = mdatoms->typeA;
1366
1367     /* Setup water-specific parameters */
1368     inr              = nlist->iinr[0];
1369     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1370     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1371     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1372     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1373
1374     jq1              = _mm_set1_ps(charge[inr+1]);
1375     jq2              = _mm_set1_ps(charge[inr+2]);
1376     jq3              = _mm_set1_ps(charge[inr+3]);
1377     vdwjidx0A        = 2*vdwtype[inr+0];
1378     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1379     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1380     qq11             = _mm_mul_ps(iq1,jq1);
1381     qq12             = _mm_mul_ps(iq1,jq2);
1382     qq13             = _mm_mul_ps(iq1,jq3);
1383     qq21             = _mm_mul_ps(iq2,jq1);
1384     qq22             = _mm_mul_ps(iq2,jq2);
1385     qq23             = _mm_mul_ps(iq2,jq3);
1386     qq31             = _mm_mul_ps(iq3,jq1);
1387     qq32             = _mm_mul_ps(iq3,jq2);
1388     qq33             = _mm_mul_ps(iq3,jq3);
1389
1390     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1391     rcutoff_scalar   = fr->rcoulomb;
1392     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1393     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1394
1395     rswitch_scalar   = fr->rvdw_switch;
1396     rswitch          = _mm_set1_ps(rswitch_scalar);
1397     /* Setup switch parameters */
1398     d_scalar         = rcutoff_scalar-rswitch_scalar;
1399     d                = _mm_set1_ps(d_scalar);
1400     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1401     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1402     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1403     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1404     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1405     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1406
1407     /* Avoid stupid compiler warnings */
1408     jnrA = jnrB = jnrC = jnrD = 0;
1409     j_coord_offsetA = 0;
1410     j_coord_offsetB = 0;
1411     j_coord_offsetC = 0;
1412     j_coord_offsetD = 0;
1413
1414     outeriter        = 0;
1415     inneriter        = 0;
1416
1417     /* Start outer loop over neighborlists */
1418     for(iidx=0; iidx<nri; iidx++)
1419     {
1420         /* Load shift vector for this list */
1421         i_shift_offset   = DIM*shiftidx[iidx];
1422         shX              = shiftvec[i_shift_offset+XX];
1423         shY              = shiftvec[i_shift_offset+YY];
1424         shZ              = shiftvec[i_shift_offset+ZZ];
1425
1426         /* Load limits for loop over neighbors */
1427         j_index_start    = jindex[iidx];
1428         j_index_end      = jindex[iidx+1];
1429
1430         /* Get outer coordinate index */
1431         inr              = iinr[iidx];
1432         i_coord_offset   = DIM*inr;
1433
1434         /* Load i particle coords and add shift vector */
1435         ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
1436         iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
1437         iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
1438         ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
1439         iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
1440         iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
1441         ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
1442         iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
1443         iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
1444         ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
1445         iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
1446         iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
1447
1448         fix0             = _mm_setzero_ps();
1449         fiy0             = _mm_setzero_ps();
1450         fiz0             = _mm_setzero_ps();
1451         fix1             = _mm_setzero_ps();
1452         fiy1             = _mm_setzero_ps();
1453         fiz1             = _mm_setzero_ps();
1454         fix2             = _mm_setzero_ps();
1455         fiy2             = _mm_setzero_ps();
1456         fiz2             = _mm_setzero_ps();
1457         fix3             = _mm_setzero_ps();
1458         fiy3             = _mm_setzero_ps();
1459         fiz3             = _mm_setzero_ps();
1460
1461         /* Start inner kernel loop */
1462         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1463         {
1464
1465             /* Get j neighbor index, and coordinate index */
1466             jnrA             = jjnr[jidx];
1467             jnrB             = jjnr[jidx+1];
1468             jnrC             = jjnr[jidx+2];
1469             jnrD             = jjnr[jidx+3];
1470
1471             j_coord_offsetA  = DIM*jnrA;
1472             j_coord_offsetB  = DIM*jnrB;
1473             j_coord_offsetC  = DIM*jnrC;
1474             j_coord_offsetD  = DIM*jnrD;
1475
1476             /* load j atom coordinates */
1477             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1478                                               x+j_coord_offsetC,x+j_coord_offsetD,
1479                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1480                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1481
1482             /* Calculate displacement vector */
1483             dx00             = _mm_sub_ps(ix0,jx0);
1484             dy00             = _mm_sub_ps(iy0,jy0);
1485             dz00             = _mm_sub_ps(iz0,jz0);
1486             dx11             = _mm_sub_ps(ix1,jx1);
1487             dy11             = _mm_sub_ps(iy1,jy1);
1488             dz11             = _mm_sub_ps(iz1,jz1);
1489             dx12             = _mm_sub_ps(ix1,jx2);
1490             dy12             = _mm_sub_ps(iy1,jy2);
1491             dz12             = _mm_sub_ps(iz1,jz2);
1492             dx13             = _mm_sub_ps(ix1,jx3);
1493             dy13             = _mm_sub_ps(iy1,jy3);
1494             dz13             = _mm_sub_ps(iz1,jz3);
1495             dx21             = _mm_sub_ps(ix2,jx1);
1496             dy21             = _mm_sub_ps(iy2,jy1);
1497             dz21             = _mm_sub_ps(iz2,jz1);
1498             dx22             = _mm_sub_ps(ix2,jx2);
1499             dy22             = _mm_sub_ps(iy2,jy2);
1500             dz22             = _mm_sub_ps(iz2,jz2);
1501             dx23             = _mm_sub_ps(ix2,jx3);
1502             dy23             = _mm_sub_ps(iy2,jy3);
1503             dz23             = _mm_sub_ps(iz2,jz3);
1504             dx31             = _mm_sub_ps(ix3,jx1);
1505             dy31             = _mm_sub_ps(iy3,jy1);
1506             dz31             = _mm_sub_ps(iz3,jz1);
1507             dx32             = _mm_sub_ps(ix3,jx2);
1508             dy32             = _mm_sub_ps(iy3,jy2);
1509             dz32             = _mm_sub_ps(iz3,jz2);
1510             dx33             = _mm_sub_ps(ix3,jx3);
1511             dy33             = _mm_sub_ps(iy3,jy3);
1512             dz33             = _mm_sub_ps(iz3,jz3);
1513
1514             /* Calculate squared distance and things based on it */
1515             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1516             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1517             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1518             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1519             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1520             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1521             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1522             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1523             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1524             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1525
1526             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1527             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1528             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1529             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1530             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1531             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1532             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1533             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1534             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1535             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1536
1537             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1538             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1539             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1540             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1541             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1542             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1543             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1544             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1545             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1546             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1547
1548             fjx0             = _mm_setzero_ps();
1549             fjy0             = _mm_setzero_ps();
1550             fjz0             = _mm_setzero_ps();
1551             fjx1             = _mm_setzero_ps();
1552             fjy1             = _mm_setzero_ps();
1553             fjz1             = _mm_setzero_ps();
1554             fjx2             = _mm_setzero_ps();
1555             fjy2             = _mm_setzero_ps();
1556             fjz2             = _mm_setzero_ps();
1557             fjx3             = _mm_setzero_ps();
1558             fjy3             = _mm_setzero_ps();
1559             fjz3             = _mm_setzero_ps();
1560
1561             /**************************
1562              * CALCULATE INTERACTIONS *
1563              **************************/
1564
1565             if (gmx_mm_any_lt(rsq00,rcutoff2))
1566             {
1567
1568             r00              = _mm_mul_ps(rsq00,rinv00);
1569
1570             /* LENNARD-JONES DISPERSION/REPULSION */
1571
1572             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1573             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
1574             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1575             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1576             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1577
1578             d                = _mm_sub_ps(r00,rswitch);
1579             d                = _mm_max_ps(d,_mm_setzero_ps());
1580             d2               = _mm_mul_ps(d,d);
1581             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1582
1583             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1584
1585             /* Evaluate switch function */
1586             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1587             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1588             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1589
1590             fscal            = fvdw;
1591
1592             fscal            = _mm_and_ps(fscal,cutoff_mask);
1593
1594             /* Calculate temporary vectorial force */
1595             tx               = _mm_mul_ps(fscal,dx00);
1596             ty               = _mm_mul_ps(fscal,dy00);
1597             tz               = _mm_mul_ps(fscal,dz00);
1598
1599             /* Update vectorial force */
1600             fix0             = _mm_add_ps(fix0,tx);
1601             fiy0             = _mm_add_ps(fiy0,ty);
1602             fiz0             = _mm_add_ps(fiz0,tz);
1603
1604             fjx0             = _mm_add_ps(fjx0,tx);
1605             fjy0             = _mm_add_ps(fjy0,ty);
1606             fjz0             = _mm_add_ps(fjz0,tz);
1607
1608             }
1609
1610             /**************************
1611              * CALCULATE INTERACTIONS *
1612              **************************/
1613
1614             if (gmx_mm_any_lt(rsq11,rcutoff2))
1615             {
1616
1617             /* REACTION-FIELD ELECTROSTATICS */
1618             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1619
1620             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1621
1622             fscal            = felec;
1623
1624             fscal            = _mm_and_ps(fscal,cutoff_mask);
1625
1626             /* Calculate temporary vectorial force */
1627             tx               = _mm_mul_ps(fscal,dx11);
1628             ty               = _mm_mul_ps(fscal,dy11);
1629             tz               = _mm_mul_ps(fscal,dz11);
1630
1631             /* Update vectorial force */
1632             fix1             = _mm_add_ps(fix1,tx);
1633             fiy1             = _mm_add_ps(fiy1,ty);
1634             fiz1             = _mm_add_ps(fiz1,tz);
1635
1636             fjx1             = _mm_add_ps(fjx1,tx);
1637             fjy1             = _mm_add_ps(fjy1,ty);
1638             fjz1             = _mm_add_ps(fjz1,tz);
1639
1640             }
1641
1642             /**************************
1643              * CALCULATE INTERACTIONS *
1644              **************************/
1645
1646             if (gmx_mm_any_lt(rsq12,rcutoff2))
1647             {
1648
1649             /* REACTION-FIELD ELECTROSTATICS */
1650             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1651
1652             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1653
1654             fscal            = felec;
1655
1656             fscal            = _mm_and_ps(fscal,cutoff_mask);
1657
1658             /* Calculate temporary vectorial force */
1659             tx               = _mm_mul_ps(fscal,dx12);
1660             ty               = _mm_mul_ps(fscal,dy12);
1661             tz               = _mm_mul_ps(fscal,dz12);
1662
1663             /* Update vectorial force */
1664             fix1             = _mm_add_ps(fix1,tx);
1665             fiy1             = _mm_add_ps(fiy1,ty);
1666             fiz1             = _mm_add_ps(fiz1,tz);
1667
1668             fjx2             = _mm_add_ps(fjx2,tx);
1669             fjy2             = _mm_add_ps(fjy2,ty);
1670             fjz2             = _mm_add_ps(fjz2,tz);
1671
1672             }
1673
1674             /**************************
1675              * CALCULATE INTERACTIONS *
1676              **************************/
1677
1678             if (gmx_mm_any_lt(rsq13,rcutoff2))
1679             {
1680
1681             /* REACTION-FIELD ELECTROSTATICS */
1682             felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
1683
1684             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
1685
1686             fscal            = felec;
1687
1688             fscal            = _mm_and_ps(fscal,cutoff_mask);
1689
1690             /* Calculate temporary vectorial force */
1691             tx               = _mm_mul_ps(fscal,dx13);
1692             ty               = _mm_mul_ps(fscal,dy13);
1693             tz               = _mm_mul_ps(fscal,dz13);
1694
1695             /* Update vectorial force */
1696             fix1             = _mm_add_ps(fix1,tx);
1697             fiy1             = _mm_add_ps(fiy1,ty);
1698             fiz1             = _mm_add_ps(fiz1,tz);
1699
1700             fjx3             = _mm_add_ps(fjx3,tx);
1701             fjy3             = _mm_add_ps(fjy3,ty);
1702             fjz3             = _mm_add_ps(fjz3,tz);
1703
1704             }
1705
1706             /**************************
1707              * CALCULATE INTERACTIONS *
1708              **************************/
1709
1710             if (gmx_mm_any_lt(rsq21,rcutoff2))
1711             {
1712
1713             /* REACTION-FIELD ELECTROSTATICS */
1714             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1715
1716             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1717
1718             fscal            = felec;
1719
1720             fscal            = _mm_and_ps(fscal,cutoff_mask);
1721
1722             /* Calculate temporary vectorial force */
1723             tx               = _mm_mul_ps(fscal,dx21);
1724             ty               = _mm_mul_ps(fscal,dy21);
1725             tz               = _mm_mul_ps(fscal,dz21);
1726
1727             /* Update vectorial force */
1728             fix2             = _mm_add_ps(fix2,tx);
1729             fiy2             = _mm_add_ps(fiy2,ty);
1730             fiz2             = _mm_add_ps(fiz2,tz);
1731
1732             fjx1             = _mm_add_ps(fjx1,tx);
1733             fjy1             = _mm_add_ps(fjy1,ty);
1734             fjz1             = _mm_add_ps(fjz1,tz);
1735
1736             }
1737
1738             /**************************
1739              * CALCULATE INTERACTIONS *
1740              **************************/
1741
1742             if (gmx_mm_any_lt(rsq22,rcutoff2))
1743             {
1744
1745             /* REACTION-FIELD ELECTROSTATICS */
1746             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1747
1748             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1749
1750             fscal            = felec;
1751
1752             fscal            = _mm_and_ps(fscal,cutoff_mask);
1753
1754             /* Calculate temporary vectorial force */
1755             tx               = _mm_mul_ps(fscal,dx22);
1756             ty               = _mm_mul_ps(fscal,dy22);
1757             tz               = _mm_mul_ps(fscal,dz22);
1758
1759             /* Update vectorial force */
1760             fix2             = _mm_add_ps(fix2,tx);
1761             fiy2             = _mm_add_ps(fiy2,ty);
1762             fiz2             = _mm_add_ps(fiz2,tz);
1763
1764             fjx2             = _mm_add_ps(fjx2,tx);
1765             fjy2             = _mm_add_ps(fjy2,ty);
1766             fjz2             = _mm_add_ps(fjz2,tz);
1767
1768             }
1769
1770             /**************************
1771              * CALCULATE INTERACTIONS *
1772              **************************/
1773
1774             if (gmx_mm_any_lt(rsq23,rcutoff2))
1775             {
1776
1777             /* REACTION-FIELD ELECTROSTATICS */
1778             felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
1779
1780             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
1781
1782             fscal            = felec;
1783
1784             fscal            = _mm_and_ps(fscal,cutoff_mask);
1785
1786             /* Calculate temporary vectorial force */
1787             tx               = _mm_mul_ps(fscal,dx23);
1788             ty               = _mm_mul_ps(fscal,dy23);
1789             tz               = _mm_mul_ps(fscal,dz23);
1790
1791             /* Update vectorial force */
1792             fix2             = _mm_add_ps(fix2,tx);
1793             fiy2             = _mm_add_ps(fiy2,ty);
1794             fiz2             = _mm_add_ps(fiz2,tz);
1795
1796             fjx3             = _mm_add_ps(fjx3,tx);
1797             fjy3             = _mm_add_ps(fjy3,ty);
1798             fjz3             = _mm_add_ps(fjz3,tz);
1799
1800             }
1801
1802             /**************************
1803              * CALCULATE INTERACTIONS *
1804              **************************/
1805
1806             if (gmx_mm_any_lt(rsq31,rcutoff2))
1807             {
1808
1809             /* REACTION-FIELD ELECTROSTATICS */
1810             felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
1811
1812             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
1813
1814             fscal            = felec;
1815
1816             fscal            = _mm_and_ps(fscal,cutoff_mask);
1817
1818             /* Calculate temporary vectorial force */
1819             tx               = _mm_mul_ps(fscal,dx31);
1820             ty               = _mm_mul_ps(fscal,dy31);
1821             tz               = _mm_mul_ps(fscal,dz31);
1822
1823             /* Update vectorial force */
1824             fix3             = _mm_add_ps(fix3,tx);
1825             fiy3             = _mm_add_ps(fiy3,ty);
1826             fiz3             = _mm_add_ps(fiz3,tz);
1827
1828             fjx1             = _mm_add_ps(fjx1,tx);
1829             fjy1             = _mm_add_ps(fjy1,ty);
1830             fjz1             = _mm_add_ps(fjz1,tz);
1831
1832             }
1833
1834             /**************************
1835              * CALCULATE INTERACTIONS *
1836              **************************/
1837
1838             if (gmx_mm_any_lt(rsq32,rcutoff2))
1839             {
1840
1841             /* REACTION-FIELD ELECTROSTATICS */
1842             felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
1843
1844             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
1845
1846             fscal            = felec;
1847
1848             fscal            = _mm_and_ps(fscal,cutoff_mask);
1849
1850             /* Calculate temporary vectorial force */
1851             tx               = _mm_mul_ps(fscal,dx32);
1852             ty               = _mm_mul_ps(fscal,dy32);
1853             tz               = _mm_mul_ps(fscal,dz32);
1854
1855             /* Update vectorial force */
1856             fix3             = _mm_add_ps(fix3,tx);
1857             fiy3             = _mm_add_ps(fiy3,ty);
1858             fiz3             = _mm_add_ps(fiz3,tz);
1859
1860             fjx2             = _mm_add_ps(fjx2,tx);
1861             fjy2             = _mm_add_ps(fjy2,ty);
1862             fjz2             = _mm_add_ps(fjz2,tz);
1863
1864             }
1865
1866             /**************************
1867              * CALCULATE INTERACTIONS *
1868              **************************/
1869
1870             if (gmx_mm_any_lt(rsq33,rcutoff2))
1871             {
1872
1873             /* REACTION-FIELD ELECTROSTATICS */
1874             felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
1875
1876             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
1877
1878             fscal            = felec;
1879
1880             fscal            = _mm_and_ps(fscal,cutoff_mask);
1881
1882             /* Calculate temporary vectorial force */
1883             tx               = _mm_mul_ps(fscal,dx33);
1884             ty               = _mm_mul_ps(fscal,dy33);
1885             tz               = _mm_mul_ps(fscal,dz33);
1886
1887             /* Update vectorial force */
1888             fix3             = _mm_add_ps(fix3,tx);
1889             fiy3             = _mm_add_ps(fiy3,ty);
1890             fiz3             = _mm_add_ps(fiz3,tz);
1891
1892             fjx3             = _mm_add_ps(fjx3,tx);
1893             fjy3             = _mm_add_ps(fjy3,ty);
1894             fjz3             = _mm_add_ps(fjz3,tz);
1895
1896             }
1897
1898             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1899                                                    f+j_coord_offsetC,f+j_coord_offsetD,
1900                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1901                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1902
1903             /* Inner loop uses 329 flops */
1904         }
1905
1906         if(jidx<j_index_end)
1907         {
1908
1909             /* Get j neighbor index, and coordinate index */
1910             jnrA             = jjnr[jidx];
1911             jnrB             = jjnr[jidx+1];
1912             jnrC             = jjnr[jidx+2];
1913             jnrD             = jjnr[jidx+3];
1914
1915             /* Sign of each element will be negative for non-real atoms.
1916              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1917              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1918              */
1919             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1920             jnrA       = (jnrA>=0) ? jnrA : 0;
1921             jnrB       = (jnrB>=0) ? jnrB : 0;
1922             jnrC       = (jnrC>=0) ? jnrC : 0;
1923             jnrD       = (jnrD>=0) ? jnrD : 0;
1924
1925             j_coord_offsetA  = DIM*jnrA;
1926             j_coord_offsetB  = DIM*jnrB;
1927             j_coord_offsetC  = DIM*jnrC;
1928             j_coord_offsetD  = DIM*jnrD;
1929
1930             /* load j atom coordinates */
1931             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1932                                               x+j_coord_offsetC,x+j_coord_offsetD,
1933                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1934                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1935
1936             /* Calculate displacement vector */
1937             dx00             = _mm_sub_ps(ix0,jx0);
1938             dy00             = _mm_sub_ps(iy0,jy0);
1939             dz00             = _mm_sub_ps(iz0,jz0);
1940             dx11             = _mm_sub_ps(ix1,jx1);
1941             dy11             = _mm_sub_ps(iy1,jy1);
1942             dz11             = _mm_sub_ps(iz1,jz1);
1943             dx12             = _mm_sub_ps(ix1,jx2);
1944             dy12             = _mm_sub_ps(iy1,jy2);
1945             dz12             = _mm_sub_ps(iz1,jz2);
1946             dx13             = _mm_sub_ps(ix1,jx3);
1947             dy13             = _mm_sub_ps(iy1,jy3);
1948             dz13             = _mm_sub_ps(iz1,jz3);
1949             dx21             = _mm_sub_ps(ix2,jx1);
1950             dy21             = _mm_sub_ps(iy2,jy1);
1951             dz21             = _mm_sub_ps(iz2,jz1);
1952             dx22             = _mm_sub_ps(ix2,jx2);
1953             dy22             = _mm_sub_ps(iy2,jy2);
1954             dz22             = _mm_sub_ps(iz2,jz2);
1955             dx23             = _mm_sub_ps(ix2,jx3);
1956             dy23             = _mm_sub_ps(iy2,jy3);
1957             dz23             = _mm_sub_ps(iz2,jz3);
1958             dx31             = _mm_sub_ps(ix3,jx1);
1959             dy31             = _mm_sub_ps(iy3,jy1);
1960             dz31             = _mm_sub_ps(iz3,jz1);
1961             dx32             = _mm_sub_ps(ix3,jx2);
1962             dy32             = _mm_sub_ps(iy3,jy2);
1963             dz32             = _mm_sub_ps(iz3,jz2);
1964             dx33             = _mm_sub_ps(ix3,jx3);
1965             dy33             = _mm_sub_ps(iy3,jy3);
1966             dz33             = _mm_sub_ps(iz3,jz3);
1967
1968             /* Calculate squared distance and things based on it */
1969             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1970             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1971             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1972             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1973             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1974             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1975             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1976             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1977             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1978             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1979
1980             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1981             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1982             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1983             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1984             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1985             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1986             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1987             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1988             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1989             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1990
1991             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1992             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1993             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1994             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1995             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1996             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1997             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1998             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1999             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
2000             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
2001
2002             fjx0             = _mm_setzero_ps();
2003             fjy0             = _mm_setzero_ps();
2004             fjz0             = _mm_setzero_ps();
2005             fjx1             = _mm_setzero_ps();
2006             fjy1             = _mm_setzero_ps();
2007             fjz1             = _mm_setzero_ps();
2008             fjx2             = _mm_setzero_ps();
2009             fjy2             = _mm_setzero_ps();
2010             fjz2             = _mm_setzero_ps();
2011             fjx3             = _mm_setzero_ps();
2012             fjy3             = _mm_setzero_ps();
2013             fjz3             = _mm_setzero_ps();
2014
2015             /**************************
2016              * CALCULATE INTERACTIONS *
2017              **************************/
2018
2019             if (gmx_mm_any_lt(rsq00,rcutoff2))
2020             {
2021
2022             r00              = _mm_mul_ps(rsq00,rinv00);
2023             r00              = _mm_andnot_ps(dummy_mask,r00);
2024
2025             /* LENNARD-JONES DISPERSION/REPULSION */
2026
2027             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2028             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
2029             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2030             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
2031             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2032
2033             d                = _mm_sub_ps(r00,rswitch);
2034             d                = _mm_max_ps(d,_mm_setzero_ps());
2035             d2               = _mm_mul_ps(d,d);
2036             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2037
2038             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2039
2040             /* Evaluate switch function */
2041             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2042             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2043             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
2044
2045             fscal            = fvdw;
2046
2047             fscal            = _mm_and_ps(fscal,cutoff_mask);
2048
2049             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2050
2051             /* Calculate temporary vectorial force */
2052             tx               = _mm_mul_ps(fscal,dx00);
2053             ty               = _mm_mul_ps(fscal,dy00);
2054             tz               = _mm_mul_ps(fscal,dz00);
2055
2056             /* Update vectorial force */
2057             fix0             = _mm_add_ps(fix0,tx);
2058             fiy0             = _mm_add_ps(fiy0,ty);
2059             fiz0             = _mm_add_ps(fiz0,tz);
2060
2061             fjx0             = _mm_add_ps(fjx0,tx);
2062             fjy0             = _mm_add_ps(fjy0,ty);
2063             fjz0             = _mm_add_ps(fjz0,tz);
2064
2065             }
2066
2067             /**************************
2068              * CALCULATE INTERACTIONS *
2069              **************************/
2070
2071             if (gmx_mm_any_lt(rsq11,rcutoff2))
2072             {
2073
2074             /* REACTION-FIELD ELECTROSTATICS */
2075             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
2076
2077             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2078
2079             fscal            = felec;
2080
2081             fscal            = _mm_and_ps(fscal,cutoff_mask);
2082
2083             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2084
2085             /* Calculate temporary vectorial force */
2086             tx               = _mm_mul_ps(fscal,dx11);
2087             ty               = _mm_mul_ps(fscal,dy11);
2088             tz               = _mm_mul_ps(fscal,dz11);
2089
2090             /* Update vectorial force */
2091             fix1             = _mm_add_ps(fix1,tx);
2092             fiy1             = _mm_add_ps(fiy1,ty);
2093             fiz1             = _mm_add_ps(fiz1,tz);
2094
2095             fjx1             = _mm_add_ps(fjx1,tx);
2096             fjy1             = _mm_add_ps(fjy1,ty);
2097             fjz1             = _mm_add_ps(fjz1,tz);
2098
2099             }
2100
2101             /**************************
2102              * CALCULATE INTERACTIONS *
2103              **************************/
2104
2105             if (gmx_mm_any_lt(rsq12,rcutoff2))
2106             {
2107
2108             /* REACTION-FIELD ELECTROSTATICS */
2109             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
2110
2111             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2112
2113             fscal            = felec;
2114
2115             fscal            = _mm_and_ps(fscal,cutoff_mask);
2116
2117             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2118
2119             /* Calculate temporary vectorial force */
2120             tx               = _mm_mul_ps(fscal,dx12);
2121             ty               = _mm_mul_ps(fscal,dy12);
2122             tz               = _mm_mul_ps(fscal,dz12);
2123
2124             /* Update vectorial force */
2125             fix1             = _mm_add_ps(fix1,tx);
2126             fiy1             = _mm_add_ps(fiy1,ty);
2127             fiz1             = _mm_add_ps(fiz1,tz);
2128
2129             fjx2             = _mm_add_ps(fjx2,tx);
2130             fjy2             = _mm_add_ps(fjy2,ty);
2131             fjz2             = _mm_add_ps(fjz2,tz);
2132
2133             }
2134
2135             /**************************
2136              * CALCULATE INTERACTIONS *
2137              **************************/
2138
2139             if (gmx_mm_any_lt(rsq13,rcutoff2))
2140             {
2141
2142             /* REACTION-FIELD ELECTROSTATICS */
2143             felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
2144
2145             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
2146
2147             fscal            = felec;
2148
2149             fscal            = _mm_and_ps(fscal,cutoff_mask);
2150
2151             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2152
2153             /* Calculate temporary vectorial force */
2154             tx               = _mm_mul_ps(fscal,dx13);
2155             ty               = _mm_mul_ps(fscal,dy13);
2156             tz               = _mm_mul_ps(fscal,dz13);
2157
2158             /* Update vectorial force */
2159             fix1             = _mm_add_ps(fix1,tx);
2160             fiy1             = _mm_add_ps(fiy1,ty);
2161             fiz1             = _mm_add_ps(fiz1,tz);
2162
2163             fjx3             = _mm_add_ps(fjx3,tx);
2164             fjy3             = _mm_add_ps(fjy3,ty);
2165             fjz3             = _mm_add_ps(fjz3,tz);
2166
2167             }
2168
2169             /**************************
2170              * CALCULATE INTERACTIONS *
2171              **************************/
2172
2173             if (gmx_mm_any_lt(rsq21,rcutoff2))
2174             {
2175
2176             /* REACTION-FIELD ELECTROSTATICS */
2177             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
2178
2179             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2180
2181             fscal            = felec;
2182
2183             fscal            = _mm_and_ps(fscal,cutoff_mask);
2184
2185             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2186
2187             /* Calculate temporary vectorial force */
2188             tx               = _mm_mul_ps(fscal,dx21);
2189             ty               = _mm_mul_ps(fscal,dy21);
2190             tz               = _mm_mul_ps(fscal,dz21);
2191
2192             /* Update vectorial force */
2193             fix2             = _mm_add_ps(fix2,tx);
2194             fiy2             = _mm_add_ps(fiy2,ty);
2195             fiz2             = _mm_add_ps(fiz2,tz);
2196
2197             fjx1             = _mm_add_ps(fjx1,tx);
2198             fjy1             = _mm_add_ps(fjy1,ty);
2199             fjz1             = _mm_add_ps(fjz1,tz);
2200
2201             }
2202
2203             /**************************
2204              * CALCULATE INTERACTIONS *
2205              **************************/
2206
2207             if (gmx_mm_any_lt(rsq22,rcutoff2))
2208             {
2209
2210             /* REACTION-FIELD ELECTROSTATICS */
2211             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
2212
2213             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2214
2215             fscal            = felec;
2216
2217             fscal            = _mm_and_ps(fscal,cutoff_mask);
2218
2219             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2220
2221             /* Calculate temporary vectorial force */
2222             tx               = _mm_mul_ps(fscal,dx22);
2223             ty               = _mm_mul_ps(fscal,dy22);
2224             tz               = _mm_mul_ps(fscal,dz22);
2225
2226             /* Update vectorial force */
2227             fix2             = _mm_add_ps(fix2,tx);
2228             fiy2             = _mm_add_ps(fiy2,ty);
2229             fiz2             = _mm_add_ps(fiz2,tz);
2230
2231             fjx2             = _mm_add_ps(fjx2,tx);
2232             fjy2             = _mm_add_ps(fjy2,ty);
2233             fjz2             = _mm_add_ps(fjz2,tz);
2234
2235             }
2236
2237             /**************************
2238              * CALCULATE INTERACTIONS *
2239              **************************/
2240
2241             if (gmx_mm_any_lt(rsq23,rcutoff2))
2242             {
2243
2244             /* REACTION-FIELD ELECTROSTATICS */
2245             felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
2246
2247             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
2248
2249             fscal            = felec;
2250
2251             fscal            = _mm_and_ps(fscal,cutoff_mask);
2252
2253             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2254
2255             /* Calculate temporary vectorial force */
2256             tx               = _mm_mul_ps(fscal,dx23);
2257             ty               = _mm_mul_ps(fscal,dy23);
2258             tz               = _mm_mul_ps(fscal,dz23);
2259
2260             /* Update vectorial force */
2261             fix2             = _mm_add_ps(fix2,tx);
2262             fiy2             = _mm_add_ps(fiy2,ty);
2263             fiz2             = _mm_add_ps(fiz2,tz);
2264
2265             fjx3             = _mm_add_ps(fjx3,tx);
2266             fjy3             = _mm_add_ps(fjy3,ty);
2267             fjz3             = _mm_add_ps(fjz3,tz);
2268
2269             }
2270
2271             /**************************
2272              * CALCULATE INTERACTIONS *
2273              **************************/
2274
2275             if (gmx_mm_any_lt(rsq31,rcutoff2))
2276             {
2277
2278             /* REACTION-FIELD ELECTROSTATICS */
2279             felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
2280
2281             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
2282
2283             fscal            = felec;
2284
2285             fscal            = _mm_and_ps(fscal,cutoff_mask);
2286
2287             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2288
2289             /* Calculate temporary vectorial force */
2290             tx               = _mm_mul_ps(fscal,dx31);
2291             ty               = _mm_mul_ps(fscal,dy31);
2292             tz               = _mm_mul_ps(fscal,dz31);
2293
2294             /* Update vectorial force */
2295             fix3             = _mm_add_ps(fix3,tx);
2296             fiy3             = _mm_add_ps(fiy3,ty);
2297             fiz3             = _mm_add_ps(fiz3,tz);
2298
2299             fjx1             = _mm_add_ps(fjx1,tx);
2300             fjy1             = _mm_add_ps(fjy1,ty);
2301             fjz1             = _mm_add_ps(fjz1,tz);
2302
2303             }
2304
2305             /**************************
2306              * CALCULATE INTERACTIONS *
2307              **************************/
2308
2309             if (gmx_mm_any_lt(rsq32,rcutoff2))
2310             {
2311
2312             /* REACTION-FIELD ELECTROSTATICS */
2313             felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
2314
2315             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
2316
2317             fscal            = felec;
2318
2319             fscal            = _mm_and_ps(fscal,cutoff_mask);
2320
2321             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2322
2323             /* Calculate temporary vectorial force */
2324             tx               = _mm_mul_ps(fscal,dx32);
2325             ty               = _mm_mul_ps(fscal,dy32);
2326             tz               = _mm_mul_ps(fscal,dz32);
2327
2328             /* Update vectorial force */
2329             fix3             = _mm_add_ps(fix3,tx);
2330             fiy3             = _mm_add_ps(fiy3,ty);
2331             fiz3             = _mm_add_ps(fiz3,tz);
2332
2333             fjx2             = _mm_add_ps(fjx2,tx);
2334             fjy2             = _mm_add_ps(fjy2,ty);
2335             fjz2             = _mm_add_ps(fjz2,tz);
2336
2337             }
2338
2339             /**************************
2340              * CALCULATE INTERACTIONS *
2341              **************************/
2342
2343             if (gmx_mm_any_lt(rsq33,rcutoff2))
2344             {
2345
2346             /* REACTION-FIELD ELECTROSTATICS */
2347             felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
2348
2349             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
2350
2351             fscal            = felec;
2352
2353             fscal            = _mm_and_ps(fscal,cutoff_mask);
2354
2355             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2356
2357             /* Calculate temporary vectorial force */
2358             tx               = _mm_mul_ps(fscal,dx33);
2359             ty               = _mm_mul_ps(fscal,dy33);
2360             tz               = _mm_mul_ps(fscal,dz33);
2361
2362             /* Update vectorial force */
2363             fix3             = _mm_add_ps(fix3,tx);
2364             fiy3             = _mm_add_ps(fiy3,ty);
2365             fiz3             = _mm_add_ps(fiz3,tz);
2366
2367             fjx3             = _mm_add_ps(fjx3,tx);
2368             fjy3             = _mm_add_ps(fjy3,ty);
2369             fjz3             = _mm_add_ps(fjz3,tz);
2370
2371             }
2372
2373             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
2374                                                    f+j_coord_offsetC,f+j_coord_offsetD,
2375                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2376                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2377
2378             /* Inner loop uses 330 flops */
2379         }
2380
2381         /* End of innermost loop */
2382
2383         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2384                                               f+i_coord_offset,fshift+i_shift_offset);
2385
2386         /* Increment number of inner iterations */
2387         inneriter                  += j_index_end - j_index_start;
2388
2389         /* Outer loop uses 36 flops */
2390     }
2391
2392     /* Increment number of outer iterations */
2393     outeriter        += nri;
2394
2395     /* Update outer/inner flops */
2396
2397     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*330);
2398 }