Rename remaining GMX_ACCELERATION to GMX_CPU_ACCELERATION
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_sse2_single.c
1 /*
2  * Note: this file was generated by the Gromacs sse2_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sse2_single
38  * Electrostatics interaction: ReactionField
39  * VdW interaction:            LennardJones
40  * Geometry:                   Water3-Water3
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sse2_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
62     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
63     real             shX,shY,shZ,rcutoff_scalar;
64     real             *shiftvec,*fshift,*x,*f;
65     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
66     int              vdwioffset0;
67     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
68     int              vdwioffset1;
69     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
70     int              vdwioffset2;
71     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
73     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
75     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
76     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
77     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
78     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
79     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
80     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
81     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
83     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
84     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
85     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
86     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
87     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
88     real             *charge;
89     int              nvdwtype;
90     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
91     int              *vdwtype;
92     real             *vdwparam;
93     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
94     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
95     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
96     real             rswitch_scalar,d_scalar;
97     __m128           dummy_mask,cutoff_mask;
98     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
99     __m128           one     = _mm_set1_ps(1.0);
100     __m128           two     = _mm_set1_ps(2.0);
101     x                = xx[0];
102     f                = ff[0];
103
104     nri              = nlist->nri;
105     iinr             = nlist->iinr;
106     jindex           = nlist->jindex;
107     jjnr             = nlist->jjnr;
108     shiftidx         = nlist->shift;
109     gid              = nlist->gid;
110     shiftvec         = fr->shift_vec[0];
111     fshift           = fr->fshift[0];
112     facel            = _mm_set1_ps(fr->epsfac);
113     charge           = mdatoms->chargeA;
114     krf              = _mm_set1_ps(fr->ic->k_rf);
115     krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
116     crf              = _mm_set1_ps(fr->ic->c_rf);
117     nvdwtype         = fr->ntype;
118     vdwparam         = fr->nbfp;
119     vdwtype          = mdatoms->typeA;
120
121     /* Setup water-specific parameters */
122     inr              = nlist->iinr[0];
123     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
124     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
125     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
126     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
127
128     jq0              = _mm_set1_ps(charge[inr+0]);
129     jq1              = _mm_set1_ps(charge[inr+1]);
130     jq2              = _mm_set1_ps(charge[inr+2]);
131     vdwjidx0A        = 2*vdwtype[inr+0];
132     qq00             = _mm_mul_ps(iq0,jq0);
133     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
134     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
135     qq01             = _mm_mul_ps(iq0,jq1);
136     qq02             = _mm_mul_ps(iq0,jq2);
137     qq10             = _mm_mul_ps(iq1,jq0);
138     qq11             = _mm_mul_ps(iq1,jq1);
139     qq12             = _mm_mul_ps(iq1,jq2);
140     qq20             = _mm_mul_ps(iq2,jq0);
141     qq21             = _mm_mul_ps(iq2,jq1);
142     qq22             = _mm_mul_ps(iq2,jq2);
143
144     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
145     rcutoff_scalar   = fr->rcoulomb;
146     rcutoff          = _mm_set1_ps(rcutoff_scalar);
147     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
148
149     rswitch_scalar   = fr->rvdw_switch;
150     rswitch          = _mm_set1_ps(rswitch_scalar);
151     /* Setup switch parameters */
152     d_scalar         = rcutoff_scalar-rswitch_scalar;
153     d                = _mm_set1_ps(d_scalar);
154     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
155     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
156     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
157     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
158     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
159     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
160
161     /* Avoid stupid compiler warnings */
162     jnrA = jnrB = jnrC = jnrD = 0;
163     j_coord_offsetA = 0;
164     j_coord_offsetB = 0;
165     j_coord_offsetC = 0;
166     j_coord_offsetD = 0;
167
168     outeriter        = 0;
169     inneriter        = 0;
170
171     /* Start outer loop over neighborlists */
172     for(iidx=0; iidx<nri; iidx++)
173     {
174         /* Load shift vector for this list */
175         i_shift_offset   = DIM*shiftidx[iidx];
176         shX              = shiftvec[i_shift_offset+XX];
177         shY              = shiftvec[i_shift_offset+YY];
178         shZ              = shiftvec[i_shift_offset+ZZ];
179
180         /* Load limits for loop over neighbors */
181         j_index_start    = jindex[iidx];
182         j_index_end      = jindex[iidx+1];
183
184         /* Get outer coordinate index */
185         inr              = iinr[iidx];
186         i_coord_offset   = DIM*inr;
187
188         /* Load i particle coords and add shift vector */
189         ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
190         iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
191         iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
192         ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
193         iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
194         iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
195         ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
196         iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
197         iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
198
199         fix0             = _mm_setzero_ps();
200         fiy0             = _mm_setzero_ps();
201         fiz0             = _mm_setzero_ps();
202         fix1             = _mm_setzero_ps();
203         fiy1             = _mm_setzero_ps();
204         fiz1             = _mm_setzero_ps();
205         fix2             = _mm_setzero_ps();
206         fiy2             = _mm_setzero_ps();
207         fiz2             = _mm_setzero_ps();
208
209         /* Reset potential sums */
210         velecsum         = _mm_setzero_ps();
211         vvdwsum          = _mm_setzero_ps();
212
213         /* Start inner kernel loop */
214         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
215         {
216
217             /* Get j neighbor index, and coordinate index */
218             jnrA             = jjnr[jidx];
219             jnrB             = jjnr[jidx+1];
220             jnrC             = jjnr[jidx+2];
221             jnrD             = jjnr[jidx+3];
222
223             j_coord_offsetA  = DIM*jnrA;
224             j_coord_offsetB  = DIM*jnrB;
225             j_coord_offsetC  = DIM*jnrC;
226             j_coord_offsetD  = DIM*jnrD;
227
228             /* load j atom coordinates */
229             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
230                                               x+j_coord_offsetC,x+j_coord_offsetD,
231                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
232
233             /* Calculate displacement vector */
234             dx00             = _mm_sub_ps(ix0,jx0);
235             dy00             = _mm_sub_ps(iy0,jy0);
236             dz00             = _mm_sub_ps(iz0,jz0);
237             dx01             = _mm_sub_ps(ix0,jx1);
238             dy01             = _mm_sub_ps(iy0,jy1);
239             dz01             = _mm_sub_ps(iz0,jz1);
240             dx02             = _mm_sub_ps(ix0,jx2);
241             dy02             = _mm_sub_ps(iy0,jy2);
242             dz02             = _mm_sub_ps(iz0,jz2);
243             dx10             = _mm_sub_ps(ix1,jx0);
244             dy10             = _mm_sub_ps(iy1,jy0);
245             dz10             = _mm_sub_ps(iz1,jz0);
246             dx11             = _mm_sub_ps(ix1,jx1);
247             dy11             = _mm_sub_ps(iy1,jy1);
248             dz11             = _mm_sub_ps(iz1,jz1);
249             dx12             = _mm_sub_ps(ix1,jx2);
250             dy12             = _mm_sub_ps(iy1,jy2);
251             dz12             = _mm_sub_ps(iz1,jz2);
252             dx20             = _mm_sub_ps(ix2,jx0);
253             dy20             = _mm_sub_ps(iy2,jy0);
254             dz20             = _mm_sub_ps(iz2,jz0);
255             dx21             = _mm_sub_ps(ix2,jx1);
256             dy21             = _mm_sub_ps(iy2,jy1);
257             dz21             = _mm_sub_ps(iz2,jz1);
258             dx22             = _mm_sub_ps(ix2,jx2);
259             dy22             = _mm_sub_ps(iy2,jy2);
260             dz22             = _mm_sub_ps(iz2,jz2);
261
262             /* Calculate squared distance and things based on it */
263             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
264             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
265             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
266             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
267             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
268             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
269             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
270             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
271             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
272
273             rinv00           = gmx_mm_invsqrt_ps(rsq00);
274             rinv01           = gmx_mm_invsqrt_ps(rsq01);
275             rinv02           = gmx_mm_invsqrt_ps(rsq02);
276             rinv10           = gmx_mm_invsqrt_ps(rsq10);
277             rinv11           = gmx_mm_invsqrt_ps(rsq11);
278             rinv12           = gmx_mm_invsqrt_ps(rsq12);
279             rinv20           = gmx_mm_invsqrt_ps(rsq20);
280             rinv21           = gmx_mm_invsqrt_ps(rsq21);
281             rinv22           = gmx_mm_invsqrt_ps(rsq22);
282
283             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
284             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
285             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
286             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
287             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
288             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
289             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
290             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
291             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
292
293             fjx0             = _mm_setzero_ps();
294             fjy0             = _mm_setzero_ps();
295             fjz0             = _mm_setzero_ps();
296             fjx1             = _mm_setzero_ps();
297             fjy1             = _mm_setzero_ps();
298             fjz1             = _mm_setzero_ps();
299             fjx2             = _mm_setzero_ps();
300             fjy2             = _mm_setzero_ps();
301             fjz2             = _mm_setzero_ps();
302
303             /**************************
304              * CALCULATE INTERACTIONS *
305              **************************/
306
307             if (gmx_mm_any_lt(rsq00,rcutoff2))
308             {
309
310             r00              = _mm_mul_ps(rsq00,rinv00);
311
312             /* REACTION-FIELD ELECTROSTATICS */
313             velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
314             felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
315
316             /* LENNARD-JONES DISPERSION/REPULSION */
317
318             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
319             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
320             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
321             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
322             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
323
324             d                = _mm_sub_ps(r00,rswitch);
325             d                = _mm_max_ps(d,_mm_setzero_ps());
326             d2               = _mm_mul_ps(d,d);
327             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
328
329             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
330
331             /* Evaluate switch function */
332             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
333             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
334             vvdw             = _mm_mul_ps(vvdw,sw);
335             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
336
337             /* Update potential sum for this i atom from the interaction with this j atom. */
338             velec            = _mm_and_ps(velec,cutoff_mask);
339             velecsum         = _mm_add_ps(velecsum,velec);
340             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
341             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
342
343             fscal            = _mm_add_ps(felec,fvdw);
344
345             fscal            = _mm_and_ps(fscal,cutoff_mask);
346
347             /* Calculate temporary vectorial force */
348             tx               = _mm_mul_ps(fscal,dx00);
349             ty               = _mm_mul_ps(fscal,dy00);
350             tz               = _mm_mul_ps(fscal,dz00);
351
352             /* Update vectorial force */
353             fix0             = _mm_add_ps(fix0,tx);
354             fiy0             = _mm_add_ps(fiy0,ty);
355             fiz0             = _mm_add_ps(fiz0,tz);
356
357             fjx0             = _mm_add_ps(fjx0,tx);
358             fjy0             = _mm_add_ps(fjy0,ty);
359             fjz0             = _mm_add_ps(fjz0,tz);
360
361             }
362
363             /**************************
364              * CALCULATE INTERACTIONS *
365              **************************/
366
367             if (gmx_mm_any_lt(rsq01,rcutoff2))
368             {
369
370             /* REACTION-FIELD ELECTROSTATICS */
371             velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
372             felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
373
374             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
375
376             /* Update potential sum for this i atom from the interaction with this j atom. */
377             velec            = _mm_and_ps(velec,cutoff_mask);
378             velecsum         = _mm_add_ps(velecsum,velec);
379
380             fscal            = felec;
381
382             fscal            = _mm_and_ps(fscal,cutoff_mask);
383
384             /* Calculate temporary vectorial force */
385             tx               = _mm_mul_ps(fscal,dx01);
386             ty               = _mm_mul_ps(fscal,dy01);
387             tz               = _mm_mul_ps(fscal,dz01);
388
389             /* Update vectorial force */
390             fix0             = _mm_add_ps(fix0,tx);
391             fiy0             = _mm_add_ps(fiy0,ty);
392             fiz0             = _mm_add_ps(fiz0,tz);
393
394             fjx1             = _mm_add_ps(fjx1,tx);
395             fjy1             = _mm_add_ps(fjy1,ty);
396             fjz1             = _mm_add_ps(fjz1,tz);
397
398             }
399
400             /**************************
401              * CALCULATE INTERACTIONS *
402              **************************/
403
404             if (gmx_mm_any_lt(rsq02,rcutoff2))
405             {
406
407             /* REACTION-FIELD ELECTROSTATICS */
408             velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
409             felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
410
411             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
412
413             /* Update potential sum for this i atom from the interaction with this j atom. */
414             velec            = _mm_and_ps(velec,cutoff_mask);
415             velecsum         = _mm_add_ps(velecsum,velec);
416
417             fscal            = felec;
418
419             fscal            = _mm_and_ps(fscal,cutoff_mask);
420
421             /* Calculate temporary vectorial force */
422             tx               = _mm_mul_ps(fscal,dx02);
423             ty               = _mm_mul_ps(fscal,dy02);
424             tz               = _mm_mul_ps(fscal,dz02);
425
426             /* Update vectorial force */
427             fix0             = _mm_add_ps(fix0,tx);
428             fiy0             = _mm_add_ps(fiy0,ty);
429             fiz0             = _mm_add_ps(fiz0,tz);
430
431             fjx2             = _mm_add_ps(fjx2,tx);
432             fjy2             = _mm_add_ps(fjy2,ty);
433             fjz2             = _mm_add_ps(fjz2,tz);
434
435             }
436
437             /**************************
438              * CALCULATE INTERACTIONS *
439              **************************/
440
441             if (gmx_mm_any_lt(rsq10,rcutoff2))
442             {
443
444             /* REACTION-FIELD ELECTROSTATICS */
445             velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
446             felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
447
448             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
449
450             /* Update potential sum for this i atom from the interaction with this j atom. */
451             velec            = _mm_and_ps(velec,cutoff_mask);
452             velecsum         = _mm_add_ps(velecsum,velec);
453
454             fscal            = felec;
455
456             fscal            = _mm_and_ps(fscal,cutoff_mask);
457
458             /* Calculate temporary vectorial force */
459             tx               = _mm_mul_ps(fscal,dx10);
460             ty               = _mm_mul_ps(fscal,dy10);
461             tz               = _mm_mul_ps(fscal,dz10);
462
463             /* Update vectorial force */
464             fix1             = _mm_add_ps(fix1,tx);
465             fiy1             = _mm_add_ps(fiy1,ty);
466             fiz1             = _mm_add_ps(fiz1,tz);
467
468             fjx0             = _mm_add_ps(fjx0,tx);
469             fjy0             = _mm_add_ps(fjy0,ty);
470             fjz0             = _mm_add_ps(fjz0,tz);
471
472             }
473
474             /**************************
475              * CALCULATE INTERACTIONS *
476              **************************/
477
478             if (gmx_mm_any_lt(rsq11,rcutoff2))
479             {
480
481             /* REACTION-FIELD ELECTROSTATICS */
482             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
483             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
484
485             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
486
487             /* Update potential sum for this i atom from the interaction with this j atom. */
488             velec            = _mm_and_ps(velec,cutoff_mask);
489             velecsum         = _mm_add_ps(velecsum,velec);
490
491             fscal            = felec;
492
493             fscal            = _mm_and_ps(fscal,cutoff_mask);
494
495             /* Calculate temporary vectorial force */
496             tx               = _mm_mul_ps(fscal,dx11);
497             ty               = _mm_mul_ps(fscal,dy11);
498             tz               = _mm_mul_ps(fscal,dz11);
499
500             /* Update vectorial force */
501             fix1             = _mm_add_ps(fix1,tx);
502             fiy1             = _mm_add_ps(fiy1,ty);
503             fiz1             = _mm_add_ps(fiz1,tz);
504
505             fjx1             = _mm_add_ps(fjx1,tx);
506             fjy1             = _mm_add_ps(fjy1,ty);
507             fjz1             = _mm_add_ps(fjz1,tz);
508
509             }
510
511             /**************************
512              * CALCULATE INTERACTIONS *
513              **************************/
514
515             if (gmx_mm_any_lt(rsq12,rcutoff2))
516             {
517
518             /* REACTION-FIELD ELECTROSTATICS */
519             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
520             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
521
522             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
523
524             /* Update potential sum for this i atom from the interaction with this j atom. */
525             velec            = _mm_and_ps(velec,cutoff_mask);
526             velecsum         = _mm_add_ps(velecsum,velec);
527
528             fscal            = felec;
529
530             fscal            = _mm_and_ps(fscal,cutoff_mask);
531
532             /* Calculate temporary vectorial force */
533             tx               = _mm_mul_ps(fscal,dx12);
534             ty               = _mm_mul_ps(fscal,dy12);
535             tz               = _mm_mul_ps(fscal,dz12);
536
537             /* Update vectorial force */
538             fix1             = _mm_add_ps(fix1,tx);
539             fiy1             = _mm_add_ps(fiy1,ty);
540             fiz1             = _mm_add_ps(fiz1,tz);
541
542             fjx2             = _mm_add_ps(fjx2,tx);
543             fjy2             = _mm_add_ps(fjy2,ty);
544             fjz2             = _mm_add_ps(fjz2,tz);
545
546             }
547
548             /**************************
549              * CALCULATE INTERACTIONS *
550              **************************/
551
552             if (gmx_mm_any_lt(rsq20,rcutoff2))
553             {
554
555             /* REACTION-FIELD ELECTROSTATICS */
556             velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
557             felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
558
559             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
560
561             /* Update potential sum for this i atom from the interaction with this j atom. */
562             velec            = _mm_and_ps(velec,cutoff_mask);
563             velecsum         = _mm_add_ps(velecsum,velec);
564
565             fscal            = felec;
566
567             fscal            = _mm_and_ps(fscal,cutoff_mask);
568
569             /* Calculate temporary vectorial force */
570             tx               = _mm_mul_ps(fscal,dx20);
571             ty               = _mm_mul_ps(fscal,dy20);
572             tz               = _mm_mul_ps(fscal,dz20);
573
574             /* Update vectorial force */
575             fix2             = _mm_add_ps(fix2,tx);
576             fiy2             = _mm_add_ps(fiy2,ty);
577             fiz2             = _mm_add_ps(fiz2,tz);
578
579             fjx0             = _mm_add_ps(fjx0,tx);
580             fjy0             = _mm_add_ps(fjy0,ty);
581             fjz0             = _mm_add_ps(fjz0,tz);
582
583             }
584
585             /**************************
586              * CALCULATE INTERACTIONS *
587              **************************/
588
589             if (gmx_mm_any_lt(rsq21,rcutoff2))
590             {
591
592             /* REACTION-FIELD ELECTROSTATICS */
593             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
594             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
595
596             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
597
598             /* Update potential sum for this i atom from the interaction with this j atom. */
599             velec            = _mm_and_ps(velec,cutoff_mask);
600             velecsum         = _mm_add_ps(velecsum,velec);
601
602             fscal            = felec;
603
604             fscal            = _mm_and_ps(fscal,cutoff_mask);
605
606             /* Calculate temporary vectorial force */
607             tx               = _mm_mul_ps(fscal,dx21);
608             ty               = _mm_mul_ps(fscal,dy21);
609             tz               = _mm_mul_ps(fscal,dz21);
610
611             /* Update vectorial force */
612             fix2             = _mm_add_ps(fix2,tx);
613             fiy2             = _mm_add_ps(fiy2,ty);
614             fiz2             = _mm_add_ps(fiz2,tz);
615
616             fjx1             = _mm_add_ps(fjx1,tx);
617             fjy1             = _mm_add_ps(fjy1,ty);
618             fjz1             = _mm_add_ps(fjz1,tz);
619
620             }
621
622             /**************************
623              * CALCULATE INTERACTIONS *
624              **************************/
625
626             if (gmx_mm_any_lt(rsq22,rcutoff2))
627             {
628
629             /* REACTION-FIELD ELECTROSTATICS */
630             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
631             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
632
633             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
634
635             /* Update potential sum for this i atom from the interaction with this j atom. */
636             velec            = _mm_and_ps(velec,cutoff_mask);
637             velecsum         = _mm_add_ps(velecsum,velec);
638
639             fscal            = felec;
640
641             fscal            = _mm_and_ps(fscal,cutoff_mask);
642
643             /* Calculate temporary vectorial force */
644             tx               = _mm_mul_ps(fscal,dx22);
645             ty               = _mm_mul_ps(fscal,dy22);
646             tz               = _mm_mul_ps(fscal,dz22);
647
648             /* Update vectorial force */
649             fix2             = _mm_add_ps(fix2,tx);
650             fiy2             = _mm_add_ps(fiy2,ty);
651             fiz2             = _mm_add_ps(fiz2,tz);
652
653             fjx2             = _mm_add_ps(fjx2,tx);
654             fjy2             = _mm_add_ps(fjy2,ty);
655             fjz2             = _mm_add_ps(fjz2,tz);
656
657             }
658
659             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
660                                                    f+j_coord_offsetC,f+j_coord_offsetD,
661                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
662
663             /* Inner loop uses 358 flops */
664         }
665
666         if(jidx<j_index_end)
667         {
668
669             /* Get j neighbor index, and coordinate index */
670             jnrA             = jjnr[jidx];
671             jnrB             = jjnr[jidx+1];
672             jnrC             = jjnr[jidx+2];
673             jnrD             = jjnr[jidx+3];
674
675             /* Sign of each element will be negative for non-real atoms.
676              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
677              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
678              */
679             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
680             jnrA       = (jnrA>=0) ? jnrA : 0;
681             jnrB       = (jnrB>=0) ? jnrB : 0;
682             jnrC       = (jnrC>=0) ? jnrC : 0;
683             jnrD       = (jnrD>=0) ? jnrD : 0;
684
685             j_coord_offsetA  = DIM*jnrA;
686             j_coord_offsetB  = DIM*jnrB;
687             j_coord_offsetC  = DIM*jnrC;
688             j_coord_offsetD  = DIM*jnrD;
689
690             /* load j atom coordinates */
691             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
692                                               x+j_coord_offsetC,x+j_coord_offsetD,
693                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
694
695             /* Calculate displacement vector */
696             dx00             = _mm_sub_ps(ix0,jx0);
697             dy00             = _mm_sub_ps(iy0,jy0);
698             dz00             = _mm_sub_ps(iz0,jz0);
699             dx01             = _mm_sub_ps(ix0,jx1);
700             dy01             = _mm_sub_ps(iy0,jy1);
701             dz01             = _mm_sub_ps(iz0,jz1);
702             dx02             = _mm_sub_ps(ix0,jx2);
703             dy02             = _mm_sub_ps(iy0,jy2);
704             dz02             = _mm_sub_ps(iz0,jz2);
705             dx10             = _mm_sub_ps(ix1,jx0);
706             dy10             = _mm_sub_ps(iy1,jy0);
707             dz10             = _mm_sub_ps(iz1,jz0);
708             dx11             = _mm_sub_ps(ix1,jx1);
709             dy11             = _mm_sub_ps(iy1,jy1);
710             dz11             = _mm_sub_ps(iz1,jz1);
711             dx12             = _mm_sub_ps(ix1,jx2);
712             dy12             = _mm_sub_ps(iy1,jy2);
713             dz12             = _mm_sub_ps(iz1,jz2);
714             dx20             = _mm_sub_ps(ix2,jx0);
715             dy20             = _mm_sub_ps(iy2,jy0);
716             dz20             = _mm_sub_ps(iz2,jz0);
717             dx21             = _mm_sub_ps(ix2,jx1);
718             dy21             = _mm_sub_ps(iy2,jy1);
719             dz21             = _mm_sub_ps(iz2,jz1);
720             dx22             = _mm_sub_ps(ix2,jx2);
721             dy22             = _mm_sub_ps(iy2,jy2);
722             dz22             = _mm_sub_ps(iz2,jz2);
723
724             /* Calculate squared distance and things based on it */
725             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
726             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
727             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
728             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
729             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
730             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
731             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
732             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
733             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
734
735             rinv00           = gmx_mm_invsqrt_ps(rsq00);
736             rinv01           = gmx_mm_invsqrt_ps(rsq01);
737             rinv02           = gmx_mm_invsqrt_ps(rsq02);
738             rinv10           = gmx_mm_invsqrt_ps(rsq10);
739             rinv11           = gmx_mm_invsqrt_ps(rsq11);
740             rinv12           = gmx_mm_invsqrt_ps(rsq12);
741             rinv20           = gmx_mm_invsqrt_ps(rsq20);
742             rinv21           = gmx_mm_invsqrt_ps(rsq21);
743             rinv22           = gmx_mm_invsqrt_ps(rsq22);
744
745             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
746             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
747             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
748             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
749             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
750             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
751             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
752             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
753             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
754
755             fjx0             = _mm_setzero_ps();
756             fjy0             = _mm_setzero_ps();
757             fjz0             = _mm_setzero_ps();
758             fjx1             = _mm_setzero_ps();
759             fjy1             = _mm_setzero_ps();
760             fjz1             = _mm_setzero_ps();
761             fjx2             = _mm_setzero_ps();
762             fjy2             = _mm_setzero_ps();
763             fjz2             = _mm_setzero_ps();
764
765             /**************************
766              * CALCULATE INTERACTIONS *
767              **************************/
768
769             if (gmx_mm_any_lt(rsq00,rcutoff2))
770             {
771
772             r00              = _mm_mul_ps(rsq00,rinv00);
773             r00              = _mm_andnot_ps(dummy_mask,r00);
774
775             /* REACTION-FIELD ELECTROSTATICS */
776             velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
777             felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
778
779             /* LENNARD-JONES DISPERSION/REPULSION */
780
781             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
782             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
783             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
784             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
785             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
786
787             d                = _mm_sub_ps(r00,rswitch);
788             d                = _mm_max_ps(d,_mm_setzero_ps());
789             d2               = _mm_mul_ps(d,d);
790             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
791
792             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
793
794             /* Evaluate switch function */
795             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
796             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
797             vvdw             = _mm_mul_ps(vvdw,sw);
798             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
799
800             /* Update potential sum for this i atom from the interaction with this j atom. */
801             velec            = _mm_and_ps(velec,cutoff_mask);
802             velec            = _mm_andnot_ps(dummy_mask,velec);
803             velecsum         = _mm_add_ps(velecsum,velec);
804             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
805             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
806             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
807
808             fscal            = _mm_add_ps(felec,fvdw);
809
810             fscal            = _mm_and_ps(fscal,cutoff_mask);
811
812             fscal            = _mm_andnot_ps(dummy_mask,fscal);
813
814             /* Calculate temporary vectorial force */
815             tx               = _mm_mul_ps(fscal,dx00);
816             ty               = _mm_mul_ps(fscal,dy00);
817             tz               = _mm_mul_ps(fscal,dz00);
818
819             /* Update vectorial force */
820             fix0             = _mm_add_ps(fix0,tx);
821             fiy0             = _mm_add_ps(fiy0,ty);
822             fiz0             = _mm_add_ps(fiz0,tz);
823
824             fjx0             = _mm_add_ps(fjx0,tx);
825             fjy0             = _mm_add_ps(fjy0,ty);
826             fjz0             = _mm_add_ps(fjz0,tz);
827
828             }
829
830             /**************************
831              * CALCULATE INTERACTIONS *
832              **************************/
833
834             if (gmx_mm_any_lt(rsq01,rcutoff2))
835             {
836
837             /* REACTION-FIELD ELECTROSTATICS */
838             velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
839             felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
840
841             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
842
843             /* Update potential sum for this i atom from the interaction with this j atom. */
844             velec            = _mm_and_ps(velec,cutoff_mask);
845             velec            = _mm_andnot_ps(dummy_mask,velec);
846             velecsum         = _mm_add_ps(velecsum,velec);
847
848             fscal            = felec;
849
850             fscal            = _mm_and_ps(fscal,cutoff_mask);
851
852             fscal            = _mm_andnot_ps(dummy_mask,fscal);
853
854             /* Calculate temporary vectorial force */
855             tx               = _mm_mul_ps(fscal,dx01);
856             ty               = _mm_mul_ps(fscal,dy01);
857             tz               = _mm_mul_ps(fscal,dz01);
858
859             /* Update vectorial force */
860             fix0             = _mm_add_ps(fix0,tx);
861             fiy0             = _mm_add_ps(fiy0,ty);
862             fiz0             = _mm_add_ps(fiz0,tz);
863
864             fjx1             = _mm_add_ps(fjx1,tx);
865             fjy1             = _mm_add_ps(fjy1,ty);
866             fjz1             = _mm_add_ps(fjz1,tz);
867
868             }
869
870             /**************************
871              * CALCULATE INTERACTIONS *
872              **************************/
873
874             if (gmx_mm_any_lt(rsq02,rcutoff2))
875             {
876
877             /* REACTION-FIELD ELECTROSTATICS */
878             velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
879             felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
880
881             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
882
883             /* Update potential sum for this i atom from the interaction with this j atom. */
884             velec            = _mm_and_ps(velec,cutoff_mask);
885             velec            = _mm_andnot_ps(dummy_mask,velec);
886             velecsum         = _mm_add_ps(velecsum,velec);
887
888             fscal            = felec;
889
890             fscal            = _mm_and_ps(fscal,cutoff_mask);
891
892             fscal            = _mm_andnot_ps(dummy_mask,fscal);
893
894             /* Calculate temporary vectorial force */
895             tx               = _mm_mul_ps(fscal,dx02);
896             ty               = _mm_mul_ps(fscal,dy02);
897             tz               = _mm_mul_ps(fscal,dz02);
898
899             /* Update vectorial force */
900             fix0             = _mm_add_ps(fix0,tx);
901             fiy0             = _mm_add_ps(fiy0,ty);
902             fiz0             = _mm_add_ps(fiz0,tz);
903
904             fjx2             = _mm_add_ps(fjx2,tx);
905             fjy2             = _mm_add_ps(fjy2,ty);
906             fjz2             = _mm_add_ps(fjz2,tz);
907
908             }
909
910             /**************************
911              * CALCULATE INTERACTIONS *
912              **************************/
913
914             if (gmx_mm_any_lt(rsq10,rcutoff2))
915             {
916
917             /* REACTION-FIELD ELECTROSTATICS */
918             velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
919             felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
920
921             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
922
923             /* Update potential sum for this i atom from the interaction with this j atom. */
924             velec            = _mm_and_ps(velec,cutoff_mask);
925             velec            = _mm_andnot_ps(dummy_mask,velec);
926             velecsum         = _mm_add_ps(velecsum,velec);
927
928             fscal            = felec;
929
930             fscal            = _mm_and_ps(fscal,cutoff_mask);
931
932             fscal            = _mm_andnot_ps(dummy_mask,fscal);
933
934             /* Calculate temporary vectorial force */
935             tx               = _mm_mul_ps(fscal,dx10);
936             ty               = _mm_mul_ps(fscal,dy10);
937             tz               = _mm_mul_ps(fscal,dz10);
938
939             /* Update vectorial force */
940             fix1             = _mm_add_ps(fix1,tx);
941             fiy1             = _mm_add_ps(fiy1,ty);
942             fiz1             = _mm_add_ps(fiz1,tz);
943
944             fjx0             = _mm_add_ps(fjx0,tx);
945             fjy0             = _mm_add_ps(fjy0,ty);
946             fjz0             = _mm_add_ps(fjz0,tz);
947
948             }
949
950             /**************************
951              * CALCULATE INTERACTIONS *
952              **************************/
953
954             if (gmx_mm_any_lt(rsq11,rcutoff2))
955             {
956
957             /* REACTION-FIELD ELECTROSTATICS */
958             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
959             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
960
961             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
962
963             /* Update potential sum for this i atom from the interaction with this j atom. */
964             velec            = _mm_and_ps(velec,cutoff_mask);
965             velec            = _mm_andnot_ps(dummy_mask,velec);
966             velecsum         = _mm_add_ps(velecsum,velec);
967
968             fscal            = felec;
969
970             fscal            = _mm_and_ps(fscal,cutoff_mask);
971
972             fscal            = _mm_andnot_ps(dummy_mask,fscal);
973
974             /* Calculate temporary vectorial force */
975             tx               = _mm_mul_ps(fscal,dx11);
976             ty               = _mm_mul_ps(fscal,dy11);
977             tz               = _mm_mul_ps(fscal,dz11);
978
979             /* Update vectorial force */
980             fix1             = _mm_add_ps(fix1,tx);
981             fiy1             = _mm_add_ps(fiy1,ty);
982             fiz1             = _mm_add_ps(fiz1,tz);
983
984             fjx1             = _mm_add_ps(fjx1,tx);
985             fjy1             = _mm_add_ps(fjy1,ty);
986             fjz1             = _mm_add_ps(fjz1,tz);
987
988             }
989
990             /**************************
991              * CALCULATE INTERACTIONS *
992              **************************/
993
994             if (gmx_mm_any_lt(rsq12,rcutoff2))
995             {
996
997             /* REACTION-FIELD ELECTROSTATICS */
998             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
999             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1000
1001             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1002
1003             /* Update potential sum for this i atom from the interaction with this j atom. */
1004             velec            = _mm_and_ps(velec,cutoff_mask);
1005             velec            = _mm_andnot_ps(dummy_mask,velec);
1006             velecsum         = _mm_add_ps(velecsum,velec);
1007
1008             fscal            = felec;
1009
1010             fscal            = _mm_and_ps(fscal,cutoff_mask);
1011
1012             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1013
1014             /* Calculate temporary vectorial force */
1015             tx               = _mm_mul_ps(fscal,dx12);
1016             ty               = _mm_mul_ps(fscal,dy12);
1017             tz               = _mm_mul_ps(fscal,dz12);
1018
1019             /* Update vectorial force */
1020             fix1             = _mm_add_ps(fix1,tx);
1021             fiy1             = _mm_add_ps(fiy1,ty);
1022             fiz1             = _mm_add_ps(fiz1,tz);
1023
1024             fjx2             = _mm_add_ps(fjx2,tx);
1025             fjy2             = _mm_add_ps(fjy2,ty);
1026             fjz2             = _mm_add_ps(fjz2,tz);
1027
1028             }
1029
1030             /**************************
1031              * CALCULATE INTERACTIONS *
1032              **************************/
1033
1034             if (gmx_mm_any_lt(rsq20,rcutoff2))
1035             {
1036
1037             /* REACTION-FIELD ELECTROSTATICS */
1038             velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
1039             felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1040
1041             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1042
1043             /* Update potential sum for this i atom from the interaction with this j atom. */
1044             velec            = _mm_and_ps(velec,cutoff_mask);
1045             velec            = _mm_andnot_ps(dummy_mask,velec);
1046             velecsum         = _mm_add_ps(velecsum,velec);
1047
1048             fscal            = felec;
1049
1050             fscal            = _mm_and_ps(fscal,cutoff_mask);
1051
1052             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1053
1054             /* Calculate temporary vectorial force */
1055             tx               = _mm_mul_ps(fscal,dx20);
1056             ty               = _mm_mul_ps(fscal,dy20);
1057             tz               = _mm_mul_ps(fscal,dz20);
1058
1059             /* Update vectorial force */
1060             fix2             = _mm_add_ps(fix2,tx);
1061             fiy2             = _mm_add_ps(fiy2,ty);
1062             fiz2             = _mm_add_ps(fiz2,tz);
1063
1064             fjx0             = _mm_add_ps(fjx0,tx);
1065             fjy0             = _mm_add_ps(fjy0,ty);
1066             fjz0             = _mm_add_ps(fjz0,tz);
1067
1068             }
1069
1070             /**************************
1071              * CALCULATE INTERACTIONS *
1072              **************************/
1073
1074             if (gmx_mm_any_lt(rsq21,rcutoff2))
1075             {
1076
1077             /* REACTION-FIELD ELECTROSTATICS */
1078             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
1079             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1080
1081             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1082
1083             /* Update potential sum for this i atom from the interaction with this j atom. */
1084             velec            = _mm_and_ps(velec,cutoff_mask);
1085             velec            = _mm_andnot_ps(dummy_mask,velec);
1086             velecsum         = _mm_add_ps(velecsum,velec);
1087
1088             fscal            = felec;
1089
1090             fscal            = _mm_and_ps(fscal,cutoff_mask);
1091
1092             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1093
1094             /* Calculate temporary vectorial force */
1095             tx               = _mm_mul_ps(fscal,dx21);
1096             ty               = _mm_mul_ps(fscal,dy21);
1097             tz               = _mm_mul_ps(fscal,dz21);
1098
1099             /* Update vectorial force */
1100             fix2             = _mm_add_ps(fix2,tx);
1101             fiy2             = _mm_add_ps(fiy2,ty);
1102             fiz2             = _mm_add_ps(fiz2,tz);
1103
1104             fjx1             = _mm_add_ps(fjx1,tx);
1105             fjy1             = _mm_add_ps(fjy1,ty);
1106             fjz1             = _mm_add_ps(fjz1,tz);
1107
1108             }
1109
1110             /**************************
1111              * CALCULATE INTERACTIONS *
1112              **************************/
1113
1114             if (gmx_mm_any_lt(rsq22,rcutoff2))
1115             {
1116
1117             /* REACTION-FIELD ELECTROSTATICS */
1118             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
1119             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1120
1121             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1122
1123             /* Update potential sum for this i atom from the interaction with this j atom. */
1124             velec            = _mm_and_ps(velec,cutoff_mask);
1125             velec            = _mm_andnot_ps(dummy_mask,velec);
1126             velecsum         = _mm_add_ps(velecsum,velec);
1127
1128             fscal            = felec;
1129
1130             fscal            = _mm_and_ps(fscal,cutoff_mask);
1131
1132             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1133
1134             /* Calculate temporary vectorial force */
1135             tx               = _mm_mul_ps(fscal,dx22);
1136             ty               = _mm_mul_ps(fscal,dy22);
1137             tz               = _mm_mul_ps(fscal,dz22);
1138
1139             /* Update vectorial force */
1140             fix2             = _mm_add_ps(fix2,tx);
1141             fiy2             = _mm_add_ps(fiy2,ty);
1142             fiz2             = _mm_add_ps(fiz2,tz);
1143
1144             fjx2             = _mm_add_ps(fjx2,tx);
1145             fjy2             = _mm_add_ps(fjy2,ty);
1146             fjz2             = _mm_add_ps(fjz2,tz);
1147
1148             }
1149
1150             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1151                                                    f+j_coord_offsetC,f+j_coord_offsetD,
1152                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1153
1154             /* Inner loop uses 359 flops */
1155         }
1156
1157         /* End of innermost loop */
1158
1159         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1160                                               f+i_coord_offset,fshift+i_shift_offset);
1161
1162         ggid                        = gid[iidx];
1163         /* Update potential energies */
1164         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1165         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1166
1167         /* Increment number of inner iterations */
1168         inneriter                  += j_index_end - j_index_start;
1169
1170         /* Outer loop uses 29 flops */
1171     }
1172
1173     /* Increment number of outer iterations */
1174     outeriter        += nri;
1175
1176     /* Update outer/inner flops */
1177
1178     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*359);
1179 }
1180 /*
1181  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sse2_single
1182  * Electrostatics interaction: ReactionField
1183  * VdW interaction:            LennardJones
1184  * Geometry:                   Water3-Water3
1185  * Calculate force/pot:        Force
1186  */
1187 void
1188 nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sse2_single
1189                     (t_nblist * gmx_restrict                nlist,
1190                      rvec * gmx_restrict                    xx,
1191                      rvec * gmx_restrict                    ff,
1192                      t_forcerec * gmx_restrict              fr,
1193                      t_mdatoms * gmx_restrict               mdatoms,
1194                      nb_kernel_data_t * gmx_restrict        kernel_data,
1195                      t_nrnb * gmx_restrict                  nrnb)
1196 {
1197     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1198      * just 0 for non-waters.
1199      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1200      * jnr indices corresponding to data put in the four positions in the SIMD register.
1201      */
1202     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1203     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1204     int              jnrA,jnrB,jnrC,jnrD;
1205     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1206     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1207     real             shX,shY,shZ,rcutoff_scalar;
1208     real             *shiftvec,*fshift,*x,*f;
1209     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1210     int              vdwioffset0;
1211     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1212     int              vdwioffset1;
1213     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1214     int              vdwioffset2;
1215     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1216     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1217     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1218     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1219     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1220     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1221     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1222     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1223     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1224     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1225     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1226     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1227     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1228     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1229     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1230     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1231     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1232     real             *charge;
1233     int              nvdwtype;
1234     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1235     int              *vdwtype;
1236     real             *vdwparam;
1237     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1238     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1239     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1240     real             rswitch_scalar,d_scalar;
1241     __m128           dummy_mask,cutoff_mask;
1242     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1243     __m128           one     = _mm_set1_ps(1.0);
1244     __m128           two     = _mm_set1_ps(2.0);
1245     x                = xx[0];
1246     f                = ff[0];
1247
1248     nri              = nlist->nri;
1249     iinr             = nlist->iinr;
1250     jindex           = nlist->jindex;
1251     jjnr             = nlist->jjnr;
1252     shiftidx         = nlist->shift;
1253     gid              = nlist->gid;
1254     shiftvec         = fr->shift_vec[0];
1255     fshift           = fr->fshift[0];
1256     facel            = _mm_set1_ps(fr->epsfac);
1257     charge           = mdatoms->chargeA;
1258     krf              = _mm_set1_ps(fr->ic->k_rf);
1259     krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
1260     crf              = _mm_set1_ps(fr->ic->c_rf);
1261     nvdwtype         = fr->ntype;
1262     vdwparam         = fr->nbfp;
1263     vdwtype          = mdatoms->typeA;
1264
1265     /* Setup water-specific parameters */
1266     inr              = nlist->iinr[0];
1267     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1268     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1269     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1270     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1271
1272     jq0              = _mm_set1_ps(charge[inr+0]);
1273     jq1              = _mm_set1_ps(charge[inr+1]);
1274     jq2              = _mm_set1_ps(charge[inr+2]);
1275     vdwjidx0A        = 2*vdwtype[inr+0];
1276     qq00             = _mm_mul_ps(iq0,jq0);
1277     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1278     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1279     qq01             = _mm_mul_ps(iq0,jq1);
1280     qq02             = _mm_mul_ps(iq0,jq2);
1281     qq10             = _mm_mul_ps(iq1,jq0);
1282     qq11             = _mm_mul_ps(iq1,jq1);
1283     qq12             = _mm_mul_ps(iq1,jq2);
1284     qq20             = _mm_mul_ps(iq2,jq0);
1285     qq21             = _mm_mul_ps(iq2,jq1);
1286     qq22             = _mm_mul_ps(iq2,jq2);
1287
1288     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1289     rcutoff_scalar   = fr->rcoulomb;
1290     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1291     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1292
1293     rswitch_scalar   = fr->rvdw_switch;
1294     rswitch          = _mm_set1_ps(rswitch_scalar);
1295     /* Setup switch parameters */
1296     d_scalar         = rcutoff_scalar-rswitch_scalar;
1297     d                = _mm_set1_ps(d_scalar);
1298     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1299     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1300     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1301     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1302     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1303     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1304
1305     /* Avoid stupid compiler warnings */
1306     jnrA = jnrB = jnrC = jnrD = 0;
1307     j_coord_offsetA = 0;
1308     j_coord_offsetB = 0;
1309     j_coord_offsetC = 0;
1310     j_coord_offsetD = 0;
1311
1312     outeriter        = 0;
1313     inneriter        = 0;
1314
1315     /* Start outer loop over neighborlists */
1316     for(iidx=0; iidx<nri; iidx++)
1317     {
1318         /* Load shift vector for this list */
1319         i_shift_offset   = DIM*shiftidx[iidx];
1320         shX              = shiftvec[i_shift_offset+XX];
1321         shY              = shiftvec[i_shift_offset+YY];
1322         shZ              = shiftvec[i_shift_offset+ZZ];
1323
1324         /* Load limits for loop over neighbors */
1325         j_index_start    = jindex[iidx];
1326         j_index_end      = jindex[iidx+1];
1327
1328         /* Get outer coordinate index */
1329         inr              = iinr[iidx];
1330         i_coord_offset   = DIM*inr;
1331
1332         /* Load i particle coords and add shift vector */
1333         ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
1334         iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
1335         iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
1336         ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
1337         iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
1338         iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
1339         ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
1340         iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
1341         iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
1342
1343         fix0             = _mm_setzero_ps();
1344         fiy0             = _mm_setzero_ps();
1345         fiz0             = _mm_setzero_ps();
1346         fix1             = _mm_setzero_ps();
1347         fiy1             = _mm_setzero_ps();
1348         fiz1             = _mm_setzero_ps();
1349         fix2             = _mm_setzero_ps();
1350         fiy2             = _mm_setzero_ps();
1351         fiz2             = _mm_setzero_ps();
1352
1353         /* Start inner kernel loop */
1354         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1355         {
1356
1357             /* Get j neighbor index, and coordinate index */
1358             jnrA             = jjnr[jidx];
1359             jnrB             = jjnr[jidx+1];
1360             jnrC             = jjnr[jidx+2];
1361             jnrD             = jjnr[jidx+3];
1362
1363             j_coord_offsetA  = DIM*jnrA;
1364             j_coord_offsetB  = DIM*jnrB;
1365             j_coord_offsetC  = DIM*jnrC;
1366             j_coord_offsetD  = DIM*jnrD;
1367
1368             /* load j atom coordinates */
1369             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1370                                               x+j_coord_offsetC,x+j_coord_offsetD,
1371                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1372
1373             /* Calculate displacement vector */
1374             dx00             = _mm_sub_ps(ix0,jx0);
1375             dy00             = _mm_sub_ps(iy0,jy0);
1376             dz00             = _mm_sub_ps(iz0,jz0);
1377             dx01             = _mm_sub_ps(ix0,jx1);
1378             dy01             = _mm_sub_ps(iy0,jy1);
1379             dz01             = _mm_sub_ps(iz0,jz1);
1380             dx02             = _mm_sub_ps(ix0,jx2);
1381             dy02             = _mm_sub_ps(iy0,jy2);
1382             dz02             = _mm_sub_ps(iz0,jz2);
1383             dx10             = _mm_sub_ps(ix1,jx0);
1384             dy10             = _mm_sub_ps(iy1,jy0);
1385             dz10             = _mm_sub_ps(iz1,jz0);
1386             dx11             = _mm_sub_ps(ix1,jx1);
1387             dy11             = _mm_sub_ps(iy1,jy1);
1388             dz11             = _mm_sub_ps(iz1,jz1);
1389             dx12             = _mm_sub_ps(ix1,jx2);
1390             dy12             = _mm_sub_ps(iy1,jy2);
1391             dz12             = _mm_sub_ps(iz1,jz2);
1392             dx20             = _mm_sub_ps(ix2,jx0);
1393             dy20             = _mm_sub_ps(iy2,jy0);
1394             dz20             = _mm_sub_ps(iz2,jz0);
1395             dx21             = _mm_sub_ps(ix2,jx1);
1396             dy21             = _mm_sub_ps(iy2,jy1);
1397             dz21             = _mm_sub_ps(iz2,jz1);
1398             dx22             = _mm_sub_ps(ix2,jx2);
1399             dy22             = _mm_sub_ps(iy2,jy2);
1400             dz22             = _mm_sub_ps(iz2,jz2);
1401
1402             /* Calculate squared distance and things based on it */
1403             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1404             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1405             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1406             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1407             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1408             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1409             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1410             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1411             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1412
1413             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1414             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1415             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1416             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1417             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1418             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1419             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1420             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1421             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1422
1423             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1424             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1425             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1426             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1427             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1428             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1429             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1430             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1431             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1432
1433             fjx0             = _mm_setzero_ps();
1434             fjy0             = _mm_setzero_ps();
1435             fjz0             = _mm_setzero_ps();
1436             fjx1             = _mm_setzero_ps();
1437             fjy1             = _mm_setzero_ps();
1438             fjz1             = _mm_setzero_ps();
1439             fjx2             = _mm_setzero_ps();
1440             fjy2             = _mm_setzero_ps();
1441             fjz2             = _mm_setzero_ps();
1442
1443             /**************************
1444              * CALCULATE INTERACTIONS *
1445              **************************/
1446
1447             if (gmx_mm_any_lt(rsq00,rcutoff2))
1448             {
1449
1450             r00              = _mm_mul_ps(rsq00,rinv00);
1451
1452             /* REACTION-FIELD ELECTROSTATICS */
1453             felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
1454
1455             /* LENNARD-JONES DISPERSION/REPULSION */
1456
1457             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1458             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
1459             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1460             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1461             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1462
1463             d                = _mm_sub_ps(r00,rswitch);
1464             d                = _mm_max_ps(d,_mm_setzero_ps());
1465             d2               = _mm_mul_ps(d,d);
1466             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1467
1468             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1469
1470             /* Evaluate switch function */
1471             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1472             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1473             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1474
1475             fscal            = _mm_add_ps(felec,fvdw);
1476
1477             fscal            = _mm_and_ps(fscal,cutoff_mask);
1478
1479             /* Calculate temporary vectorial force */
1480             tx               = _mm_mul_ps(fscal,dx00);
1481             ty               = _mm_mul_ps(fscal,dy00);
1482             tz               = _mm_mul_ps(fscal,dz00);
1483
1484             /* Update vectorial force */
1485             fix0             = _mm_add_ps(fix0,tx);
1486             fiy0             = _mm_add_ps(fiy0,ty);
1487             fiz0             = _mm_add_ps(fiz0,tz);
1488
1489             fjx0             = _mm_add_ps(fjx0,tx);
1490             fjy0             = _mm_add_ps(fjy0,ty);
1491             fjz0             = _mm_add_ps(fjz0,tz);
1492
1493             }
1494
1495             /**************************
1496              * CALCULATE INTERACTIONS *
1497              **************************/
1498
1499             if (gmx_mm_any_lt(rsq01,rcutoff2))
1500             {
1501
1502             /* REACTION-FIELD ELECTROSTATICS */
1503             felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
1504
1505             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
1506
1507             fscal            = felec;
1508
1509             fscal            = _mm_and_ps(fscal,cutoff_mask);
1510
1511             /* Calculate temporary vectorial force */
1512             tx               = _mm_mul_ps(fscal,dx01);
1513             ty               = _mm_mul_ps(fscal,dy01);
1514             tz               = _mm_mul_ps(fscal,dz01);
1515
1516             /* Update vectorial force */
1517             fix0             = _mm_add_ps(fix0,tx);
1518             fiy0             = _mm_add_ps(fiy0,ty);
1519             fiz0             = _mm_add_ps(fiz0,tz);
1520
1521             fjx1             = _mm_add_ps(fjx1,tx);
1522             fjy1             = _mm_add_ps(fjy1,ty);
1523             fjz1             = _mm_add_ps(fjz1,tz);
1524
1525             }
1526
1527             /**************************
1528              * CALCULATE INTERACTIONS *
1529              **************************/
1530
1531             if (gmx_mm_any_lt(rsq02,rcutoff2))
1532             {
1533
1534             /* REACTION-FIELD ELECTROSTATICS */
1535             felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
1536
1537             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
1538
1539             fscal            = felec;
1540
1541             fscal            = _mm_and_ps(fscal,cutoff_mask);
1542
1543             /* Calculate temporary vectorial force */
1544             tx               = _mm_mul_ps(fscal,dx02);
1545             ty               = _mm_mul_ps(fscal,dy02);
1546             tz               = _mm_mul_ps(fscal,dz02);
1547
1548             /* Update vectorial force */
1549             fix0             = _mm_add_ps(fix0,tx);
1550             fiy0             = _mm_add_ps(fiy0,ty);
1551             fiz0             = _mm_add_ps(fiz0,tz);
1552
1553             fjx2             = _mm_add_ps(fjx2,tx);
1554             fjy2             = _mm_add_ps(fjy2,ty);
1555             fjz2             = _mm_add_ps(fjz2,tz);
1556
1557             }
1558
1559             /**************************
1560              * CALCULATE INTERACTIONS *
1561              **************************/
1562
1563             if (gmx_mm_any_lt(rsq10,rcutoff2))
1564             {
1565
1566             /* REACTION-FIELD ELECTROSTATICS */
1567             felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
1568
1569             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1570
1571             fscal            = felec;
1572
1573             fscal            = _mm_and_ps(fscal,cutoff_mask);
1574
1575             /* Calculate temporary vectorial force */
1576             tx               = _mm_mul_ps(fscal,dx10);
1577             ty               = _mm_mul_ps(fscal,dy10);
1578             tz               = _mm_mul_ps(fscal,dz10);
1579
1580             /* Update vectorial force */
1581             fix1             = _mm_add_ps(fix1,tx);
1582             fiy1             = _mm_add_ps(fiy1,ty);
1583             fiz1             = _mm_add_ps(fiz1,tz);
1584
1585             fjx0             = _mm_add_ps(fjx0,tx);
1586             fjy0             = _mm_add_ps(fjy0,ty);
1587             fjz0             = _mm_add_ps(fjz0,tz);
1588
1589             }
1590
1591             /**************************
1592              * CALCULATE INTERACTIONS *
1593              **************************/
1594
1595             if (gmx_mm_any_lt(rsq11,rcutoff2))
1596             {
1597
1598             /* REACTION-FIELD ELECTROSTATICS */
1599             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1600
1601             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1602
1603             fscal            = felec;
1604
1605             fscal            = _mm_and_ps(fscal,cutoff_mask);
1606
1607             /* Calculate temporary vectorial force */
1608             tx               = _mm_mul_ps(fscal,dx11);
1609             ty               = _mm_mul_ps(fscal,dy11);
1610             tz               = _mm_mul_ps(fscal,dz11);
1611
1612             /* Update vectorial force */
1613             fix1             = _mm_add_ps(fix1,tx);
1614             fiy1             = _mm_add_ps(fiy1,ty);
1615             fiz1             = _mm_add_ps(fiz1,tz);
1616
1617             fjx1             = _mm_add_ps(fjx1,tx);
1618             fjy1             = _mm_add_ps(fjy1,ty);
1619             fjz1             = _mm_add_ps(fjz1,tz);
1620
1621             }
1622
1623             /**************************
1624              * CALCULATE INTERACTIONS *
1625              **************************/
1626
1627             if (gmx_mm_any_lt(rsq12,rcutoff2))
1628             {
1629
1630             /* REACTION-FIELD ELECTROSTATICS */
1631             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1632
1633             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1634
1635             fscal            = felec;
1636
1637             fscal            = _mm_and_ps(fscal,cutoff_mask);
1638
1639             /* Calculate temporary vectorial force */
1640             tx               = _mm_mul_ps(fscal,dx12);
1641             ty               = _mm_mul_ps(fscal,dy12);
1642             tz               = _mm_mul_ps(fscal,dz12);
1643
1644             /* Update vectorial force */
1645             fix1             = _mm_add_ps(fix1,tx);
1646             fiy1             = _mm_add_ps(fiy1,ty);
1647             fiz1             = _mm_add_ps(fiz1,tz);
1648
1649             fjx2             = _mm_add_ps(fjx2,tx);
1650             fjy2             = _mm_add_ps(fjy2,ty);
1651             fjz2             = _mm_add_ps(fjz2,tz);
1652
1653             }
1654
1655             /**************************
1656              * CALCULATE INTERACTIONS *
1657              **************************/
1658
1659             if (gmx_mm_any_lt(rsq20,rcutoff2))
1660             {
1661
1662             /* REACTION-FIELD ELECTROSTATICS */
1663             felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1664
1665             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1666
1667             fscal            = felec;
1668
1669             fscal            = _mm_and_ps(fscal,cutoff_mask);
1670
1671             /* Calculate temporary vectorial force */
1672             tx               = _mm_mul_ps(fscal,dx20);
1673             ty               = _mm_mul_ps(fscal,dy20);
1674             tz               = _mm_mul_ps(fscal,dz20);
1675
1676             /* Update vectorial force */
1677             fix2             = _mm_add_ps(fix2,tx);
1678             fiy2             = _mm_add_ps(fiy2,ty);
1679             fiz2             = _mm_add_ps(fiz2,tz);
1680
1681             fjx0             = _mm_add_ps(fjx0,tx);
1682             fjy0             = _mm_add_ps(fjy0,ty);
1683             fjz0             = _mm_add_ps(fjz0,tz);
1684
1685             }
1686
1687             /**************************
1688              * CALCULATE INTERACTIONS *
1689              **************************/
1690
1691             if (gmx_mm_any_lt(rsq21,rcutoff2))
1692             {
1693
1694             /* REACTION-FIELD ELECTROSTATICS */
1695             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1696
1697             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1698
1699             fscal            = felec;
1700
1701             fscal            = _mm_and_ps(fscal,cutoff_mask);
1702
1703             /* Calculate temporary vectorial force */
1704             tx               = _mm_mul_ps(fscal,dx21);
1705             ty               = _mm_mul_ps(fscal,dy21);
1706             tz               = _mm_mul_ps(fscal,dz21);
1707
1708             /* Update vectorial force */
1709             fix2             = _mm_add_ps(fix2,tx);
1710             fiy2             = _mm_add_ps(fiy2,ty);
1711             fiz2             = _mm_add_ps(fiz2,tz);
1712
1713             fjx1             = _mm_add_ps(fjx1,tx);
1714             fjy1             = _mm_add_ps(fjy1,ty);
1715             fjz1             = _mm_add_ps(fjz1,tz);
1716
1717             }
1718
1719             /**************************
1720              * CALCULATE INTERACTIONS *
1721              **************************/
1722
1723             if (gmx_mm_any_lt(rsq22,rcutoff2))
1724             {
1725
1726             /* REACTION-FIELD ELECTROSTATICS */
1727             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1728
1729             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1730
1731             fscal            = felec;
1732
1733             fscal            = _mm_and_ps(fscal,cutoff_mask);
1734
1735             /* Calculate temporary vectorial force */
1736             tx               = _mm_mul_ps(fscal,dx22);
1737             ty               = _mm_mul_ps(fscal,dy22);
1738             tz               = _mm_mul_ps(fscal,dz22);
1739
1740             /* Update vectorial force */
1741             fix2             = _mm_add_ps(fix2,tx);
1742             fiy2             = _mm_add_ps(fiy2,ty);
1743             fiz2             = _mm_add_ps(fiz2,tz);
1744
1745             fjx2             = _mm_add_ps(fjx2,tx);
1746             fjy2             = _mm_add_ps(fjy2,ty);
1747             fjz2             = _mm_add_ps(fjz2,tz);
1748
1749             }
1750
1751             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1752                                                    f+j_coord_offsetC,f+j_coord_offsetD,
1753                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1754
1755             /* Inner loop uses 301 flops */
1756         }
1757
1758         if(jidx<j_index_end)
1759         {
1760
1761             /* Get j neighbor index, and coordinate index */
1762             jnrA             = jjnr[jidx];
1763             jnrB             = jjnr[jidx+1];
1764             jnrC             = jjnr[jidx+2];
1765             jnrD             = jjnr[jidx+3];
1766
1767             /* Sign of each element will be negative for non-real atoms.
1768              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1769              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1770              */
1771             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1772             jnrA       = (jnrA>=0) ? jnrA : 0;
1773             jnrB       = (jnrB>=0) ? jnrB : 0;
1774             jnrC       = (jnrC>=0) ? jnrC : 0;
1775             jnrD       = (jnrD>=0) ? jnrD : 0;
1776
1777             j_coord_offsetA  = DIM*jnrA;
1778             j_coord_offsetB  = DIM*jnrB;
1779             j_coord_offsetC  = DIM*jnrC;
1780             j_coord_offsetD  = DIM*jnrD;
1781
1782             /* load j atom coordinates */
1783             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1784                                               x+j_coord_offsetC,x+j_coord_offsetD,
1785                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1786
1787             /* Calculate displacement vector */
1788             dx00             = _mm_sub_ps(ix0,jx0);
1789             dy00             = _mm_sub_ps(iy0,jy0);
1790             dz00             = _mm_sub_ps(iz0,jz0);
1791             dx01             = _mm_sub_ps(ix0,jx1);
1792             dy01             = _mm_sub_ps(iy0,jy1);
1793             dz01             = _mm_sub_ps(iz0,jz1);
1794             dx02             = _mm_sub_ps(ix0,jx2);
1795             dy02             = _mm_sub_ps(iy0,jy2);
1796             dz02             = _mm_sub_ps(iz0,jz2);
1797             dx10             = _mm_sub_ps(ix1,jx0);
1798             dy10             = _mm_sub_ps(iy1,jy0);
1799             dz10             = _mm_sub_ps(iz1,jz0);
1800             dx11             = _mm_sub_ps(ix1,jx1);
1801             dy11             = _mm_sub_ps(iy1,jy1);
1802             dz11             = _mm_sub_ps(iz1,jz1);
1803             dx12             = _mm_sub_ps(ix1,jx2);
1804             dy12             = _mm_sub_ps(iy1,jy2);
1805             dz12             = _mm_sub_ps(iz1,jz2);
1806             dx20             = _mm_sub_ps(ix2,jx0);
1807             dy20             = _mm_sub_ps(iy2,jy0);
1808             dz20             = _mm_sub_ps(iz2,jz0);
1809             dx21             = _mm_sub_ps(ix2,jx1);
1810             dy21             = _mm_sub_ps(iy2,jy1);
1811             dz21             = _mm_sub_ps(iz2,jz1);
1812             dx22             = _mm_sub_ps(ix2,jx2);
1813             dy22             = _mm_sub_ps(iy2,jy2);
1814             dz22             = _mm_sub_ps(iz2,jz2);
1815
1816             /* Calculate squared distance and things based on it */
1817             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1818             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1819             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1820             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1821             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1822             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1823             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1824             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1825             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1826
1827             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1828             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1829             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1830             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1831             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1832             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1833             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1834             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1835             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1836
1837             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1838             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1839             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1840             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1841             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1842             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1843             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1844             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1845             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1846
1847             fjx0             = _mm_setzero_ps();
1848             fjy0             = _mm_setzero_ps();
1849             fjz0             = _mm_setzero_ps();
1850             fjx1             = _mm_setzero_ps();
1851             fjy1             = _mm_setzero_ps();
1852             fjz1             = _mm_setzero_ps();
1853             fjx2             = _mm_setzero_ps();
1854             fjy2             = _mm_setzero_ps();
1855             fjz2             = _mm_setzero_ps();
1856
1857             /**************************
1858              * CALCULATE INTERACTIONS *
1859              **************************/
1860
1861             if (gmx_mm_any_lt(rsq00,rcutoff2))
1862             {
1863
1864             r00              = _mm_mul_ps(rsq00,rinv00);
1865             r00              = _mm_andnot_ps(dummy_mask,r00);
1866
1867             /* REACTION-FIELD ELECTROSTATICS */
1868             felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
1869
1870             /* LENNARD-JONES DISPERSION/REPULSION */
1871
1872             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1873             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
1874             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1875             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1876             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1877
1878             d                = _mm_sub_ps(r00,rswitch);
1879             d                = _mm_max_ps(d,_mm_setzero_ps());
1880             d2               = _mm_mul_ps(d,d);
1881             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1882
1883             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1884
1885             /* Evaluate switch function */
1886             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1887             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1888             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1889
1890             fscal            = _mm_add_ps(felec,fvdw);
1891
1892             fscal            = _mm_and_ps(fscal,cutoff_mask);
1893
1894             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1895
1896             /* Calculate temporary vectorial force */
1897             tx               = _mm_mul_ps(fscal,dx00);
1898             ty               = _mm_mul_ps(fscal,dy00);
1899             tz               = _mm_mul_ps(fscal,dz00);
1900
1901             /* Update vectorial force */
1902             fix0             = _mm_add_ps(fix0,tx);
1903             fiy0             = _mm_add_ps(fiy0,ty);
1904             fiz0             = _mm_add_ps(fiz0,tz);
1905
1906             fjx0             = _mm_add_ps(fjx0,tx);
1907             fjy0             = _mm_add_ps(fjy0,ty);
1908             fjz0             = _mm_add_ps(fjz0,tz);
1909
1910             }
1911
1912             /**************************
1913              * CALCULATE INTERACTIONS *
1914              **************************/
1915
1916             if (gmx_mm_any_lt(rsq01,rcutoff2))
1917             {
1918
1919             /* REACTION-FIELD ELECTROSTATICS */
1920             felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
1921
1922             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
1923
1924             fscal            = felec;
1925
1926             fscal            = _mm_and_ps(fscal,cutoff_mask);
1927
1928             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1929
1930             /* Calculate temporary vectorial force */
1931             tx               = _mm_mul_ps(fscal,dx01);
1932             ty               = _mm_mul_ps(fscal,dy01);
1933             tz               = _mm_mul_ps(fscal,dz01);
1934
1935             /* Update vectorial force */
1936             fix0             = _mm_add_ps(fix0,tx);
1937             fiy0             = _mm_add_ps(fiy0,ty);
1938             fiz0             = _mm_add_ps(fiz0,tz);
1939
1940             fjx1             = _mm_add_ps(fjx1,tx);
1941             fjy1             = _mm_add_ps(fjy1,ty);
1942             fjz1             = _mm_add_ps(fjz1,tz);
1943
1944             }
1945
1946             /**************************
1947              * CALCULATE INTERACTIONS *
1948              **************************/
1949
1950             if (gmx_mm_any_lt(rsq02,rcutoff2))
1951             {
1952
1953             /* REACTION-FIELD ELECTROSTATICS */
1954             felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
1955
1956             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
1957
1958             fscal            = felec;
1959
1960             fscal            = _mm_and_ps(fscal,cutoff_mask);
1961
1962             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1963
1964             /* Calculate temporary vectorial force */
1965             tx               = _mm_mul_ps(fscal,dx02);
1966             ty               = _mm_mul_ps(fscal,dy02);
1967             tz               = _mm_mul_ps(fscal,dz02);
1968
1969             /* Update vectorial force */
1970             fix0             = _mm_add_ps(fix0,tx);
1971             fiy0             = _mm_add_ps(fiy0,ty);
1972             fiz0             = _mm_add_ps(fiz0,tz);
1973
1974             fjx2             = _mm_add_ps(fjx2,tx);
1975             fjy2             = _mm_add_ps(fjy2,ty);
1976             fjz2             = _mm_add_ps(fjz2,tz);
1977
1978             }
1979
1980             /**************************
1981              * CALCULATE INTERACTIONS *
1982              **************************/
1983
1984             if (gmx_mm_any_lt(rsq10,rcutoff2))
1985             {
1986
1987             /* REACTION-FIELD ELECTROSTATICS */
1988             felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
1989
1990             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1991
1992             fscal            = felec;
1993
1994             fscal            = _mm_and_ps(fscal,cutoff_mask);
1995
1996             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1997
1998             /* Calculate temporary vectorial force */
1999             tx               = _mm_mul_ps(fscal,dx10);
2000             ty               = _mm_mul_ps(fscal,dy10);
2001             tz               = _mm_mul_ps(fscal,dz10);
2002
2003             /* Update vectorial force */
2004             fix1             = _mm_add_ps(fix1,tx);
2005             fiy1             = _mm_add_ps(fiy1,ty);
2006             fiz1             = _mm_add_ps(fiz1,tz);
2007
2008             fjx0             = _mm_add_ps(fjx0,tx);
2009             fjy0             = _mm_add_ps(fjy0,ty);
2010             fjz0             = _mm_add_ps(fjz0,tz);
2011
2012             }
2013
2014             /**************************
2015              * CALCULATE INTERACTIONS *
2016              **************************/
2017
2018             if (gmx_mm_any_lt(rsq11,rcutoff2))
2019             {
2020
2021             /* REACTION-FIELD ELECTROSTATICS */
2022             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
2023
2024             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2025
2026             fscal            = felec;
2027
2028             fscal            = _mm_and_ps(fscal,cutoff_mask);
2029
2030             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2031
2032             /* Calculate temporary vectorial force */
2033             tx               = _mm_mul_ps(fscal,dx11);
2034             ty               = _mm_mul_ps(fscal,dy11);
2035             tz               = _mm_mul_ps(fscal,dz11);
2036
2037             /* Update vectorial force */
2038             fix1             = _mm_add_ps(fix1,tx);
2039             fiy1             = _mm_add_ps(fiy1,ty);
2040             fiz1             = _mm_add_ps(fiz1,tz);
2041
2042             fjx1             = _mm_add_ps(fjx1,tx);
2043             fjy1             = _mm_add_ps(fjy1,ty);
2044             fjz1             = _mm_add_ps(fjz1,tz);
2045
2046             }
2047
2048             /**************************
2049              * CALCULATE INTERACTIONS *
2050              **************************/
2051
2052             if (gmx_mm_any_lt(rsq12,rcutoff2))
2053             {
2054
2055             /* REACTION-FIELD ELECTROSTATICS */
2056             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
2057
2058             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2059
2060             fscal            = felec;
2061
2062             fscal            = _mm_and_ps(fscal,cutoff_mask);
2063
2064             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2065
2066             /* Calculate temporary vectorial force */
2067             tx               = _mm_mul_ps(fscal,dx12);
2068             ty               = _mm_mul_ps(fscal,dy12);
2069             tz               = _mm_mul_ps(fscal,dz12);
2070
2071             /* Update vectorial force */
2072             fix1             = _mm_add_ps(fix1,tx);
2073             fiy1             = _mm_add_ps(fiy1,ty);
2074             fiz1             = _mm_add_ps(fiz1,tz);
2075
2076             fjx2             = _mm_add_ps(fjx2,tx);
2077             fjy2             = _mm_add_ps(fjy2,ty);
2078             fjz2             = _mm_add_ps(fjz2,tz);
2079
2080             }
2081
2082             /**************************
2083              * CALCULATE INTERACTIONS *
2084              **************************/
2085
2086             if (gmx_mm_any_lt(rsq20,rcutoff2))
2087             {
2088
2089             /* REACTION-FIELD ELECTROSTATICS */
2090             felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
2091
2092             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
2093
2094             fscal            = felec;
2095
2096             fscal            = _mm_and_ps(fscal,cutoff_mask);
2097
2098             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2099
2100             /* Calculate temporary vectorial force */
2101             tx               = _mm_mul_ps(fscal,dx20);
2102             ty               = _mm_mul_ps(fscal,dy20);
2103             tz               = _mm_mul_ps(fscal,dz20);
2104
2105             /* Update vectorial force */
2106             fix2             = _mm_add_ps(fix2,tx);
2107             fiy2             = _mm_add_ps(fiy2,ty);
2108             fiz2             = _mm_add_ps(fiz2,tz);
2109
2110             fjx0             = _mm_add_ps(fjx0,tx);
2111             fjy0             = _mm_add_ps(fjy0,ty);
2112             fjz0             = _mm_add_ps(fjz0,tz);
2113
2114             }
2115
2116             /**************************
2117              * CALCULATE INTERACTIONS *
2118              **************************/
2119
2120             if (gmx_mm_any_lt(rsq21,rcutoff2))
2121             {
2122
2123             /* REACTION-FIELD ELECTROSTATICS */
2124             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
2125
2126             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2127
2128             fscal            = felec;
2129
2130             fscal            = _mm_and_ps(fscal,cutoff_mask);
2131
2132             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2133
2134             /* Calculate temporary vectorial force */
2135             tx               = _mm_mul_ps(fscal,dx21);
2136             ty               = _mm_mul_ps(fscal,dy21);
2137             tz               = _mm_mul_ps(fscal,dz21);
2138
2139             /* Update vectorial force */
2140             fix2             = _mm_add_ps(fix2,tx);
2141             fiy2             = _mm_add_ps(fiy2,ty);
2142             fiz2             = _mm_add_ps(fiz2,tz);
2143
2144             fjx1             = _mm_add_ps(fjx1,tx);
2145             fjy1             = _mm_add_ps(fjy1,ty);
2146             fjz1             = _mm_add_ps(fjz1,tz);
2147
2148             }
2149
2150             /**************************
2151              * CALCULATE INTERACTIONS *
2152              **************************/
2153
2154             if (gmx_mm_any_lt(rsq22,rcutoff2))
2155             {
2156
2157             /* REACTION-FIELD ELECTROSTATICS */
2158             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
2159
2160             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2161
2162             fscal            = felec;
2163
2164             fscal            = _mm_and_ps(fscal,cutoff_mask);
2165
2166             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2167
2168             /* Calculate temporary vectorial force */
2169             tx               = _mm_mul_ps(fscal,dx22);
2170             ty               = _mm_mul_ps(fscal,dy22);
2171             tz               = _mm_mul_ps(fscal,dz22);
2172
2173             /* Update vectorial force */
2174             fix2             = _mm_add_ps(fix2,tx);
2175             fiy2             = _mm_add_ps(fiy2,ty);
2176             fiz2             = _mm_add_ps(fiz2,tz);
2177
2178             fjx2             = _mm_add_ps(fjx2,tx);
2179             fjy2             = _mm_add_ps(fjy2,ty);
2180             fjz2             = _mm_add_ps(fjz2,tz);
2181
2182             }
2183
2184             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
2185                                                    f+j_coord_offsetC,f+j_coord_offsetD,
2186                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2187
2188             /* Inner loop uses 302 flops */
2189         }
2190
2191         /* End of innermost loop */
2192
2193         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2194                                               f+i_coord_offset,fshift+i_shift_offset);
2195
2196         /* Increment number of inner iterations */
2197         inneriter                  += j_index_end - j_index_start;
2198
2199         /* Outer loop uses 27 flops */
2200     }
2201
2202     /* Increment number of outer iterations */
2203     outeriter        += nri;
2204
2205     /* Update outer/inner flops */
2206
2207     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*302);
2208 }