Rename remaining GMX_ACCELERATION to GMX_CPU_ACCELERATION
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_sse2_single.c
1 /*
2  * Note: this file was generated by the Gromacs sse2_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sse2_single
38  * Electrostatics interaction: ReactionField
39  * VdW interaction:            LennardJones
40  * Geometry:                   Water3-Water3
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sse2_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
62     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
63     real             shX,shY,shZ,rcutoff_scalar;
64     real             *shiftvec,*fshift,*x,*f;
65     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
66     int              vdwioffset0;
67     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
68     int              vdwioffset1;
69     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
70     int              vdwioffset2;
71     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
73     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
75     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
76     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
77     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
78     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
79     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
80     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
81     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
83     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
84     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
85     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
86     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
87     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
88     real             *charge;
89     int              nvdwtype;
90     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
91     int              *vdwtype;
92     real             *vdwparam;
93     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
94     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
95     __m128           dummy_mask,cutoff_mask;
96     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
97     __m128           one     = _mm_set1_ps(1.0);
98     __m128           two     = _mm_set1_ps(2.0);
99     x                = xx[0];
100     f                = ff[0];
101
102     nri              = nlist->nri;
103     iinr             = nlist->iinr;
104     jindex           = nlist->jindex;
105     jjnr             = nlist->jjnr;
106     shiftidx         = nlist->shift;
107     gid              = nlist->gid;
108     shiftvec         = fr->shift_vec[0];
109     fshift           = fr->fshift[0];
110     facel            = _mm_set1_ps(fr->epsfac);
111     charge           = mdatoms->chargeA;
112     krf              = _mm_set1_ps(fr->ic->k_rf);
113     krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
114     crf              = _mm_set1_ps(fr->ic->c_rf);
115     nvdwtype         = fr->ntype;
116     vdwparam         = fr->nbfp;
117     vdwtype          = mdatoms->typeA;
118
119     /* Setup water-specific parameters */
120     inr              = nlist->iinr[0];
121     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
122     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
123     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
124     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
125
126     jq0              = _mm_set1_ps(charge[inr+0]);
127     jq1              = _mm_set1_ps(charge[inr+1]);
128     jq2              = _mm_set1_ps(charge[inr+2]);
129     vdwjidx0A        = 2*vdwtype[inr+0];
130     qq00             = _mm_mul_ps(iq0,jq0);
131     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
132     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
133     qq01             = _mm_mul_ps(iq0,jq1);
134     qq02             = _mm_mul_ps(iq0,jq2);
135     qq10             = _mm_mul_ps(iq1,jq0);
136     qq11             = _mm_mul_ps(iq1,jq1);
137     qq12             = _mm_mul_ps(iq1,jq2);
138     qq20             = _mm_mul_ps(iq2,jq0);
139     qq21             = _mm_mul_ps(iq2,jq1);
140     qq22             = _mm_mul_ps(iq2,jq2);
141
142     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
143     rcutoff_scalar   = fr->rcoulomb;
144     rcutoff          = _mm_set1_ps(rcutoff_scalar);
145     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
146
147     sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
148     rvdw             = _mm_set1_ps(fr->rvdw);
149
150     /* Avoid stupid compiler warnings */
151     jnrA = jnrB = jnrC = jnrD = 0;
152     j_coord_offsetA = 0;
153     j_coord_offsetB = 0;
154     j_coord_offsetC = 0;
155     j_coord_offsetD = 0;
156
157     outeriter        = 0;
158     inneriter        = 0;
159
160     /* Start outer loop over neighborlists */
161     for(iidx=0; iidx<nri; iidx++)
162     {
163         /* Load shift vector for this list */
164         i_shift_offset   = DIM*shiftidx[iidx];
165         shX              = shiftvec[i_shift_offset+XX];
166         shY              = shiftvec[i_shift_offset+YY];
167         shZ              = shiftvec[i_shift_offset+ZZ];
168
169         /* Load limits for loop over neighbors */
170         j_index_start    = jindex[iidx];
171         j_index_end      = jindex[iidx+1];
172
173         /* Get outer coordinate index */
174         inr              = iinr[iidx];
175         i_coord_offset   = DIM*inr;
176
177         /* Load i particle coords and add shift vector */
178         ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
179         iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
180         iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
181         ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
182         iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
183         iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
184         ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
185         iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
186         iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
187
188         fix0             = _mm_setzero_ps();
189         fiy0             = _mm_setzero_ps();
190         fiz0             = _mm_setzero_ps();
191         fix1             = _mm_setzero_ps();
192         fiy1             = _mm_setzero_ps();
193         fiz1             = _mm_setzero_ps();
194         fix2             = _mm_setzero_ps();
195         fiy2             = _mm_setzero_ps();
196         fiz2             = _mm_setzero_ps();
197
198         /* Reset potential sums */
199         velecsum         = _mm_setzero_ps();
200         vvdwsum          = _mm_setzero_ps();
201
202         /* Start inner kernel loop */
203         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
204         {
205
206             /* Get j neighbor index, and coordinate index */
207             jnrA             = jjnr[jidx];
208             jnrB             = jjnr[jidx+1];
209             jnrC             = jjnr[jidx+2];
210             jnrD             = jjnr[jidx+3];
211
212             j_coord_offsetA  = DIM*jnrA;
213             j_coord_offsetB  = DIM*jnrB;
214             j_coord_offsetC  = DIM*jnrC;
215             j_coord_offsetD  = DIM*jnrD;
216
217             /* load j atom coordinates */
218             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
219                                               x+j_coord_offsetC,x+j_coord_offsetD,
220                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
221
222             /* Calculate displacement vector */
223             dx00             = _mm_sub_ps(ix0,jx0);
224             dy00             = _mm_sub_ps(iy0,jy0);
225             dz00             = _mm_sub_ps(iz0,jz0);
226             dx01             = _mm_sub_ps(ix0,jx1);
227             dy01             = _mm_sub_ps(iy0,jy1);
228             dz01             = _mm_sub_ps(iz0,jz1);
229             dx02             = _mm_sub_ps(ix0,jx2);
230             dy02             = _mm_sub_ps(iy0,jy2);
231             dz02             = _mm_sub_ps(iz0,jz2);
232             dx10             = _mm_sub_ps(ix1,jx0);
233             dy10             = _mm_sub_ps(iy1,jy0);
234             dz10             = _mm_sub_ps(iz1,jz0);
235             dx11             = _mm_sub_ps(ix1,jx1);
236             dy11             = _mm_sub_ps(iy1,jy1);
237             dz11             = _mm_sub_ps(iz1,jz1);
238             dx12             = _mm_sub_ps(ix1,jx2);
239             dy12             = _mm_sub_ps(iy1,jy2);
240             dz12             = _mm_sub_ps(iz1,jz2);
241             dx20             = _mm_sub_ps(ix2,jx0);
242             dy20             = _mm_sub_ps(iy2,jy0);
243             dz20             = _mm_sub_ps(iz2,jz0);
244             dx21             = _mm_sub_ps(ix2,jx1);
245             dy21             = _mm_sub_ps(iy2,jy1);
246             dz21             = _mm_sub_ps(iz2,jz1);
247             dx22             = _mm_sub_ps(ix2,jx2);
248             dy22             = _mm_sub_ps(iy2,jy2);
249             dz22             = _mm_sub_ps(iz2,jz2);
250
251             /* Calculate squared distance and things based on it */
252             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
253             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
254             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
255             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
256             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
257             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
258             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
259             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
260             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
261
262             rinv00           = gmx_mm_invsqrt_ps(rsq00);
263             rinv01           = gmx_mm_invsqrt_ps(rsq01);
264             rinv02           = gmx_mm_invsqrt_ps(rsq02);
265             rinv10           = gmx_mm_invsqrt_ps(rsq10);
266             rinv11           = gmx_mm_invsqrt_ps(rsq11);
267             rinv12           = gmx_mm_invsqrt_ps(rsq12);
268             rinv20           = gmx_mm_invsqrt_ps(rsq20);
269             rinv21           = gmx_mm_invsqrt_ps(rsq21);
270             rinv22           = gmx_mm_invsqrt_ps(rsq22);
271
272             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
273             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
274             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
275             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
276             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
277             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
278             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
279             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
280             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
281
282             fjx0             = _mm_setzero_ps();
283             fjy0             = _mm_setzero_ps();
284             fjz0             = _mm_setzero_ps();
285             fjx1             = _mm_setzero_ps();
286             fjy1             = _mm_setzero_ps();
287             fjz1             = _mm_setzero_ps();
288             fjx2             = _mm_setzero_ps();
289             fjy2             = _mm_setzero_ps();
290             fjz2             = _mm_setzero_ps();
291
292             /**************************
293              * CALCULATE INTERACTIONS *
294              **************************/
295
296             if (gmx_mm_any_lt(rsq00,rcutoff2))
297             {
298
299             /* REACTION-FIELD ELECTROSTATICS */
300             velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
301             felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
302
303             /* LENNARD-JONES DISPERSION/REPULSION */
304
305             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
306             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
307             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
308             vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
309                                           _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
310             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
311
312             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
313
314             /* Update potential sum for this i atom from the interaction with this j atom. */
315             velec            = _mm_and_ps(velec,cutoff_mask);
316             velecsum         = _mm_add_ps(velecsum,velec);
317             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
318             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
319
320             fscal            = _mm_add_ps(felec,fvdw);
321
322             fscal            = _mm_and_ps(fscal,cutoff_mask);
323
324             /* Calculate temporary vectorial force */
325             tx               = _mm_mul_ps(fscal,dx00);
326             ty               = _mm_mul_ps(fscal,dy00);
327             tz               = _mm_mul_ps(fscal,dz00);
328
329             /* Update vectorial force */
330             fix0             = _mm_add_ps(fix0,tx);
331             fiy0             = _mm_add_ps(fiy0,ty);
332             fiz0             = _mm_add_ps(fiz0,tz);
333
334             fjx0             = _mm_add_ps(fjx0,tx);
335             fjy0             = _mm_add_ps(fjy0,ty);
336             fjz0             = _mm_add_ps(fjz0,tz);
337
338             }
339
340             /**************************
341              * CALCULATE INTERACTIONS *
342              **************************/
343
344             if (gmx_mm_any_lt(rsq01,rcutoff2))
345             {
346
347             /* REACTION-FIELD ELECTROSTATICS */
348             velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
349             felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
350
351             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
352
353             /* Update potential sum for this i atom from the interaction with this j atom. */
354             velec            = _mm_and_ps(velec,cutoff_mask);
355             velecsum         = _mm_add_ps(velecsum,velec);
356
357             fscal            = felec;
358
359             fscal            = _mm_and_ps(fscal,cutoff_mask);
360
361             /* Calculate temporary vectorial force */
362             tx               = _mm_mul_ps(fscal,dx01);
363             ty               = _mm_mul_ps(fscal,dy01);
364             tz               = _mm_mul_ps(fscal,dz01);
365
366             /* Update vectorial force */
367             fix0             = _mm_add_ps(fix0,tx);
368             fiy0             = _mm_add_ps(fiy0,ty);
369             fiz0             = _mm_add_ps(fiz0,tz);
370
371             fjx1             = _mm_add_ps(fjx1,tx);
372             fjy1             = _mm_add_ps(fjy1,ty);
373             fjz1             = _mm_add_ps(fjz1,tz);
374
375             }
376
377             /**************************
378              * CALCULATE INTERACTIONS *
379              **************************/
380
381             if (gmx_mm_any_lt(rsq02,rcutoff2))
382             {
383
384             /* REACTION-FIELD ELECTROSTATICS */
385             velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
386             felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
387
388             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
389
390             /* Update potential sum for this i atom from the interaction with this j atom. */
391             velec            = _mm_and_ps(velec,cutoff_mask);
392             velecsum         = _mm_add_ps(velecsum,velec);
393
394             fscal            = felec;
395
396             fscal            = _mm_and_ps(fscal,cutoff_mask);
397
398             /* Calculate temporary vectorial force */
399             tx               = _mm_mul_ps(fscal,dx02);
400             ty               = _mm_mul_ps(fscal,dy02);
401             tz               = _mm_mul_ps(fscal,dz02);
402
403             /* Update vectorial force */
404             fix0             = _mm_add_ps(fix0,tx);
405             fiy0             = _mm_add_ps(fiy0,ty);
406             fiz0             = _mm_add_ps(fiz0,tz);
407
408             fjx2             = _mm_add_ps(fjx2,tx);
409             fjy2             = _mm_add_ps(fjy2,ty);
410             fjz2             = _mm_add_ps(fjz2,tz);
411
412             }
413
414             /**************************
415              * CALCULATE INTERACTIONS *
416              **************************/
417
418             if (gmx_mm_any_lt(rsq10,rcutoff2))
419             {
420
421             /* REACTION-FIELD ELECTROSTATICS */
422             velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
423             felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
424
425             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
426
427             /* Update potential sum for this i atom from the interaction with this j atom. */
428             velec            = _mm_and_ps(velec,cutoff_mask);
429             velecsum         = _mm_add_ps(velecsum,velec);
430
431             fscal            = felec;
432
433             fscal            = _mm_and_ps(fscal,cutoff_mask);
434
435             /* Calculate temporary vectorial force */
436             tx               = _mm_mul_ps(fscal,dx10);
437             ty               = _mm_mul_ps(fscal,dy10);
438             tz               = _mm_mul_ps(fscal,dz10);
439
440             /* Update vectorial force */
441             fix1             = _mm_add_ps(fix1,tx);
442             fiy1             = _mm_add_ps(fiy1,ty);
443             fiz1             = _mm_add_ps(fiz1,tz);
444
445             fjx0             = _mm_add_ps(fjx0,tx);
446             fjy0             = _mm_add_ps(fjy0,ty);
447             fjz0             = _mm_add_ps(fjz0,tz);
448
449             }
450
451             /**************************
452              * CALCULATE INTERACTIONS *
453              **************************/
454
455             if (gmx_mm_any_lt(rsq11,rcutoff2))
456             {
457
458             /* REACTION-FIELD ELECTROSTATICS */
459             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
460             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
461
462             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
463
464             /* Update potential sum for this i atom from the interaction with this j atom. */
465             velec            = _mm_and_ps(velec,cutoff_mask);
466             velecsum         = _mm_add_ps(velecsum,velec);
467
468             fscal            = felec;
469
470             fscal            = _mm_and_ps(fscal,cutoff_mask);
471
472             /* Calculate temporary vectorial force */
473             tx               = _mm_mul_ps(fscal,dx11);
474             ty               = _mm_mul_ps(fscal,dy11);
475             tz               = _mm_mul_ps(fscal,dz11);
476
477             /* Update vectorial force */
478             fix1             = _mm_add_ps(fix1,tx);
479             fiy1             = _mm_add_ps(fiy1,ty);
480             fiz1             = _mm_add_ps(fiz1,tz);
481
482             fjx1             = _mm_add_ps(fjx1,tx);
483             fjy1             = _mm_add_ps(fjy1,ty);
484             fjz1             = _mm_add_ps(fjz1,tz);
485
486             }
487
488             /**************************
489              * CALCULATE INTERACTIONS *
490              **************************/
491
492             if (gmx_mm_any_lt(rsq12,rcutoff2))
493             {
494
495             /* REACTION-FIELD ELECTROSTATICS */
496             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
497             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
498
499             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
500
501             /* Update potential sum for this i atom from the interaction with this j atom. */
502             velec            = _mm_and_ps(velec,cutoff_mask);
503             velecsum         = _mm_add_ps(velecsum,velec);
504
505             fscal            = felec;
506
507             fscal            = _mm_and_ps(fscal,cutoff_mask);
508
509             /* Calculate temporary vectorial force */
510             tx               = _mm_mul_ps(fscal,dx12);
511             ty               = _mm_mul_ps(fscal,dy12);
512             tz               = _mm_mul_ps(fscal,dz12);
513
514             /* Update vectorial force */
515             fix1             = _mm_add_ps(fix1,tx);
516             fiy1             = _mm_add_ps(fiy1,ty);
517             fiz1             = _mm_add_ps(fiz1,tz);
518
519             fjx2             = _mm_add_ps(fjx2,tx);
520             fjy2             = _mm_add_ps(fjy2,ty);
521             fjz2             = _mm_add_ps(fjz2,tz);
522
523             }
524
525             /**************************
526              * CALCULATE INTERACTIONS *
527              **************************/
528
529             if (gmx_mm_any_lt(rsq20,rcutoff2))
530             {
531
532             /* REACTION-FIELD ELECTROSTATICS */
533             velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
534             felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
535
536             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
537
538             /* Update potential sum for this i atom from the interaction with this j atom. */
539             velec            = _mm_and_ps(velec,cutoff_mask);
540             velecsum         = _mm_add_ps(velecsum,velec);
541
542             fscal            = felec;
543
544             fscal            = _mm_and_ps(fscal,cutoff_mask);
545
546             /* Calculate temporary vectorial force */
547             tx               = _mm_mul_ps(fscal,dx20);
548             ty               = _mm_mul_ps(fscal,dy20);
549             tz               = _mm_mul_ps(fscal,dz20);
550
551             /* Update vectorial force */
552             fix2             = _mm_add_ps(fix2,tx);
553             fiy2             = _mm_add_ps(fiy2,ty);
554             fiz2             = _mm_add_ps(fiz2,tz);
555
556             fjx0             = _mm_add_ps(fjx0,tx);
557             fjy0             = _mm_add_ps(fjy0,ty);
558             fjz0             = _mm_add_ps(fjz0,tz);
559
560             }
561
562             /**************************
563              * CALCULATE INTERACTIONS *
564              **************************/
565
566             if (gmx_mm_any_lt(rsq21,rcutoff2))
567             {
568
569             /* REACTION-FIELD ELECTROSTATICS */
570             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
571             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
572
573             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
574
575             /* Update potential sum for this i atom from the interaction with this j atom. */
576             velec            = _mm_and_ps(velec,cutoff_mask);
577             velecsum         = _mm_add_ps(velecsum,velec);
578
579             fscal            = felec;
580
581             fscal            = _mm_and_ps(fscal,cutoff_mask);
582
583             /* Calculate temporary vectorial force */
584             tx               = _mm_mul_ps(fscal,dx21);
585             ty               = _mm_mul_ps(fscal,dy21);
586             tz               = _mm_mul_ps(fscal,dz21);
587
588             /* Update vectorial force */
589             fix2             = _mm_add_ps(fix2,tx);
590             fiy2             = _mm_add_ps(fiy2,ty);
591             fiz2             = _mm_add_ps(fiz2,tz);
592
593             fjx1             = _mm_add_ps(fjx1,tx);
594             fjy1             = _mm_add_ps(fjy1,ty);
595             fjz1             = _mm_add_ps(fjz1,tz);
596
597             }
598
599             /**************************
600              * CALCULATE INTERACTIONS *
601              **************************/
602
603             if (gmx_mm_any_lt(rsq22,rcutoff2))
604             {
605
606             /* REACTION-FIELD ELECTROSTATICS */
607             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
608             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
609
610             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
611
612             /* Update potential sum for this i atom from the interaction with this j atom. */
613             velec            = _mm_and_ps(velec,cutoff_mask);
614             velecsum         = _mm_add_ps(velecsum,velec);
615
616             fscal            = felec;
617
618             fscal            = _mm_and_ps(fscal,cutoff_mask);
619
620             /* Calculate temporary vectorial force */
621             tx               = _mm_mul_ps(fscal,dx22);
622             ty               = _mm_mul_ps(fscal,dy22);
623             tz               = _mm_mul_ps(fscal,dz22);
624
625             /* Update vectorial force */
626             fix2             = _mm_add_ps(fix2,tx);
627             fiy2             = _mm_add_ps(fiy2,ty);
628             fiz2             = _mm_add_ps(fiz2,tz);
629
630             fjx2             = _mm_add_ps(fjx2,tx);
631             fjy2             = _mm_add_ps(fjy2,ty);
632             fjz2             = _mm_add_ps(fjz2,tz);
633
634             }
635
636             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
637                                                    f+j_coord_offsetC,f+j_coord_offsetD,
638                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
639
640             /* Inner loop uses 342 flops */
641         }
642
643         if(jidx<j_index_end)
644         {
645
646             /* Get j neighbor index, and coordinate index */
647             jnrA             = jjnr[jidx];
648             jnrB             = jjnr[jidx+1];
649             jnrC             = jjnr[jidx+2];
650             jnrD             = jjnr[jidx+3];
651
652             /* Sign of each element will be negative for non-real atoms.
653              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
654              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
655              */
656             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
657             jnrA       = (jnrA>=0) ? jnrA : 0;
658             jnrB       = (jnrB>=0) ? jnrB : 0;
659             jnrC       = (jnrC>=0) ? jnrC : 0;
660             jnrD       = (jnrD>=0) ? jnrD : 0;
661
662             j_coord_offsetA  = DIM*jnrA;
663             j_coord_offsetB  = DIM*jnrB;
664             j_coord_offsetC  = DIM*jnrC;
665             j_coord_offsetD  = DIM*jnrD;
666
667             /* load j atom coordinates */
668             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
669                                               x+j_coord_offsetC,x+j_coord_offsetD,
670                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
671
672             /* Calculate displacement vector */
673             dx00             = _mm_sub_ps(ix0,jx0);
674             dy00             = _mm_sub_ps(iy0,jy0);
675             dz00             = _mm_sub_ps(iz0,jz0);
676             dx01             = _mm_sub_ps(ix0,jx1);
677             dy01             = _mm_sub_ps(iy0,jy1);
678             dz01             = _mm_sub_ps(iz0,jz1);
679             dx02             = _mm_sub_ps(ix0,jx2);
680             dy02             = _mm_sub_ps(iy0,jy2);
681             dz02             = _mm_sub_ps(iz0,jz2);
682             dx10             = _mm_sub_ps(ix1,jx0);
683             dy10             = _mm_sub_ps(iy1,jy0);
684             dz10             = _mm_sub_ps(iz1,jz0);
685             dx11             = _mm_sub_ps(ix1,jx1);
686             dy11             = _mm_sub_ps(iy1,jy1);
687             dz11             = _mm_sub_ps(iz1,jz1);
688             dx12             = _mm_sub_ps(ix1,jx2);
689             dy12             = _mm_sub_ps(iy1,jy2);
690             dz12             = _mm_sub_ps(iz1,jz2);
691             dx20             = _mm_sub_ps(ix2,jx0);
692             dy20             = _mm_sub_ps(iy2,jy0);
693             dz20             = _mm_sub_ps(iz2,jz0);
694             dx21             = _mm_sub_ps(ix2,jx1);
695             dy21             = _mm_sub_ps(iy2,jy1);
696             dz21             = _mm_sub_ps(iz2,jz1);
697             dx22             = _mm_sub_ps(ix2,jx2);
698             dy22             = _mm_sub_ps(iy2,jy2);
699             dz22             = _mm_sub_ps(iz2,jz2);
700
701             /* Calculate squared distance and things based on it */
702             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
703             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
704             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
705             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
706             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
707             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
708             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
709             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
710             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
711
712             rinv00           = gmx_mm_invsqrt_ps(rsq00);
713             rinv01           = gmx_mm_invsqrt_ps(rsq01);
714             rinv02           = gmx_mm_invsqrt_ps(rsq02);
715             rinv10           = gmx_mm_invsqrt_ps(rsq10);
716             rinv11           = gmx_mm_invsqrt_ps(rsq11);
717             rinv12           = gmx_mm_invsqrt_ps(rsq12);
718             rinv20           = gmx_mm_invsqrt_ps(rsq20);
719             rinv21           = gmx_mm_invsqrt_ps(rsq21);
720             rinv22           = gmx_mm_invsqrt_ps(rsq22);
721
722             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
723             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
724             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
725             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
726             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
727             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
728             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
729             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
730             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
731
732             fjx0             = _mm_setzero_ps();
733             fjy0             = _mm_setzero_ps();
734             fjz0             = _mm_setzero_ps();
735             fjx1             = _mm_setzero_ps();
736             fjy1             = _mm_setzero_ps();
737             fjz1             = _mm_setzero_ps();
738             fjx2             = _mm_setzero_ps();
739             fjy2             = _mm_setzero_ps();
740             fjz2             = _mm_setzero_ps();
741
742             /**************************
743              * CALCULATE INTERACTIONS *
744              **************************/
745
746             if (gmx_mm_any_lt(rsq00,rcutoff2))
747             {
748
749             /* REACTION-FIELD ELECTROSTATICS */
750             velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
751             felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
752
753             /* LENNARD-JONES DISPERSION/REPULSION */
754
755             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
756             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
757             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
758             vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
759                                           _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
760             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
761
762             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
763
764             /* Update potential sum for this i atom from the interaction with this j atom. */
765             velec            = _mm_and_ps(velec,cutoff_mask);
766             velec            = _mm_andnot_ps(dummy_mask,velec);
767             velecsum         = _mm_add_ps(velecsum,velec);
768             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
769             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
770             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
771
772             fscal            = _mm_add_ps(felec,fvdw);
773
774             fscal            = _mm_and_ps(fscal,cutoff_mask);
775
776             fscal            = _mm_andnot_ps(dummy_mask,fscal);
777
778             /* Calculate temporary vectorial force */
779             tx               = _mm_mul_ps(fscal,dx00);
780             ty               = _mm_mul_ps(fscal,dy00);
781             tz               = _mm_mul_ps(fscal,dz00);
782
783             /* Update vectorial force */
784             fix0             = _mm_add_ps(fix0,tx);
785             fiy0             = _mm_add_ps(fiy0,ty);
786             fiz0             = _mm_add_ps(fiz0,tz);
787
788             fjx0             = _mm_add_ps(fjx0,tx);
789             fjy0             = _mm_add_ps(fjy0,ty);
790             fjz0             = _mm_add_ps(fjz0,tz);
791
792             }
793
794             /**************************
795              * CALCULATE INTERACTIONS *
796              **************************/
797
798             if (gmx_mm_any_lt(rsq01,rcutoff2))
799             {
800
801             /* REACTION-FIELD ELECTROSTATICS */
802             velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
803             felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
804
805             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
806
807             /* Update potential sum for this i atom from the interaction with this j atom. */
808             velec            = _mm_and_ps(velec,cutoff_mask);
809             velec            = _mm_andnot_ps(dummy_mask,velec);
810             velecsum         = _mm_add_ps(velecsum,velec);
811
812             fscal            = felec;
813
814             fscal            = _mm_and_ps(fscal,cutoff_mask);
815
816             fscal            = _mm_andnot_ps(dummy_mask,fscal);
817
818             /* Calculate temporary vectorial force */
819             tx               = _mm_mul_ps(fscal,dx01);
820             ty               = _mm_mul_ps(fscal,dy01);
821             tz               = _mm_mul_ps(fscal,dz01);
822
823             /* Update vectorial force */
824             fix0             = _mm_add_ps(fix0,tx);
825             fiy0             = _mm_add_ps(fiy0,ty);
826             fiz0             = _mm_add_ps(fiz0,tz);
827
828             fjx1             = _mm_add_ps(fjx1,tx);
829             fjy1             = _mm_add_ps(fjy1,ty);
830             fjz1             = _mm_add_ps(fjz1,tz);
831
832             }
833
834             /**************************
835              * CALCULATE INTERACTIONS *
836              **************************/
837
838             if (gmx_mm_any_lt(rsq02,rcutoff2))
839             {
840
841             /* REACTION-FIELD ELECTROSTATICS */
842             velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
843             felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
844
845             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
846
847             /* Update potential sum for this i atom from the interaction with this j atom. */
848             velec            = _mm_and_ps(velec,cutoff_mask);
849             velec            = _mm_andnot_ps(dummy_mask,velec);
850             velecsum         = _mm_add_ps(velecsum,velec);
851
852             fscal            = felec;
853
854             fscal            = _mm_and_ps(fscal,cutoff_mask);
855
856             fscal            = _mm_andnot_ps(dummy_mask,fscal);
857
858             /* Calculate temporary vectorial force */
859             tx               = _mm_mul_ps(fscal,dx02);
860             ty               = _mm_mul_ps(fscal,dy02);
861             tz               = _mm_mul_ps(fscal,dz02);
862
863             /* Update vectorial force */
864             fix0             = _mm_add_ps(fix0,tx);
865             fiy0             = _mm_add_ps(fiy0,ty);
866             fiz0             = _mm_add_ps(fiz0,tz);
867
868             fjx2             = _mm_add_ps(fjx2,tx);
869             fjy2             = _mm_add_ps(fjy2,ty);
870             fjz2             = _mm_add_ps(fjz2,tz);
871
872             }
873
874             /**************************
875              * CALCULATE INTERACTIONS *
876              **************************/
877
878             if (gmx_mm_any_lt(rsq10,rcutoff2))
879             {
880
881             /* REACTION-FIELD ELECTROSTATICS */
882             velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
883             felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
884
885             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
886
887             /* Update potential sum for this i atom from the interaction with this j atom. */
888             velec            = _mm_and_ps(velec,cutoff_mask);
889             velec            = _mm_andnot_ps(dummy_mask,velec);
890             velecsum         = _mm_add_ps(velecsum,velec);
891
892             fscal            = felec;
893
894             fscal            = _mm_and_ps(fscal,cutoff_mask);
895
896             fscal            = _mm_andnot_ps(dummy_mask,fscal);
897
898             /* Calculate temporary vectorial force */
899             tx               = _mm_mul_ps(fscal,dx10);
900             ty               = _mm_mul_ps(fscal,dy10);
901             tz               = _mm_mul_ps(fscal,dz10);
902
903             /* Update vectorial force */
904             fix1             = _mm_add_ps(fix1,tx);
905             fiy1             = _mm_add_ps(fiy1,ty);
906             fiz1             = _mm_add_ps(fiz1,tz);
907
908             fjx0             = _mm_add_ps(fjx0,tx);
909             fjy0             = _mm_add_ps(fjy0,ty);
910             fjz0             = _mm_add_ps(fjz0,tz);
911
912             }
913
914             /**************************
915              * CALCULATE INTERACTIONS *
916              **************************/
917
918             if (gmx_mm_any_lt(rsq11,rcutoff2))
919             {
920
921             /* REACTION-FIELD ELECTROSTATICS */
922             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
923             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
924
925             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
926
927             /* Update potential sum for this i atom from the interaction with this j atom. */
928             velec            = _mm_and_ps(velec,cutoff_mask);
929             velec            = _mm_andnot_ps(dummy_mask,velec);
930             velecsum         = _mm_add_ps(velecsum,velec);
931
932             fscal            = felec;
933
934             fscal            = _mm_and_ps(fscal,cutoff_mask);
935
936             fscal            = _mm_andnot_ps(dummy_mask,fscal);
937
938             /* Calculate temporary vectorial force */
939             tx               = _mm_mul_ps(fscal,dx11);
940             ty               = _mm_mul_ps(fscal,dy11);
941             tz               = _mm_mul_ps(fscal,dz11);
942
943             /* Update vectorial force */
944             fix1             = _mm_add_ps(fix1,tx);
945             fiy1             = _mm_add_ps(fiy1,ty);
946             fiz1             = _mm_add_ps(fiz1,tz);
947
948             fjx1             = _mm_add_ps(fjx1,tx);
949             fjy1             = _mm_add_ps(fjy1,ty);
950             fjz1             = _mm_add_ps(fjz1,tz);
951
952             }
953
954             /**************************
955              * CALCULATE INTERACTIONS *
956              **************************/
957
958             if (gmx_mm_any_lt(rsq12,rcutoff2))
959             {
960
961             /* REACTION-FIELD ELECTROSTATICS */
962             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
963             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
964
965             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
966
967             /* Update potential sum for this i atom from the interaction with this j atom. */
968             velec            = _mm_and_ps(velec,cutoff_mask);
969             velec            = _mm_andnot_ps(dummy_mask,velec);
970             velecsum         = _mm_add_ps(velecsum,velec);
971
972             fscal            = felec;
973
974             fscal            = _mm_and_ps(fscal,cutoff_mask);
975
976             fscal            = _mm_andnot_ps(dummy_mask,fscal);
977
978             /* Calculate temporary vectorial force */
979             tx               = _mm_mul_ps(fscal,dx12);
980             ty               = _mm_mul_ps(fscal,dy12);
981             tz               = _mm_mul_ps(fscal,dz12);
982
983             /* Update vectorial force */
984             fix1             = _mm_add_ps(fix1,tx);
985             fiy1             = _mm_add_ps(fiy1,ty);
986             fiz1             = _mm_add_ps(fiz1,tz);
987
988             fjx2             = _mm_add_ps(fjx2,tx);
989             fjy2             = _mm_add_ps(fjy2,ty);
990             fjz2             = _mm_add_ps(fjz2,tz);
991
992             }
993
994             /**************************
995              * CALCULATE INTERACTIONS *
996              **************************/
997
998             if (gmx_mm_any_lt(rsq20,rcutoff2))
999             {
1000
1001             /* REACTION-FIELD ELECTROSTATICS */
1002             velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
1003             felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1004
1005             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1006
1007             /* Update potential sum for this i atom from the interaction with this j atom. */
1008             velec            = _mm_and_ps(velec,cutoff_mask);
1009             velec            = _mm_andnot_ps(dummy_mask,velec);
1010             velecsum         = _mm_add_ps(velecsum,velec);
1011
1012             fscal            = felec;
1013
1014             fscal            = _mm_and_ps(fscal,cutoff_mask);
1015
1016             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1017
1018             /* Calculate temporary vectorial force */
1019             tx               = _mm_mul_ps(fscal,dx20);
1020             ty               = _mm_mul_ps(fscal,dy20);
1021             tz               = _mm_mul_ps(fscal,dz20);
1022
1023             /* Update vectorial force */
1024             fix2             = _mm_add_ps(fix2,tx);
1025             fiy2             = _mm_add_ps(fiy2,ty);
1026             fiz2             = _mm_add_ps(fiz2,tz);
1027
1028             fjx0             = _mm_add_ps(fjx0,tx);
1029             fjy0             = _mm_add_ps(fjy0,ty);
1030             fjz0             = _mm_add_ps(fjz0,tz);
1031
1032             }
1033
1034             /**************************
1035              * CALCULATE INTERACTIONS *
1036              **************************/
1037
1038             if (gmx_mm_any_lt(rsq21,rcutoff2))
1039             {
1040
1041             /* REACTION-FIELD ELECTROSTATICS */
1042             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
1043             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1044
1045             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1046
1047             /* Update potential sum for this i atom from the interaction with this j atom. */
1048             velec            = _mm_and_ps(velec,cutoff_mask);
1049             velec            = _mm_andnot_ps(dummy_mask,velec);
1050             velecsum         = _mm_add_ps(velecsum,velec);
1051
1052             fscal            = felec;
1053
1054             fscal            = _mm_and_ps(fscal,cutoff_mask);
1055
1056             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1057
1058             /* Calculate temporary vectorial force */
1059             tx               = _mm_mul_ps(fscal,dx21);
1060             ty               = _mm_mul_ps(fscal,dy21);
1061             tz               = _mm_mul_ps(fscal,dz21);
1062
1063             /* Update vectorial force */
1064             fix2             = _mm_add_ps(fix2,tx);
1065             fiy2             = _mm_add_ps(fiy2,ty);
1066             fiz2             = _mm_add_ps(fiz2,tz);
1067
1068             fjx1             = _mm_add_ps(fjx1,tx);
1069             fjy1             = _mm_add_ps(fjy1,ty);
1070             fjz1             = _mm_add_ps(fjz1,tz);
1071
1072             }
1073
1074             /**************************
1075              * CALCULATE INTERACTIONS *
1076              **************************/
1077
1078             if (gmx_mm_any_lt(rsq22,rcutoff2))
1079             {
1080
1081             /* REACTION-FIELD ELECTROSTATICS */
1082             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
1083             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1084
1085             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1086
1087             /* Update potential sum for this i atom from the interaction with this j atom. */
1088             velec            = _mm_and_ps(velec,cutoff_mask);
1089             velec            = _mm_andnot_ps(dummy_mask,velec);
1090             velecsum         = _mm_add_ps(velecsum,velec);
1091
1092             fscal            = felec;
1093
1094             fscal            = _mm_and_ps(fscal,cutoff_mask);
1095
1096             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1097
1098             /* Calculate temporary vectorial force */
1099             tx               = _mm_mul_ps(fscal,dx22);
1100             ty               = _mm_mul_ps(fscal,dy22);
1101             tz               = _mm_mul_ps(fscal,dz22);
1102
1103             /* Update vectorial force */
1104             fix2             = _mm_add_ps(fix2,tx);
1105             fiy2             = _mm_add_ps(fiy2,ty);
1106             fiz2             = _mm_add_ps(fiz2,tz);
1107
1108             fjx2             = _mm_add_ps(fjx2,tx);
1109             fjy2             = _mm_add_ps(fjy2,ty);
1110             fjz2             = _mm_add_ps(fjz2,tz);
1111
1112             }
1113
1114             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1115                                                    f+j_coord_offsetC,f+j_coord_offsetD,
1116                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1117
1118             /* Inner loop uses 342 flops */
1119         }
1120
1121         /* End of innermost loop */
1122
1123         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1124                                               f+i_coord_offset,fshift+i_shift_offset);
1125
1126         ggid                        = gid[iidx];
1127         /* Update potential energies */
1128         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1129         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1130
1131         /* Increment number of inner iterations */
1132         inneriter                  += j_index_end - j_index_start;
1133
1134         /* Outer loop uses 29 flops */
1135     }
1136
1137     /* Increment number of outer iterations */
1138     outeriter        += nri;
1139
1140     /* Update outer/inner flops */
1141
1142     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*342);
1143 }
1144 /*
1145  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sse2_single
1146  * Electrostatics interaction: ReactionField
1147  * VdW interaction:            LennardJones
1148  * Geometry:                   Water3-Water3
1149  * Calculate force/pot:        Force
1150  */
1151 void
1152 nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sse2_single
1153                     (t_nblist * gmx_restrict                nlist,
1154                      rvec * gmx_restrict                    xx,
1155                      rvec * gmx_restrict                    ff,
1156                      t_forcerec * gmx_restrict              fr,
1157                      t_mdatoms * gmx_restrict               mdatoms,
1158                      nb_kernel_data_t * gmx_restrict        kernel_data,
1159                      t_nrnb * gmx_restrict                  nrnb)
1160 {
1161     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1162      * just 0 for non-waters.
1163      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1164      * jnr indices corresponding to data put in the four positions in the SIMD register.
1165      */
1166     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1167     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1168     int              jnrA,jnrB,jnrC,jnrD;
1169     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1170     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1171     real             shX,shY,shZ,rcutoff_scalar;
1172     real             *shiftvec,*fshift,*x,*f;
1173     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1174     int              vdwioffset0;
1175     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1176     int              vdwioffset1;
1177     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1178     int              vdwioffset2;
1179     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1180     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1181     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1182     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1183     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1184     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1185     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1186     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1187     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1188     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1189     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1190     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1191     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1192     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1193     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1194     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1195     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1196     real             *charge;
1197     int              nvdwtype;
1198     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1199     int              *vdwtype;
1200     real             *vdwparam;
1201     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1202     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1203     __m128           dummy_mask,cutoff_mask;
1204     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1205     __m128           one     = _mm_set1_ps(1.0);
1206     __m128           two     = _mm_set1_ps(2.0);
1207     x                = xx[0];
1208     f                = ff[0];
1209
1210     nri              = nlist->nri;
1211     iinr             = nlist->iinr;
1212     jindex           = nlist->jindex;
1213     jjnr             = nlist->jjnr;
1214     shiftidx         = nlist->shift;
1215     gid              = nlist->gid;
1216     shiftvec         = fr->shift_vec[0];
1217     fshift           = fr->fshift[0];
1218     facel            = _mm_set1_ps(fr->epsfac);
1219     charge           = mdatoms->chargeA;
1220     krf              = _mm_set1_ps(fr->ic->k_rf);
1221     krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
1222     crf              = _mm_set1_ps(fr->ic->c_rf);
1223     nvdwtype         = fr->ntype;
1224     vdwparam         = fr->nbfp;
1225     vdwtype          = mdatoms->typeA;
1226
1227     /* Setup water-specific parameters */
1228     inr              = nlist->iinr[0];
1229     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1230     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1231     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1232     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1233
1234     jq0              = _mm_set1_ps(charge[inr+0]);
1235     jq1              = _mm_set1_ps(charge[inr+1]);
1236     jq2              = _mm_set1_ps(charge[inr+2]);
1237     vdwjidx0A        = 2*vdwtype[inr+0];
1238     qq00             = _mm_mul_ps(iq0,jq0);
1239     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1240     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1241     qq01             = _mm_mul_ps(iq0,jq1);
1242     qq02             = _mm_mul_ps(iq0,jq2);
1243     qq10             = _mm_mul_ps(iq1,jq0);
1244     qq11             = _mm_mul_ps(iq1,jq1);
1245     qq12             = _mm_mul_ps(iq1,jq2);
1246     qq20             = _mm_mul_ps(iq2,jq0);
1247     qq21             = _mm_mul_ps(iq2,jq1);
1248     qq22             = _mm_mul_ps(iq2,jq2);
1249
1250     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1251     rcutoff_scalar   = fr->rcoulomb;
1252     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1253     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1254
1255     sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
1256     rvdw             = _mm_set1_ps(fr->rvdw);
1257
1258     /* Avoid stupid compiler warnings */
1259     jnrA = jnrB = jnrC = jnrD = 0;
1260     j_coord_offsetA = 0;
1261     j_coord_offsetB = 0;
1262     j_coord_offsetC = 0;
1263     j_coord_offsetD = 0;
1264
1265     outeriter        = 0;
1266     inneriter        = 0;
1267
1268     /* Start outer loop over neighborlists */
1269     for(iidx=0; iidx<nri; iidx++)
1270     {
1271         /* Load shift vector for this list */
1272         i_shift_offset   = DIM*shiftidx[iidx];
1273         shX              = shiftvec[i_shift_offset+XX];
1274         shY              = shiftvec[i_shift_offset+YY];
1275         shZ              = shiftvec[i_shift_offset+ZZ];
1276
1277         /* Load limits for loop over neighbors */
1278         j_index_start    = jindex[iidx];
1279         j_index_end      = jindex[iidx+1];
1280
1281         /* Get outer coordinate index */
1282         inr              = iinr[iidx];
1283         i_coord_offset   = DIM*inr;
1284
1285         /* Load i particle coords and add shift vector */
1286         ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
1287         iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
1288         iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
1289         ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
1290         iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
1291         iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
1292         ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
1293         iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
1294         iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
1295
1296         fix0             = _mm_setzero_ps();
1297         fiy0             = _mm_setzero_ps();
1298         fiz0             = _mm_setzero_ps();
1299         fix1             = _mm_setzero_ps();
1300         fiy1             = _mm_setzero_ps();
1301         fiz1             = _mm_setzero_ps();
1302         fix2             = _mm_setzero_ps();
1303         fiy2             = _mm_setzero_ps();
1304         fiz2             = _mm_setzero_ps();
1305
1306         /* Start inner kernel loop */
1307         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1308         {
1309
1310             /* Get j neighbor index, and coordinate index */
1311             jnrA             = jjnr[jidx];
1312             jnrB             = jjnr[jidx+1];
1313             jnrC             = jjnr[jidx+2];
1314             jnrD             = jjnr[jidx+3];
1315
1316             j_coord_offsetA  = DIM*jnrA;
1317             j_coord_offsetB  = DIM*jnrB;
1318             j_coord_offsetC  = DIM*jnrC;
1319             j_coord_offsetD  = DIM*jnrD;
1320
1321             /* load j atom coordinates */
1322             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1323                                               x+j_coord_offsetC,x+j_coord_offsetD,
1324                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1325
1326             /* Calculate displacement vector */
1327             dx00             = _mm_sub_ps(ix0,jx0);
1328             dy00             = _mm_sub_ps(iy0,jy0);
1329             dz00             = _mm_sub_ps(iz0,jz0);
1330             dx01             = _mm_sub_ps(ix0,jx1);
1331             dy01             = _mm_sub_ps(iy0,jy1);
1332             dz01             = _mm_sub_ps(iz0,jz1);
1333             dx02             = _mm_sub_ps(ix0,jx2);
1334             dy02             = _mm_sub_ps(iy0,jy2);
1335             dz02             = _mm_sub_ps(iz0,jz2);
1336             dx10             = _mm_sub_ps(ix1,jx0);
1337             dy10             = _mm_sub_ps(iy1,jy0);
1338             dz10             = _mm_sub_ps(iz1,jz0);
1339             dx11             = _mm_sub_ps(ix1,jx1);
1340             dy11             = _mm_sub_ps(iy1,jy1);
1341             dz11             = _mm_sub_ps(iz1,jz1);
1342             dx12             = _mm_sub_ps(ix1,jx2);
1343             dy12             = _mm_sub_ps(iy1,jy2);
1344             dz12             = _mm_sub_ps(iz1,jz2);
1345             dx20             = _mm_sub_ps(ix2,jx0);
1346             dy20             = _mm_sub_ps(iy2,jy0);
1347             dz20             = _mm_sub_ps(iz2,jz0);
1348             dx21             = _mm_sub_ps(ix2,jx1);
1349             dy21             = _mm_sub_ps(iy2,jy1);
1350             dz21             = _mm_sub_ps(iz2,jz1);
1351             dx22             = _mm_sub_ps(ix2,jx2);
1352             dy22             = _mm_sub_ps(iy2,jy2);
1353             dz22             = _mm_sub_ps(iz2,jz2);
1354
1355             /* Calculate squared distance and things based on it */
1356             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1357             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1358             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1359             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1360             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1361             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1362             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1363             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1364             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1365
1366             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1367             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1368             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1369             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1370             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1371             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1372             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1373             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1374             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1375
1376             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1377             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1378             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1379             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1380             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1381             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1382             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1383             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1384             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1385
1386             fjx0             = _mm_setzero_ps();
1387             fjy0             = _mm_setzero_ps();
1388             fjz0             = _mm_setzero_ps();
1389             fjx1             = _mm_setzero_ps();
1390             fjy1             = _mm_setzero_ps();
1391             fjz1             = _mm_setzero_ps();
1392             fjx2             = _mm_setzero_ps();
1393             fjy2             = _mm_setzero_ps();
1394             fjz2             = _mm_setzero_ps();
1395
1396             /**************************
1397              * CALCULATE INTERACTIONS *
1398              **************************/
1399
1400             if (gmx_mm_any_lt(rsq00,rcutoff2))
1401             {
1402
1403             /* REACTION-FIELD ELECTROSTATICS */
1404             felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
1405
1406             /* LENNARD-JONES DISPERSION/REPULSION */
1407
1408             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1409             fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1410
1411             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1412
1413             fscal            = _mm_add_ps(felec,fvdw);
1414
1415             fscal            = _mm_and_ps(fscal,cutoff_mask);
1416
1417             /* Calculate temporary vectorial force */
1418             tx               = _mm_mul_ps(fscal,dx00);
1419             ty               = _mm_mul_ps(fscal,dy00);
1420             tz               = _mm_mul_ps(fscal,dz00);
1421
1422             /* Update vectorial force */
1423             fix0             = _mm_add_ps(fix0,tx);
1424             fiy0             = _mm_add_ps(fiy0,ty);
1425             fiz0             = _mm_add_ps(fiz0,tz);
1426
1427             fjx0             = _mm_add_ps(fjx0,tx);
1428             fjy0             = _mm_add_ps(fjy0,ty);
1429             fjz0             = _mm_add_ps(fjz0,tz);
1430
1431             }
1432
1433             /**************************
1434              * CALCULATE INTERACTIONS *
1435              **************************/
1436
1437             if (gmx_mm_any_lt(rsq01,rcutoff2))
1438             {
1439
1440             /* REACTION-FIELD ELECTROSTATICS */
1441             felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
1442
1443             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
1444
1445             fscal            = felec;
1446
1447             fscal            = _mm_and_ps(fscal,cutoff_mask);
1448
1449             /* Calculate temporary vectorial force */
1450             tx               = _mm_mul_ps(fscal,dx01);
1451             ty               = _mm_mul_ps(fscal,dy01);
1452             tz               = _mm_mul_ps(fscal,dz01);
1453
1454             /* Update vectorial force */
1455             fix0             = _mm_add_ps(fix0,tx);
1456             fiy0             = _mm_add_ps(fiy0,ty);
1457             fiz0             = _mm_add_ps(fiz0,tz);
1458
1459             fjx1             = _mm_add_ps(fjx1,tx);
1460             fjy1             = _mm_add_ps(fjy1,ty);
1461             fjz1             = _mm_add_ps(fjz1,tz);
1462
1463             }
1464
1465             /**************************
1466              * CALCULATE INTERACTIONS *
1467              **************************/
1468
1469             if (gmx_mm_any_lt(rsq02,rcutoff2))
1470             {
1471
1472             /* REACTION-FIELD ELECTROSTATICS */
1473             felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
1474
1475             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
1476
1477             fscal            = felec;
1478
1479             fscal            = _mm_and_ps(fscal,cutoff_mask);
1480
1481             /* Calculate temporary vectorial force */
1482             tx               = _mm_mul_ps(fscal,dx02);
1483             ty               = _mm_mul_ps(fscal,dy02);
1484             tz               = _mm_mul_ps(fscal,dz02);
1485
1486             /* Update vectorial force */
1487             fix0             = _mm_add_ps(fix0,tx);
1488             fiy0             = _mm_add_ps(fiy0,ty);
1489             fiz0             = _mm_add_ps(fiz0,tz);
1490
1491             fjx2             = _mm_add_ps(fjx2,tx);
1492             fjy2             = _mm_add_ps(fjy2,ty);
1493             fjz2             = _mm_add_ps(fjz2,tz);
1494
1495             }
1496
1497             /**************************
1498              * CALCULATE INTERACTIONS *
1499              **************************/
1500
1501             if (gmx_mm_any_lt(rsq10,rcutoff2))
1502             {
1503
1504             /* REACTION-FIELD ELECTROSTATICS */
1505             felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
1506
1507             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1508
1509             fscal            = felec;
1510
1511             fscal            = _mm_and_ps(fscal,cutoff_mask);
1512
1513             /* Calculate temporary vectorial force */
1514             tx               = _mm_mul_ps(fscal,dx10);
1515             ty               = _mm_mul_ps(fscal,dy10);
1516             tz               = _mm_mul_ps(fscal,dz10);
1517
1518             /* Update vectorial force */
1519             fix1             = _mm_add_ps(fix1,tx);
1520             fiy1             = _mm_add_ps(fiy1,ty);
1521             fiz1             = _mm_add_ps(fiz1,tz);
1522
1523             fjx0             = _mm_add_ps(fjx0,tx);
1524             fjy0             = _mm_add_ps(fjy0,ty);
1525             fjz0             = _mm_add_ps(fjz0,tz);
1526
1527             }
1528
1529             /**************************
1530              * CALCULATE INTERACTIONS *
1531              **************************/
1532
1533             if (gmx_mm_any_lt(rsq11,rcutoff2))
1534             {
1535
1536             /* REACTION-FIELD ELECTROSTATICS */
1537             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1538
1539             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1540
1541             fscal            = felec;
1542
1543             fscal            = _mm_and_ps(fscal,cutoff_mask);
1544
1545             /* Calculate temporary vectorial force */
1546             tx               = _mm_mul_ps(fscal,dx11);
1547             ty               = _mm_mul_ps(fscal,dy11);
1548             tz               = _mm_mul_ps(fscal,dz11);
1549
1550             /* Update vectorial force */
1551             fix1             = _mm_add_ps(fix1,tx);
1552             fiy1             = _mm_add_ps(fiy1,ty);
1553             fiz1             = _mm_add_ps(fiz1,tz);
1554
1555             fjx1             = _mm_add_ps(fjx1,tx);
1556             fjy1             = _mm_add_ps(fjy1,ty);
1557             fjz1             = _mm_add_ps(fjz1,tz);
1558
1559             }
1560
1561             /**************************
1562              * CALCULATE INTERACTIONS *
1563              **************************/
1564
1565             if (gmx_mm_any_lt(rsq12,rcutoff2))
1566             {
1567
1568             /* REACTION-FIELD ELECTROSTATICS */
1569             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1570
1571             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1572
1573             fscal            = felec;
1574
1575             fscal            = _mm_and_ps(fscal,cutoff_mask);
1576
1577             /* Calculate temporary vectorial force */
1578             tx               = _mm_mul_ps(fscal,dx12);
1579             ty               = _mm_mul_ps(fscal,dy12);
1580             tz               = _mm_mul_ps(fscal,dz12);
1581
1582             /* Update vectorial force */
1583             fix1             = _mm_add_ps(fix1,tx);
1584             fiy1             = _mm_add_ps(fiy1,ty);
1585             fiz1             = _mm_add_ps(fiz1,tz);
1586
1587             fjx2             = _mm_add_ps(fjx2,tx);
1588             fjy2             = _mm_add_ps(fjy2,ty);
1589             fjz2             = _mm_add_ps(fjz2,tz);
1590
1591             }
1592
1593             /**************************
1594              * CALCULATE INTERACTIONS *
1595              **************************/
1596
1597             if (gmx_mm_any_lt(rsq20,rcutoff2))
1598             {
1599
1600             /* REACTION-FIELD ELECTROSTATICS */
1601             felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1602
1603             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1604
1605             fscal            = felec;
1606
1607             fscal            = _mm_and_ps(fscal,cutoff_mask);
1608
1609             /* Calculate temporary vectorial force */
1610             tx               = _mm_mul_ps(fscal,dx20);
1611             ty               = _mm_mul_ps(fscal,dy20);
1612             tz               = _mm_mul_ps(fscal,dz20);
1613
1614             /* Update vectorial force */
1615             fix2             = _mm_add_ps(fix2,tx);
1616             fiy2             = _mm_add_ps(fiy2,ty);
1617             fiz2             = _mm_add_ps(fiz2,tz);
1618
1619             fjx0             = _mm_add_ps(fjx0,tx);
1620             fjy0             = _mm_add_ps(fjy0,ty);
1621             fjz0             = _mm_add_ps(fjz0,tz);
1622
1623             }
1624
1625             /**************************
1626              * CALCULATE INTERACTIONS *
1627              **************************/
1628
1629             if (gmx_mm_any_lt(rsq21,rcutoff2))
1630             {
1631
1632             /* REACTION-FIELD ELECTROSTATICS */
1633             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1634
1635             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1636
1637             fscal            = felec;
1638
1639             fscal            = _mm_and_ps(fscal,cutoff_mask);
1640
1641             /* Calculate temporary vectorial force */
1642             tx               = _mm_mul_ps(fscal,dx21);
1643             ty               = _mm_mul_ps(fscal,dy21);
1644             tz               = _mm_mul_ps(fscal,dz21);
1645
1646             /* Update vectorial force */
1647             fix2             = _mm_add_ps(fix2,tx);
1648             fiy2             = _mm_add_ps(fiy2,ty);
1649             fiz2             = _mm_add_ps(fiz2,tz);
1650
1651             fjx1             = _mm_add_ps(fjx1,tx);
1652             fjy1             = _mm_add_ps(fjy1,ty);
1653             fjz1             = _mm_add_ps(fjz1,tz);
1654
1655             }
1656
1657             /**************************
1658              * CALCULATE INTERACTIONS *
1659              **************************/
1660
1661             if (gmx_mm_any_lt(rsq22,rcutoff2))
1662             {
1663
1664             /* REACTION-FIELD ELECTROSTATICS */
1665             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1666
1667             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1668
1669             fscal            = felec;
1670
1671             fscal            = _mm_and_ps(fscal,cutoff_mask);
1672
1673             /* Calculate temporary vectorial force */
1674             tx               = _mm_mul_ps(fscal,dx22);
1675             ty               = _mm_mul_ps(fscal,dy22);
1676             tz               = _mm_mul_ps(fscal,dz22);
1677
1678             /* Update vectorial force */
1679             fix2             = _mm_add_ps(fix2,tx);
1680             fiy2             = _mm_add_ps(fiy2,ty);
1681             fiz2             = _mm_add_ps(fiz2,tz);
1682
1683             fjx2             = _mm_add_ps(fjx2,tx);
1684             fjy2             = _mm_add_ps(fjy2,ty);
1685             fjz2             = _mm_add_ps(fjz2,tz);
1686
1687             }
1688
1689             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1690                                                    f+j_coord_offsetC,f+j_coord_offsetD,
1691                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1692
1693             /* Inner loop uses 277 flops */
1694         }
1695
1696         if(jidx<j_index_end)
1697         {
1698
1699             /* Get j neighbor index, and coordinate index */
1700             jnrA             = jjnr[jidx];
1701             jnrB             = jjnr[jidx+1];
1702             jnrC             = jjnr[jidx+2];
1703             jnrD             = jjnr[jidx+3];
1704
1705             /* Sign of each element will be negative for non-real atoms.
1706              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1707              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1708              */
1709             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1710             jnrA       = (jnrA>=0) ? jnrA : 0;
1711             jnrB       = (jnrB>=0) ? jnrB : 0;
1712             jnrC       = (jnrC>=0) ? jnrC : 0;
1713             jnrD       = (jnrD>=0) ? jnrD : 0;
1714
1715             j_coord_offsetA  = DIM*jnrA;
1716             j_coord_offsetB  = DIM*jnrB;
1717             j_coord_offsetC  = DIM*jnrC;
1718             j_coord_offsetD  = DIM*jnrD;
1719
1720             /* load j atom coordinates */
1721             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1722                                               x+j_coord_offsetC,x+j_coord_offsetD,
1723                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1724
1725             /* Calculate displacement vector */
1726             dx00             = _mm_sub_ps(ix0,jx0);
1727             dy00             = _mm_sub_ps(iy0,jy0);
1728             dz00             = _mm_sub_ps(iz0,jz0);
1729             dx01             = _mm_sub_ps(ix0,jx1);
1730             dy01             = _mm_sub_ps(iy0,jy1);
1731             dz01             = _mm_sub_ps(iz0,jz1);
1732             dx02             = _mm_sub_ps(ix0,jx2);
1733             dy02             = _mm_sub_ps(iy0,jy2);
1734             dz02             = _mm_sub_ps(iz0,jz2);
1735             dx10             = _mm_sub_ps(ix1,jx0);
1736             dy10             = _mm_sub_ps(iy1,jy0);
1737             dz10             = _mm_sub_ps(iz1,jz0);
1738             dx11             = _mm_sub_ps(ix1,jx1);
1739             dy11             = _mm_sub_ps(iy1,jy1);
1740             dz11             = _mm_sub_ps(iz1,jz1);
1741             dx12             = _mm_sub_ps(ix1,jx2);
1742             dy12             = _mm_sub_ps(iy1,jy2);
1743             dz12             = _mm_sub_ps(iz1,jz2);
1744             dx20             = _mm_sub_ps(ix2,jx0);
1745             dy20             = _mm_sub_ps(iy2,jy0);
1746             dz20             = _mm_sub_ps(iz2,jz0);
1747             dx21             = _mm_sub_ps(ix2,jx1);
1748             dy21             = _mm_sub_ps(iy2,jy1);
1749             dz21             = _mm_sub_ps(iz2,jz1);
1750             dx22             = _mm_sub_ps(ix2,jx2);
1751             dy22             = _mm_sub_ps(iy2,jy2);
1752             dz22             = _mm_sub_ps(iz2,jz2);
1753
1754             /* Calculate squared distance and things based on it */
1755             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1756             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1757             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1758             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1759             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1760             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1761             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1762             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1763             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1764
1765             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1766             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1767             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1768             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1769             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1770             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1771             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1772             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1773             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1774
1775             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1776             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1777             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1778             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1779             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1780             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1781             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1782             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1783             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1784
1785             fjx0             = _mm_setzero_ps();
1786             fjy0             = _mm_setzero_ps();
1787             fjz0             = _mm_setzero_ps();
1788             fjx1             = _mm_setzero_ps();
1789             fjy1             = _mm_setzero_ps();
1790             fjz1             = _mm_setzero_ps();
1791             fjx2             = _mm_setzero_ps();
1792             fjy2             = _mm_setzero_ps();
1793             fjz2             = _mm_setzero_ps();
1794
1795             /**************************
1796              * CALCULATE INTERACTIONS *
1797              **************************/
1798
1799             if (gmx_mm_any_lt(rsq00,rcutoff2))
1800             {
1801
1802             /* REACTION-FIELD ELECTROSTATICS */
1803             felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
1804
1805             /* LENNARD-JONES DISPERSION/REPULSION */
1806
1807             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1808             fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1809
1810             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1811
1812             fscal            = _mm_add_ps(felec,fvdw);
1813
1814             fscal            = _mm_and_ps(fscal,cutoff_mask);
1815
1816             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1817
1818             /* Calculate temporary vectorial force */
1819             tx               = _mm_mul_ps(fscal,dx00);
1820             ty               = _mm_mul_ps(fscal,dy00);
1821             tz               = _mm_mul_ps(fscal,dz00);
1822
1823             /* Update vectorial force */
1824             fix0             = _mm_add_ps(fix0,tx);
1825             fiy0             = _mm_add_ps(fiy0,ty);
1826             fiz0             = _mm_add_ps(fiz0,tz);
1827
1828             fjx0             = _mm_add_ps(fjx0,tx);
1829             fjy0             = _mm_add_ps(fjy0,ty);
1830             fjz0             = _mm_add_ps(fjz0,tz);
1831
1832             }
1833
1834             /**************************
1835              * CALCULATE INTERACTIONS *
1836              **************************/
1837
1838             if (gmx_mm_any_lt(rsq01,rcutoff2))
1839             {
1840
1841             /* REACTION-FIELD ELECTROSTATICS */
1842             felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
1843
1844             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
1845
1846             fscal            = felec;
1847
1848             fscal            = _mm_and_ps(fscal,cutoff_mask);
1849
1850             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1851
1852             /* Calculate temporary vectorial force */
1853             tx               = _mm_mul_ps(fscal,dx01);
1854             ty               = _mm_mul_ps(fscal,dy01);
1855             tz               = _mm_mul_ps(fscal,dz01);
1856
1857             /* Update vectorial force */
1858             fix0             = _mm_add_ps(fix0,tx);
1859             fiy0             = _mm_add_ps(fiy0,ty);
1860             fiz0             = _mm_add_ps(fiz0,tz);
1861
1862             fjx1             = _mm_add_ps(fjx1,tx);
1863             fjy1             = _mm_add_ps(fjy1,ty);
1864             fjz1             = _mm_add_ps(fjz1,tz);
1865
1866             }
1867
1868             /**************************
1869              * CALCULATE INTERACTIONS *
1870              **************************/
1871
1872             if (gmx_mm_any_lt(rsq02,rcutoff2))
1873             {
1874
1875             /* REACTION-FIELD ELECTROSTATICS */
1876             felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
1877
1878             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
1879
1880             fscal            = felec;
1881
1882             fscal            = _mm_and_ps(fscal,cutoff_mask);
1883
1884             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1885
1886             /* Calculate temporary vectorial force */
1887             tx               = _mm_mul_ps(fscal,dx02);
1888             ty               = _mm_mul_ps(fscal,dy02);
1889             tz               = _mm_mul_ps(fscal,dz02);
1890
1891             /* Update vectorial force */
1892             fix0             = _mm_add_ps(fix0,tx);
1893             fiy0             = _mm_add_ps(fiy0,ty);
1894             fiz0             = _mm_add_ps(fiz0,tz);
1895
1896             fjx2             = _mm_add_ps(fjx2,tx);
1897             fjy2             = _mm_add_ps(fjy2,ty);
1898             fjz2             = _mm_add_ps(fjz2,tz);
1899
1900             }
1901
1902             /**************************
1903              * CALCULATE INTERACTIONS *
1904              **************************/
1905
1906             if (gmx_mm_any_lt(rsq10,rcutoff2))
1907             {
1908
1909             /* REACTION-FIELD ELECTROSTATICS */
1910             felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
1911
1912             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1913
1914             fscal            = felec;
1915
1916             fscal            = _mm_and_ps(fscal,cutoff_mask);
1917
1918             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1919
1920             /* Calculate temporary vectorial force */
1921             tx               = _mm_mul_ps(fscal,dx10);
1922             ty               = _mm_mul_ps(fscal,dy10);
1923             tz               = _mm_mul_ps(fscal,dz10);
1924
1925             /* Update vectorial force */
1926             fix1             = _mm_add_ps(fix1,tx);
1927             fiy1             = _mm_add_ps(fiy1,ty);
1928             fiz1             = _mm_add_ps(fiz1,tz);
1929
1930             fjx0             = _mm_add_ps(fjx0,tx);
1931             fjy0             = _mm_add_ps(fjy0,ty);
1932             fjz0             = _mm_add_ps(fjz0,tz);
1933
1934             }
1935
1936             /**************************
1937              * CALCULATE INTERACTIONS *
1938              **************************/
1939
1940             if (gmx_mm_any_lt(rsq11,rcutoff2))
1941             {
1942
1943             /* REACTION-FIELD ELECTROSTATICS */
1944             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1945
1946             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1947
1948             fscal            = felec;
1949
1950             fscal            = _mm_and_ps(fscal,cutoff_mask);
1951
1952             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1953
1954             /* Calculate temporary vectorial force */
1955             tx               = _mm_mul_ps(fscal,dx11);
1956             ty               = _mm_mul_ps(fscal,dy11);
1957             tz               = _mm_mul_ps(fscal,dz11);
1958
1959             /* Update vectorial force */
1960             fix1             = _mm_add_ps(fix1,tx);
1961             fiy1             = _mm_add_ps(fiy1,ty);
1962             fiz1             = _mm_add_ps(fiz1,tz);
1963
1964             fjx1             = _mm_add_ps(fjx1,tx);
1965             fjy1             = _mm_add_ps(fjy1,ty);
1966             fjz1             = _mm_add_ps(fjz1,tz);
1967
1968             }
1969
1970             /**************************
1971              * CALCULATE INTERACTIONS *
1972              **************************/
1973
1974             if (gmx_mm_any_lt(rsq12,rcutoff2))
1975             {
1976
1977             /* REACTION-FIELD ELECTROSTATICS */
1978             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1979
1980             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1981
1982             fscal            = felec;
1983
1984             fscal            = _mm_and_ps(fscal,cutoff_mask);
1985
1986             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1987
1988             /* Calculate temporary vectorial force */
1989             tx               = _mm_mul_ps(fscal,dx12);
1990             ty               = _mm_mul_ps(fscal,dy12);
1991             tz               = _mm_mul_ps(fscal,dz12);
1992
1993             /* Update vectorial force */
1994             fix1             = _mm_add_ps(fix1,tx);
1995             fiy1             = _mm_add_ps(fiy1,ty);
1996             fiz1             = _mm_add_ps(fiz1,tz);
1997
1998             fjx2             = _mm_add_ps(fjx2,tx);
1999             fjy2             = _mm_add_ps(fjy2,ty);
2000             fjz2             = _mm_add_ps(fjz2,tz);
2001
2002             }
2003
2004             /**************************
2005              * CALCULATE INTERACTIONS *
2006              **************************/
2007
2008             if (gmx_mm_any_lt(rsq20,rcutoff2))
2009             {
2010
2011             /* REACTION-FIELD ELECTROSTATICS */
2012             felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
2013
2014             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
2015
2016             fscal            = felec;
2017
2018             fscal            = _mm_and_ps(fscal,cutoff_mask);
2019
2020             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2021
2022             /* Calculate temporary vectorial force */
2023             tx               = _mm_mul_ps(fscal,dx20);
2024             ty               = _mm_mul_ps(fscal,dy20);
2025             tz               = _mm_mul_ps(fscal,dz20);
2026
2027             /* Update vectorial force */
2028             fix2             = _mm_add_ps(fix2,tx);
2029             fiy2             = _mm_add_ps(fiy2,ty);
2030             fiz2             = _mm_add_ps(fiz2,tz);
2031
2032             fjx0             = _mm_add_ps(fjx0,tx);
2033             fjy0             = _mm_add_ps(fjy0,ty);
2034             fjz0             = _mm_add_ps(fjz0,tz);
2035
2036             }
2037
2038             /**************************
2039              * CALCULATE INTERACTIONS *
2040              **************************/
2041
2042             if (gmx_mm_any_lt(rsq21,rcutoff2))
2043             {
2044
2045             /* REACTION-FIELD ELECTROSTATICS */
2046             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
2047
2048             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2049
2050             fscal            = felec;
2051
2052             fscal            = _mm_and_ps(fscal,cutoff_mask);
2053
2054             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2055
2056             /* Calculate temporary vectorial force */
2057             tx               = _mm_mul_ps(fscal,dx21);
2058             ty               = _mm_mul_ps(fscal,dy21);
2059             tz               = _mm_mul_ps(fscal,dz21);
2060
2061             /* Update vectorial force */
2062             fix2             = _mm_add_ps(fix2,tx);
2063             fiy2             = _mm_add_ps(fiy2,ty);
2064             fiz2             = _mm_add_ps(fiz2,tz);
2065
2066             fjx1             = _mm_add_ps(fjx1,tx);
2067             fjy1             = _mm_add_ps(fjy1,ty);
2068             fjz1             = _mm_add_ps(fjz1,tz);
2069
2070             }
2071
2072             /**************************
2073              * CALCULATE INTERACTIONS *
2074              **************************/
2075
2076             if (gmx_mm_any_lt(rsq22,rcutoff2))
2077             {
2078
2079             /* REACTION-FIELD ELECTROSTATICS */
2080             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
2081
2082             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2083
2084             fscal            = felec;
2085
2086             fscal            = _mm_and_ps(fscal,cutoff_mask);
2087
2088             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2089
2090             /* Calculate temporary vectorial force */
2091             tx               = _mm_mul_ps(fscal,dx22);
2092             ty               = _mm_mul_ps(fscal,dy22);
2093             tz               = _mm_mul_ps(fscal,dz22);
2094
2095             /* Update vectorial force */
2096             fix2             = _mm_add_ps(fix2,tx);
2097             fiy2             = _mm_add_ps(fiy2,ty);
2098             fiz2             = _mm_add_ps(fiz2,tz);
2099
2100             fjx2             = _mm_add_ps(fjx2,tx);
2101             fjy2             = _mm_add_ps(fjy2,ty);
2102             fjz2             = _mm_add_ps(fjz2,tz);
2103
2104             }
2105
2106             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
2107                                                    f+j_coord_offsetC,f+j_coord_offsetD,
2108                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2109
2110             /* Inner loop uses 277 flops */
2111         }
2112
2113         /* End of innermost loop */
2114
2115         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2116                                               f+i_coord_offset,fshift+i_shift_offset);
2117
2118         /* Increment number of inner iterations */
2119         inneriter                  += j_index_end - j_index_start;
2120
2121         /* Outer loop uses 27 flops */
2122     }
2123
2124     /* Increment number of outer iterations */
2125     outeriter        += nri;
2126
2127     /* Update outer/inner flops */
2128
2129     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*277);
2130 }