09571acb63f81ebbdd0026b96c5b88197620e17b
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_sse2_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse2_single kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
46
47 #include "gromacs/simd/math_x86_sse2_single.h"
48 #include "kernelutil_x86_sse2_single.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sse2_single
52  * Electrostatics interaction: ReactionField
53  * VdW interaction:            LennardJones
54  * Geometry:                   Water3-Water3
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sse2_single
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81     real             scratch[4*DIM];
82     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83     int              vdwioffset0;
84     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85     int              vdwioffset1;
86     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87     int              vdwioffset2;
88     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
90     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
92     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
94     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
97     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
98     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
99     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
102     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
105     real             *charge;
106     int              nvdwtype;
107     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
108     int              *vdwtype;
109     real             *vdwparam;
110     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
111     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
112     __m128           dummy_mask,cutoff_mask;
113     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
114     __m128           one     = _mm_set1_ps(1.0);
115     __m128           two     = _mm_set1_ps(2.0);
116     x                = xx[0];
117     f                = ff[0];
118
119     nri              = nlist->nri;
120     iinr             = nlist->iinr;
121     jindex           = nlist->jindex;
122     jjnr             = nlist->jjnr;
123     shiftidx         = nlist->shift;
124     gid              = nlist->gid;
125     shiftvec         = fr->shift_vec[0];
126     fshift           = fr->fshift[0];
127     facel            = _mm_set1_ps(fr->epsfac);
128     charge           = mdatoms->chargeA;
129     krf              = _mm_set1_ps(fr->ic->k_rf);
130     krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
131     crf              = _mm_set1_ps(fr->ic->c_rf);
132     nvdwtype         = fr->ntype;
133     vdwparam         = fr->nbfp;
134     vdwtype          = mdatoms->typeA;
135
136     /* Setup water-specific parameters */
137     inr              = nlist->iinr[0];
138     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
139     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
140     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
141     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
142
143     jq0              = _mm_set1_ps(charge[inr+0]);
144     jq1              = _mm_set1_ps(charge[inr+1]);
145     jq2              = _mm_set1_ps(charge[inr+2]);
146     vdwjidx0A        = 2*vdwtype[inr+0];
147     qq00             = _mm_mul_ps(iq0,jq0);
148     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
149     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
150     qq01             = _mm_mul_ps(iq0,jq1);
151     qq02             = _mm_mul_ps(iq0,jq2);
152     qq10             = _mm_mul_ps(iq1,jq0);
153     qq11             = _mm_mul_ps(iq1,jq1);
154     qq12             = _mm_mul_ps(iq1,jq2);
155     qq20             = _mm_mul_ps(iq2,jq0);
156     qq21             = _mm_mul_ps(iq2,jq1);
157     qq22             = _mm_mul_ps(iq2,jq2);
158
159     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
160     rcutoff_scalar   = fr->rcoulomb;
161     rcutoff          = _mm_set1_ps(rcutoff_scalar);
162     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
163
164     sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
165     rvdw             = _mm_set1_ps(fr->rvdw);
166
167     /* Avoid stupid compiler warnings */
168     jnrA = jnrB = jnrC = jnrD = 0;
169     j_coord_offsetA = 0;
170     j_coord_offsetB = 0;
171     j_coord_offsetC = 0;
172     j_coord_offsetD = 0;
173
174     outeriter        = 0;
175     inneriter        = 0;
176
177     for(iidx=0;iidx<4*DIM;iidx++)
178     {
179         scratch[iidx] = 0.0;
180     }  
181
182     /* Start outer loop over neighborlists */
183     for(iidx=0; iidx<nri; iidx++)
184     {
185         /* Load shift vector for this list */
186         i_shift_offset   = DIM*shiftidx[iidx];
187
188         /* Load limits for loop over neighbors */
189         j_index_start    = jindex[iidx];
190         j_index_end      = jindex[iidx+1];
191
192         /* Get outer coordinate index */
193         inr              = iinr[iidx];
194         i_coord_offset   = DIM*inr;
195
196         /* Load i particle coords and add shift vector */
197         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
198                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
199         
200         fix0             = _mm_setzero_ps();
201         fiy0             = _mm_setzero_ps();
202         fiz0             = _mm_setzero_ps();
203         fix1             = _mm_setzero_ps();
204         fiy1             = _mm_setzero_ps();
205         fiz1             = _mm_setzero_ps();
206         fix2             = _mm_setzero_ps();
207         fiy2             = _mm_setzero_ps();
208         fiz2             = _mm_setzero_ps();
209
210         /* Reset potential sums */
211         velecsum         = _mm_setzero_ps();
212         vvdwsum          = _mm_setzero_ps();
213
214         /* Start inner kernel loop */
215         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
216         {
217
218             /* Get j neighbor index, and coordinate index */
219             jnrA             = jjnr[jidx];
220             jnrB             = jjnr[jidx+1];
221             jnrC             = jjnr[jidx+2];
222             jnrD             = jjnr[jidx+3];
223             j_coord_offsetA  = DIM*jnrA;
224             j_coord_offsetB  = DIM*jnrB;
225             j_coord_offsetC  = DIM*jnrC;
226             j_coord_offsetD  = DIM*jnrD;
227
228             /* load j atom coordinates */
229             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
230                                               x+j_coord_offsetC,x+j_coord_offsetD,
231                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
232
233             /* Calculate displacement vector */
234             dx00             = _mm_sub_ps(ix0,jx0);
235             dy00             = _mm_sub_ps(iy0,jy0);
236             dz00             = _mm_sub_ps(iz0,jz0);
237             dx01             = _mm_sub_ps(ix0,jx1);
238             dy01             = _mm_sub_ps(iy0,jy1);
239             dz01             = _mm_sub_ps(iz0,jz1);
240             dx02             = _mm_sub_ps(ix0,jx2);
241             dy02             = _mm_sub_ps(iy0,jy2);
242             dz02             = _mm_sub_ps(iz0,jz2);
243             dx10             = _mm_sub_ps(ix1,jx0);
244             dy10             = _mm_sub_ps(iy1,jy0);
245             dz10             = _mm_sub_ps(iz1,jz0);
246             dx11             = _mm_sub_ps(ix1,jx1);
247             dy11             = _mm_sub_ps(iy1,jy1);
248             dz11             = _mm_sub_ps(iz1,jz1);
249             dx12             = _mm_sub_ps(ix1,jx2);
250             dy12             = _mm_sub_ps(iy1,jy2);
251             dz12             = _mm_sub_ps(iz1,jz2);
252             dx20             = _mm_sub_ps(ix2,jx0);
253             dy20             = _mm_sub_ps(iy2,jy0);
254             dz20             = _mm_sub_ps(iz2,jz0);
255             dx21             = _mm_sub_ps(ix2,jx1);
256             dy21             = _mm_sub_ps(iy2,jy1);
257             dz21             = _mm_sub_ps(iz2,jz1);
258             dx22             = _mm_sub_ps(ix2,jx2);
259             dy22             = _mm_sub_ps(iy2,jy2);
260             dz22             = _mm_sub_ps(iz2,jz2);
261
262             /* Calculate squared distance and things based on it */
263             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
264             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
265             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
266             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
267             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
268             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
269             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
270             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
271             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
272
273             rinv00           = gmx_mm_invsqrt_ps(rsq00);
274             rinv01           = gmx_mm_invsqrt_ps(rsq01);
275             rinv02           = gmx_mm_invsqrt_ps(rsq02);
276             rinv10           = gmx_mm_invsqrt_ps(rsq10);
277             rinv11           = gmx_mm_invsqrt_ps(rsq11);
278             rinv12           = gmx_mm_invsqrt_ps(rsq12);
279             rinv20           = gmx_mm_invsqrt_ps(rsq20);
280             rinv21           = gmx_mm_invsqrt_ps(rsq21);
281             rinv22           = gmx_mm_invsqrt_ps(rsq22);
282
283             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
284             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
285             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
286             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
287             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
288             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
289             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
290             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
291             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
292
293             fjx0             = _mm_setzero_ps();
294             fjy0             = _mm_setzero_ps();
295             fjz0             = _mm_setzero_ps();
296             fjx1             = _mm_setzero_ps();
297             fjy1             = _mm_setzero_ps();
298             fjz1             = _mm_setzero_ps();
299             fjx2             = _mm_setzero_ps();
300             fjy2             = _mm_setzero_ps();
301             fjz2             = _mm_setzero_ps();
302
303             /**************************
304              * CALCULATE INTERACTIONS *
305              **************************/
306
307             if (gmx_mm_any_lt(rsq00,rcutoff2))
308             {
309
310             /* REACTION-FIELD ELECTROSTATICS */
311             velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
312             felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
313
314             /* LENNARD-JONES DISPERSION/REPULSION */
315
316             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
317             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
318             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
319             vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
320                                           _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
321             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
322
323             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
324
325             /* Update potential sum for this i atom from the interaction with this j atom. */
326             velec            = _mm_and_ps(velec,cutoff_mask);
327             velecsum         = _mm_add_ps(velecsum,velec);
328             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
329             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
330
331             fscal            = _mm_add_ps(felec,fvdw);
332
333             fscal            = _mm_and_ps(fscal,cutoff_mask);
334
335             /* Calculate temporary vectorial force */
336             tx               = _mm_mul_ps(fscal,dx00);
337             ty               = _mm_mul_ps(fscal,dy00);
338             tz               = _mm_mul_ps(fscal,dz00);
339
340             /* Update vectorial force */
341             fix0             = _mm_add_ps(fix0,tx);
342             fiy0             = _mm_add_ps(fiy0,ty);
343             fiz0             = _mm_add_ps(fiz0,tz);
344
345             fjx0             = _mm_add_ps(fjx0,tx);
346             fjy0             = _mm_add_ps(fjy0,ty);
347             fjz0             = _mm_add_ps(fjz0,tz);
348             
349             }
350
351             /**************************
352              * CALCULATE INTERACTIONS *
353              **************************/
354
355             if (gmx_mm_any_lt(rsq01,rcutoff2))
356             {
357
358             /* REACTION-FIELD ELECTROSTATICS */
359             velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
360             felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
361
362             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
363
364             /* Update potential sum for this i atom from the interaction with this j atom. */
365             velec            = _mm_and_ps(velec,cutoff_mask);
366             velecsum         = _mm_add_ps(velecsum,velec);
367
368             fscal            = felec;
369
370             fscal            = _mm_and_ps(fscal,cutoff_mask);
371
372             /* Calculate temporary vectorial force */
373             tx               = _mm_mul_ps(fscal,dx01);
374             ty               = _mm_mul_ps(fscal,dy01);
375             tz               = _mm_mul_ps(fscal,dz01);
376
377             /* Update vectorial force */
378             fix0             = _mm_add_ps(fix0,tx);
379             fiy0             = _mm_add_ps(fiy0,ty);
380             fiz0             = _mm_add_ps(fiz0,tz);
381
382             fjx1             = _mm_add_ps(fjx1,tx);
383             fjy1             = _mm_add_ps(fjy1,ty);
384             fjz1             = _mm_add_ps(fjz1,tz);
385             
386             }
387
388             /**************************
389              * CALCULATE INTERACTIONS *
390              **************************/
391
392             if (gmx_mm_any_lt(rsq02,rcutoff2))
393             {
394
395             /* REACTION-FIELD ELECTROSTATICS */
396             velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
397             felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
398
399             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
400
401             /* Update potential sum for this i atom from the interaction with this j atom. */
402             velec            = _mm_and_ps(velec,cutoff_mask);
403             velecsum         = _mm_add_ps(velecsum,velec);
404
405             fscal            = felec;
406
407             fscal            = _mm_and_ps(fscal,cutoff_mask);
408
409             /* Calculate temporary vectorial force */
410             tx               = _mm_mul_ps(fscal,dx02);
411             ty               = _mm_mul_ps(fscal,dy02);
412             tz               = _mm_mul_ps(fscal,dz02);
413
414             /* Update vectorial force */
415             fix0             = _mm_add_ps(fix0,tx);
416             fiy0             = _mm_add_ps(fiy0,ty);
417             fiz0             = _mm_add_ps(fiz0,tz);
418
419             fjx2             = _mm_add_ps(fjx2,tx);
420             fjy2             = _mm_add_ps(fjy2,ty);
421             fjz2             = _mm_add_ps(fjz2,tz);
422             
423             }
424
425             /**************************
426              * CALCULATE INTERACTIONS *
427              **************************/
428
429             if (gmx_mm_any_lt(rsq10,rcutoff2))
430             {
431
432             /* REACTION-FIELD ELECTROSTATICS */
433             velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
434             felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
435
436             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
437
438             /* Update potential sum for this i atom from the interaction with this j atom. */
439             velec            = _mm_and_ps(velec,cutoff_mask);
440             velecsum         = _mm_add_ps(velecsum,velec);
441
442             fscal            = felec;
443
444             fscal            = _mm_and_ps(fscal,cutoff_mask);
445
446             /* Calculate temporary vectorial force */
447             tx               = _mm_mul_ps(fscal,dx10);
448             ty               = _mm_mul_ps(fscal,dy10);
449             tz               = _mm_mul_ps(fscal,dz10);
450
451             /* Update vectorial force */
452             fix1             = _mm_add_ps(fix1,tx);
453             fiy1             = _mm_add_ps(fiy1,ty);
454             fiz1             = _mm_add_ps(fiz1,tz);
455
456             fjx0             = _mm_add_ps(fjx0,tx);
457             fjy0             = _mm_add_ps(fjy0,ty);
458             fjz0             = _mm_add_ps(fjz0,tz);
459             
460             }
461
462             /**************************
463              * CALCULATE INTERACTIONS *
464              **************************/
465
466             if (gmx_mm_any_lt(rsq11,rcutoff2))
467             {
468
469             /* REACTION-FIELD ELECTROSTATICS */
470             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
471             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
472
473             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
474
475             /* Update potential sum for this i atom from the interaction with this j atom. */
476             velec            = _mm_and_ps(velec,cutoff_mask);
477             velecsum         = _mm_add_ps(velecsum,velec);
478
479             fscal            = felec;
480
481             fscal            = _mm_and_ps(fscal,cutoff_mask);
482
483             /* Calculate temporary vectorial force */
484             tx               = _mm_mul_ps(fscal,dx11);
485             ty               = _mm_mul_ps(fscal,dy11);
486             tz               = _mm_mul_ps(fscal,dz11);
487
488             /* Update vectorial force */
489             fix1             = _mm_add_ps(fix1,tx);
490             fiy1             = _mm_add_ps(fiy1,ty);
491             fiz1             = _mm_add_ps(fiz1,tz);
492
493             fjx1             = _mm_add_ps(fjx1,tx);
494             fjy1             = _mm_add_ps(fjy1,ty);
495             fjz1             = _mm_add_ps(fjz1,tz);
496             
497             }
498
499             /**************************
500              * CALCULATE INTERACTIONS *
501              **************************/
502
503             if (gmx_mm_any_lt(rsq12,rcutoff2))
504             {
505
506             /* REACTION-FIELD ELECTROSTATICS */
507             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
508             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
509
510             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
511
512             /* Update potential sum for this i atom from the interaction with this j atom. */
513             velec            = _mm_and_ps(velec,cutoff_mask);
514             velecsum         = _mm_add_ps(velecsum,velec);
515
516             fscal            = felec;
517
518             fscal            = _mm_and_ps(fscal,cutoff_mask);
519
520             /* Calculate temporary vectorial force */
521             tx               = _mm_mul_ps(fscal,dx12);
522             ty               = _mm_mul_ps(fscal,dy12);
523             tz               = _mm_mul_ps(fscal,dz12);
524
525             /* Update vectorial force */
526             fix1             = _mm_add_ps(fix1,tx);
527             fiy1             = _mm_add_ps(fiy1,ty);
528             fiz1             = _mm_add_ps(fiz1,tz);
529
530             fjx2             = _mm_add_ps(fjx2,tx);
531             fjy2             = _mm_add_ps(fjy2,ty);
532             fjz2             = _mm_add_ps(fjz2,tz);
533             
534             }
535
536             /**************************
537              * CALCULATE INTERACTIONS *
538              **************************/
539
540             if (gmx_mm_any_lt(rsq20,rcutoff2))
541             {
542
543             /* REACTION-FIELD ELECTROSTATICS */
544             velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
545             felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
546
547             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
548
549             /* Update potential sum for this i atom from the interaction with this j atom. */
550             velec            = _mm_and_ps(velec,cutoff_mask);
551             velecsum         = _mm_add_ps(velecsum,velec);
552
553             fscal            = felec;
554
555             fscal            = _mm_and_ps(fscal,cutoff_mask);
556
557             /* Calculate temporary vectorial force */
558             tx               = _mm_mul_ps(fscal,dx20);
559             ty               = _mm_mul_ps(fscal,dy20);
560             tz               = _mm_mul_ps(fscal,dz20);
561
562             /* Update vectorial force */
563             fix2             = _mm_add_ps(fix2,tx);
564             fiy2             = _mm_add_ps(fiy2,ty);
565             fiz2             = _mm_add_ps(fiz2,tz);
566
567             fjx0             = _mm_add_ps(fjx0,tx);
568             fjy0             = _mm_add_ps(fjy0,ty);
569             fjz0             = _mm_add_ps(fjz0,tz);
570             
571             }
572
573             /**************************
574              * CALCULATE INTERACTIONS *
575              **************************/
576
577             if (gmx_mm_any_lt(rsq21,rcutoff2))
578             {
579
580             /* REACTION-FIELD ELECTROSTATICS */
581             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
582             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
583
584             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
585
586             /* Update potential sum for this i atom from the interaction with this j atom. */
587             velec            = _mm_and_ps(velec,cutoff_mask);
588             velecsum         = _mm_add_ps(velecsum,velec);
589
590             fscal            = felec;
591
592             fscal            = _mm_and_ps(fscal,cutoff_mask);
593
594             /* Calculate temporary vectorial force */
595             tx               = _mm_mul_ps(fscal,dx21);
596             ty               = _mm_mul_ps(fscal,dy21);
597             tz               = _mm_mul_ps(fscal,dz21);
598
599             /* Update vectorial force */
600             fix2             = _mm_add_ps(fix2,tx);
601             fiy2             = _mm_add_ps(fiy2,ty);
602             fiz2             = _mm_add_ps(fiz2,tz);
603
604             fjx1             = _mm_add_ps(fjx1,tx);
605             fjy1             = _mm_add_ps(fjy1,ty);
606             fjz1             = _mm_add_ps(fjz1,tz);
607             
608             }
609
610             /**************************
611              * CALCULATE INTERACTIONS *
612              **************************/
613
614             if (gmx_mm_any_lt(rsq22,rcutoff2))
615             {
616
617             /* REACTION-FIELD ELECTROSTATICS */
618             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
619             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
620
621             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
622
623             /* Update potential sum for this i atom from the interaction with this j atom. */
624             velec            = _mm_and_ps(velec,cutoff_mask);
625             velecsum         = _mm_add_ps(velecsum,velec);
626
627             fscal            = felec;
628
629             fscal            = _mm_and_ps(fscal,cutoff_mask);
630
631             /* Calculate temporary vectorial force */
632             tx               = _mm_mul_ps(fscal,dx22);
633             ty               = _mm_mul_ps(fscal,dy22);
634             tz               = _mm_mul_ps(fscal,dz22);
635
636             /* Update vectorial force */
637             fix2             = _mm_add_ps(fix2,tx);
638             fiy2             = _mm_add_ps(fiy2,ty);
639             fiz2             = _mm_add_ps(fiz2,tz);
640
641             fjx2             = _mm_add_ps(fjx2,tx);
642             fjy2             = _mm_add_ps(fjy2,ty);
643             fjz2             = _mm_add_ps(fjz2,tz);
644             
645             }
646
647             fjptrA             = f+j_coord_offsetA;
648             fjptrB             = f+j_coord_offsetB;
649             fjptrC             = f+j_coord_offsetC;
650             fjptrD             = f+j_coord_offsetD;
651
652             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
653                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
654
655             /* Inner loop uses 342 flops */
656         }
657
658         if(jidx<j_index_end)
659         {
660
661             /* Get j neighbor index, and coordinate index */
662             jnrlistA         = jjnr[jidx];
663             jnrlistB         = jjnr[jidx+1];
664             jnrlistC         = jjnr[jidx+2];
665             jnrlistD         = jjnr[jidx+3];
666             /* Sign of each element will be negative for non-real atoms.
667              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
668              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
669              */
670             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
671             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
672             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
673             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
674             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
675             j_coord_offsetA  = DIM*jnrA;
676             j_coord_offsetB  = DIM*jnrB;
677             j_coord_offsetC  = DIM*jnrC;
678             j_coord_offsetD  = DIM*jnrD;
679
680             /* load j atom coordinates */
681             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
682                                               x+j_coord_offsetC,x+j_coord_offsetD,
683                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
684
685             /* Calculate displacement vector */
686             dx00             = _mm_sub_ps(ix0,jx0);
687             dy00             = _mm_sub_ps(iy0,jy0);
688             dz00             = _mm_sub_ps(iz0,jz0);
689             dx01             = _mm_sub_ps(ix0,jx1);
690             dy01             = _mm_sub_ps(iy0,jy1);
691             dz01             = _mm_sub_ps(iz0,jz1);
692             dx02             = _mm_sub_ps(ix0,jx2);
693             dy02             = _mm_sub_ps(iy0,jy2);
694             dz02             = _mm_sub_ps(iz0,jz2);
695             dx10             = _mm_sub_ps(ix1,jx0);
696             dy10             = _mm_sub_ps(iy1,jy0);
697             dz10             = _mm_sub_ps(iz1,jz0);
698             dx11             = _mm_sub_ps(ix1,jx1);
699             dy11             = _mm_sub_ps(iy1,jy1);
700             dz11             = _mm_sub_ps(iz1,jz1);
701             dx12             = _mm_sub_ps(ix1,jx2);
702             dy12             = _mm_sub_ps(iy1,jy2);
703             dz12             = _mm_sub_ps(iz1,jz2);
704             dx20             = _mm_sub_ps(ix2,jx0);
705             dy20             = _mm_sub_ps(iy2,jy0);
706             dz20             = _mm_sub_ps(iz2,jz0);
707             dx21             = _mm_sub_ps(ix2,jx1);
708             dy21             = _mm_sub_ps(iy2,jy1);
709             dz21             = _mm_sub_ps(iz2,jz1);
710             dx22             = _mm_sub_ps(ix2,jx2);
711             dy22             = _mm_sub_ps(iy2,jy2);
712             dz22             = _mm_sub_ps(iz2,jz2);
713
714             /* Calculate squared distance and things based on it */
715             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
716             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
717             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
718             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
719             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
720             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
721             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
722             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
723             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
724
725             rinv00           = gmx_mm_invsqrt_ps(rsq00);
726             rinv01           = gmx_mm_invsqrt_ps(rsq01);
727             rinv02           = gmx_mm_invsqrt_ps(rsq02);
728             rinv10           = gmx_mm_invsqrt_ps(rsq10);
729             rinv11           = gmx_mm_invsqrt_ps(rsq11);
730             rinv12           = gmx_mm_invsqrt_ps(rsq12);
731             rinv20           = gmx_mm_invsqrt_ps(rsq20);
732             rinv21           = gmx_mm_invsqrt_ps(rsq21);
733             rinv22           = gmx_mm_invsqrt_ps(rsq22);
734
735             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
736             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
737             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
738             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
739             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
740             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
741             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
742             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
743             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
744
745             fjx0             = _mm_setzero_ps();
746             fjy0             = _mm_setzero_ps();
747             fjz0             = _mm_setzero_ps();
748             fjx1             = _mm_setzero_ps();
749             fjy1             = _mm_setzero_ps();
750             fjz1             = _mm_setzero_ps();
751             fjx2             = _mm_setzero_ps();
752             fjy2             = _mm_setzero_ps();
753             fjz2             = _mm_setzero_ps();
754
755             /**************************
756              * CALCULATE INTERACTIONS *
757              **************************/
758
759             if (gmx_mm_any_lt(rsq00,rcutoff2))
760             {
761
762             /* REACTION-FIELD ELECTROSTATICS */
763             velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
764             felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
765
766             /* LENNARD-JONES DISPERSION/REPULSION */
767
768             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
769             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
770             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
771             vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
772                                           _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
773             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
774
775             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
776
777             /* Update potential sum for this i atom from the interaction with this j atom. */
778             velec            = _mm_and_ps(velec,cutoff_mask);
779             velec            = _mm_andnot_ps(dummy_mask,velec);
780             velecsum         = _mm_add_ps(velecsum,velec);
781             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
782             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
783             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
784
785             fscal            = _mm_add_ps(felec,fvdw);
786
787             fscal            = _mm_and_ps(fscal,cutoff_mask);
788
789             fscal            = _mm_andnot_ps(dummy_mask,fscal);
790
791             /* Calculate temporary vectorial force */
792             tx               = _mm_mul_ps(fscal,dx00);
793             ty               = _mm_mul_ps(fscal,dy00);
794             tz               = _mm_mul_ps(fscal,dz00);
795
796             /* Update vectorial force */
797             fix0             = _mm_add_ps(fix0,tx);
798             fiy0             = _mm_add_ps(fiy0,ty);
799             fiz0             = _mm_add_ps(fiz0,tz);
800
801             fjx0             = _mm_add_ps(fjx0,tx);
802             fjy0             = _mm_add_ps(fjy0,ty);
803             fjz0             = _mm_add_ps(fjz0,tz);
804             
805             }
806
807             /**************************
808              * CALCULATE INTERACTIONS *
809              **************************/
810
811             if (gmx_mm_any_lt(rsq01,rcutoff2))
812             {
813
814             /* REACTION-FIELD ELECTROSTATICS */
815             velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
816             felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
817
818             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
819
820             /* Update potential sum for this i atom from the interaction with this j atom. */
821             velec            = _mm_and_ps(velec,cutoff_mask);
822             velec            = _mm_andnot_ps(dummy_mask,velec);
823             velecsum         = _mm_add_ps(velecsum,velec);
824
825             fscal            = felec;
826
827             fscal            = _mm_and_ps(fscal,cutoff_mask);
828
829             fscal            = _mm_andnot_ps(dummy_mask,fscal);
830
831             /* Calculate temporary vectorial force */
832             tx               = _mm_mul_ps(fscal,dx01);
833             ty               = _mm_mul_ps(fscal,dy01);
834             tz               = _mm_mul_ps(fscal,dz01);
835
836             /* Update vectorial force */
837             fix0             = _mm_add_ps(fix0,tx);
838             fiy0             = _mm_add_ps(fiy0,ty);
839             fiz0             = _mm_add_ps(fiz0,tz);
840
841             fjx1             = _mm_add_ps(fjx1,tx);
842             fjy1             = _mm_add_ps(fjy1,ty);
843             fjz1             = _mm_add_ps(fjz1,tz);
844             
845             }
846
847             /**************************
848              * CALCULATE INTERACTIONS *
849              **************************/
850
851             if (gmx_mm_any_lt(rsq02,rcutoff2))
852             {
853
854             /* REACTION-FIELD ELECTROSTATICS */
855             velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
856             felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
857
858             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
859
860             /* Update potential sum for this i atom from the interaction with this j atom. */
861             velec            = _mm_and_ps(velec,cutoff_mask);
862             velec            = _mm_andnot_ps(dummy_mask,velec);
863             velecsum         = _mm_add_ps(velecsum,velec);
864
865             fscal            = felec;
866
867             fscal            = _mm_and_ps(fscal,cutoff_mask);
868
869             fscal            = _mm_andnot_ps(dummy_mask,fscal);
870
871             /* Calculate temporary vectorial force */
872             tx               = _mm_mul_ps(fscal,dx02);
873             ty               = _mm_mul_ps(fscal,dy02);
874             tz               = _mm_mul_ps(fscal,dz02);
875
876             /* Update vectorial force */
877             fix0             = _mm_add_ps(fix0,tx);
878             fiy0             = _mm_add_ps(fiy0,ty);
879             fiz0             = _mm_add_ps(fiz0,tz);
880
881             fjx2             = _mm_add_ps(fjx2,tx);
882             fjy2             = _mm_add_ps(fjy2,ty);
883             fjz2             = _mm_add_ps(fjz2,tz);
884             
885             }
886
887             /**************************
888              * CALCULATE INTERACTIONS *
889              **************************/
890
891             if (gmx_mm_any_lt(rsq10,rcutoff2))
892             {
893
894             /* REACTION-FIELD ELECTROSTATICS */
895             velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
896             felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
897
898             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
899
900             /* Update potential sum for this i atom from the interaction with this j atom. */
901             velec            = _mm_and_ps(velec,cutoff_mask);
902             velec            = _mm_andnot_ps(dummy_mask,velec);
903             velecsum         = _mm_add_ps(velecsum,velec);
904
905             fscal            = felec;
906
907             fscal            = _mm_and_ps(fscal,cutoff_mask);
908
909             fscal            = _mm_andnot_ps(dummy_mask,fscal);
910
911             /* Calculate temporary vectorial force */
912             tx               = _mm_mul_ps(fscal,dx10);
913             ty               = _mm_mul_ps(fscal,dy10);
914             tz               = _mm_mul_ps(fscal,dz10);
915
916             /* Update vectorial force */
917             fix1             = _mm_add_ps(fix1,tx);
918             fiy1             = _mm_add_ps(fiy1,ty);
919             fiz1             = _mm_add_ps(fiz1,tz);
920
921             fjx0             = _mm_add_ps(fjx0,tx);
922             fjy0             = _mm_add_ps(fjy0,ty);
923             fjz0             = _mm_add_ps(fjz0,tz);
924             
925             }
926
927             /**************************
928              * CALCULATE INTERACTIONS *
929              **************************/
930
931             if (gmx_mm_any_lt(rsq11,rcutoff2))
932             {
933
934             /* REACTION-FIELD ELECTROSTATICS */
935             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
936             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
937
938             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
939
940             /* Update potential sum for this i atom from the interaction with this j atom. */
941             velec            = _mm_and_ps(velec,cutoff_mask);
942             velec            = _mm_andnot_ps(dummy_mask,velec);
943             velecsum         = _mm_add_ps(velecsum,velec);
944
945             fscal            = felec;
946
947             fscal            = _mm_and_ps(fscal,cutoff_mask);
948
949             fscal            = _mm_andnot_ps(dummy_mask,fscal);
950
951             /* Calculate temporary vectorial force */
952             tx               = _mm_mul_ps(fscal,dx11);
953             ty               = _mm_mul_ps(fscal,dy11);
954             tz               = _mm_mul_ps(fscal,dz11);
955
956             /* Update vectorial force */
957             fix1             = _mm_add_ps(fix1,tx);
958             fiy1             = _mm_add_ps(fiy1,ty);
959             fiz1             = _mm_add_ps(fiz1,tz);
960
961             fjx1             = _mm_add_ps(fjx1,tx);
962             fjy1             = _mm_add_ps(fjy1,ty);
963             fjz1             = _mm_add_ps(fjz1,tz);
964             
965             }
966
967             /**************************
968              * CALCULATE INTERACTIONS *
969              **************************/
970
971             if (gmx_mm_any_lt(rsq12,rcutoff2))
972             {
973
974             /* REACTION-FIELD ELECTROSTATICS */
975             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
976             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
977
978             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
979
980             /* Update potential sum for this i atom from the interaction with this j atom. */
981             velec            = _mm_and_ps(velec,cutoff_mask);
982             velec            = _mm_andnot_ps(dummy_mask,velec);
983             velecsum         = _mm_add_ps(velecsum,velec);
984
985             fscal            = felec;
986
987             fscal            = _mm_and_ps(fscal,cutoff_mask);
988
989             fscal            = _mm_andnot_ps(dummy_mask,fscal);
990
991             /* Calculate temporary vectorial force */
992             tx               = _mm_mul_ps(fscal,dx12);
993             ty               = _mm_mul_ps(fscal,dy12);
994             tz               = _mm_mul_ps(fscal,dz12);
995
996             /* Update vectorial force */
997             fix1             = _mm_add_ps(fix1,tx);
998             fiy1             = _mm_add_ps(fiy1,ty);
999             fiz1             = _mm_add_ps(fiz1,tz);
1000
1001             fjx2             = _mm_add_ps(fjx2,tx);
1002             fjy2             = _mm_add_ps(fjy2,ty);
1003             fjz2             = _mm_add_ps(fjz2,tz);
1004             
1005             }
1006
1007             /**************************
1008              * CALCULATE INTERACTIONS *
1009              **************************/
1010
1011             if (gmx_mm_any_lt(rsq20,rcutoff2))
1012             {
1013
1014             /* REACTION-FIELD ELECTROSTATICS */
1015             velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
1016             felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1017
1018             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1019
1020             /* Update potential sum for this i atom from the interaction with this j atom. */
1021             velec            = _mm_and_ps(velec,cutoff_mask);
1022             velec            = _mm_andnot_ps(dummy_mask,velec);
1023             velecsum         = _mm_add_ps(velecsum,velec);
1024
1025             fscal            = felec;
1026
1027             fscal            = _mm_and_ps(fscal,cutoff_mask);
1028
1029             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1030
1031             /* Calculate temporary vectorial force */
1032             tx               = _mm_mul_ps(fscal,dx20);
1033             ty               = _mm_mul_ps(fscal,dy20);
1034             tz               = _mm_mul_ps(fscal,dz20);
1035
1036             /* Update vectorial force */
1037             fix2             = _mm_add_ps(fix2,tx);
1038             fiy2             = _mm_add_ps(fiy2,ty);
1039             fiz2             = _mm_add_ps(fiz2,tz);
1040
1041             fjx0             = _mm_add_ps(fjx0,tx);
1042             fjy0             = _mm_add_ps(fjy0,ty);
1043             fjz0             = _mm_add_ps(fjz0,tz);
1044             
1045             }
1046
1047             /**************************
1048              * CALCULATE INTERACTIONS *
1049              **************************/
1050
1051             if (gmx_mm_any_lt(rsq21,rcutoff2))
1052             {
1053
1054             /* REACTION-FIELD ELECTROSTATICS */
1055             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
1056             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1057
1058             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1059
1060             /* Update potential sum for this i atom from the interaction with this j atom. */
1061             velec            = _mm_and_ps(velec,cutoff_mask);
1062             velec            = _mm_andnot_ps(dummy_mask,velec);
1063             velecsum         = _mm_add_ps(velecsum,velec);
1064
1065             fscal            = felec;
1066
1067             fscal            = _mm_and_ps(fscal,cutoff_mask);
1068
1069             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1070
1071             /* Calculate temporary vectorial force */
1072             tx               = _mm_mul_ps(fscal,dx21);
1073             ty               = _mm_mul_ps(fscal,dy21);
1074             tz               = _mm_mul_ps(fscal,dz21);
1075
1076             /* Update vectorial force */
1077             fix2             = _mm_add_ps(fix2,tx);
1078             fiy2             = _mm_add_ps(fiy2,ty);
1079             fiz2             = _mm_add_ps(fiz2,tz);
1080
1081             fjx1             = _mm_add_ps(fjx1,tx);
1082             fjy1             = _mm_add_ps(fjy1,ty);
1083             fjz1             = _mm_add_ps(fjz1,tz);
1084             
1085             }
1086
1087             /**************************
1088              * CALCULATE INTERACTIONS *
1089              **************************/
1090
1091             if (gmx_mm_any_lt(rsq22,rcutoff2))
1092             {
1093
1094             /* REACTION-FIELD ELECTROSTATICS */
1095             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
1096             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1097
1098             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1099
1100             /* Update potential sum for this i atom from the interaction with this j atom. */
1101             velec            = _mm_and_ps(velec,cutoff_mask);
1102             velec            = _mm_andnot_ps(dummy_mask,velec);
1103             velecsum         = _mm_add_ps(velecsum,velec);
1104
1105             fscal            = felec;
1106
1107             fscal            = _mm_and_ps(fscal,cutoff_mask);
1108
1109             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1110
1111             /* Calculate temporary vectorial force */
1112             tx               = _mm_mul_ps(fscal,dx22);
1113             ty               = _mm_mul_ps(fscal,dy22);
1114             tz               = _mm_mul_ps(fscal,dz22);
1115
1116             /* Update vectorial force */
1117             fix2             = _mm_add_ps(fix2,tx);
1118             fiy2             = _mm_add_ps(fiy2,ty);
1119             fiz2             = _mm_add_ps(fiz2,tz);
1120
1121             fjx2             = _mm_add_ps(fjx2,tx);
1122             fjy2             = _mm_add_ps(fjy2,ty);
1123             fjz2             = _mm_add_ps(fjz2,tz);
1124             
1125             }
1126
1127             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1128             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1129             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1130             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1131
1132             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1133                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1134
1135             /* Inner loop uses 342 flops */
1136         }
1137
1138         /* End of innermost loop */
1139
1140         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1141                                               f+i_coord_offset,fshift+i_shift_offset);
1142
1143         ggid                        = gid[iidx];
1144         /* Update potential energies */
1145         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1146         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1147
1148         /* Increment number of inner iterations */
1149         inneriter                  += j_index_end - j_index_start;
1150
1151         /* Outer loop uses 20 flops */
1152     }
1153
1154     /* Increment number of outer iterations */
1155     outeriter        += nri;
1156
1157     /* Update outer/inner flops */
1158
1159     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*342);
1160 }
1161 /*
1162  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sse2_single
1163  * Electrostatics interaction: ReactionField
1164  * VdW interaction:            LennardJones
1165  * Geometry:                   Water3-Water3
1166  * Calculate force/pot:        Force
1167  */
1168 void
1169 nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sse2_single
1170                     (t_nblist                    * gmx_restrict       nlist,
1171                      rvec                        * gmx_restrict          xx,
1172                      rvec                        * gmx_restrict          ff,
1173                      t_forcerec                  * gmx_restrict          fr,
1174                      t_mdatoms                   * gmx_restrict     mdatoms,
1175                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1176                      t_nrnb                      * gmx_restrict        nrnb)
1177 {
1178     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1179      * just 0 for non-waters.
1180      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1181      * jnr indices corresponding to data put in the four positions in the SIMD register.
1182      */
1183     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1184     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1185     int              jnrA,jnrB,jnrC,jnrD;
1186     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1187     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1188     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1189     real             rcutoff_scalar;
1190     real             *shiftvec,*fshift,*x,*f;
1191     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1192     real             scratch[4*DIM];
1193     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1194     int              vdwioffset0;
1195     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1196     int              vdwioffset1;
1197     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1198     int              vdwioffset2;
1199     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1200     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1201     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1202     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1203     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1204     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1205     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1206     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1207     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1208     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1209     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1210     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1211     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1212     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1213     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1214     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1215     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1216     real             *charge;
1217     int              nvdwtype;
1218     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1219     int              *vdwtype;
1220     real             *vdwparam;
1221     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1222     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1223     __m128           dummy_mask,cutoff_mask;
1224     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1225     __m128           one     = _mm_set1_ps(1.0);
1226     __m128           two     = _mm_set1_ps(2.0);
1227     x                = xx[0];
1228     f                = ff[0];
1229
1230     nri              = nlist->nri;
1231     iinr             = nlist->iinr;
1232     jindex           = nlist->jindex;
1233     jjnr             = nlist->jjnr;
1234     shiftidx         = nlist->shift;
1235     gid              = nlist->gid;
1236     shiftvec         = fr->shift_vec[0];
1237     fshift           = fr->fshift[0];
1238     facel            = _mm_set1_ps(fr->epsfac);
1239     charge           = mdatoms->chargeA;
1240     krf              = _mm_set1_ps(fr->ic->k_rf);
1241     krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
1242     crf              = _mm_set1_ps(fr->ic->c_rf);
1243     nvdwtype         = fr->ntype;
1244     vdwparam         = fr->nbfp;
1245     vdwtype          = mdatoms->typeA;
1246
1247     /* Setup water-specific parameters */
1248     inr              = nlist->iinr[0];
1249     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1250     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1251     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1252     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1253
1254     jq0              = _mm_set1_ps(charge[inr+0]);
1255     jq1              = _mm_set1_ps(charge[inr+1]);
1256     jq2              = _mm_set1_ps(charge[inr+2]);
1257     vdwjidx0A        = 2*vdwtype[inr+0];
1258     qq00             = _mm_mul_ps(iq0,jq0);
1259     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1260     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1261     qq01             = _mm_mul_ps(iq0,jq1);
1262     qq02             = _mm_mul_ps(iq0,jq2);
1263     qq10             = _mm_mul_ps(iq1,jq0);
1264     qq11             = _mm_mul_ps(iq1,jq1);
1265     qq12             = _mm_mul_ps(iq1,jq2);
1266     qq20             = _mm_mul_ps(iq2,jq0);
1267     qq21             = _mm_mul_ps(iq2,jq1);
1268     qq22             = _mm_mul_ps(iq2,jq2);
1269
1270     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1271     rcutoff_scalar   = fr->rcoulomb;
1272     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1273     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1274
1275     sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
1276     rvdw             = _mm_set1_ps(fr->rvdw);
1277
1278     /* Avoid stupid compiler warnings */
1279     jnrA = jnrB = jnrC = jnrD = 0;
1280     j_coord_offsetA = 0;
1281     j_coord_offsetB = 0;
1282     j_coord_offsetC = 0;
1283     j_coord_offsetD = 0;
1284
1285     outeriter        = 0;
1286     inneriter        = 0;
1287
1288     for(iidx=0;iidx<4*DIM;iidx++)
1289     {
1290         scratch[iidx] = 0.0;
1291     }  
1292
1293     /* Start outer loop over neighborlists */
1294     for(iidx=0; iidx<nri; iidx++)
1295     {
1296         /* Load shift vector for this list */
1297         i_shift_offset   = DIM*shiftidx[iidx];
1298
1299         /* Load limits for loop over neighbors */
1300         j_index_start    = jindex[iidx];
1301         j_index_end      = jindex[iidx+1];
1302
1303         /* Get outer coordinate index */
1304         inr              = iinr[iidx];
1305         i_coord_offset   = DIM*inr;
1306
1307         /* Load i particle coords and add shift vector */
1308         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1309                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1310         
1311         fix0             = _mm_setzero_ps();
1312         fiy0             = _mm_setzero_ps();
1313         fiz0             = _mm_setzero_ps();
1314         fix1             = _mm_setzero_ps();
1315         fiy1             = _mm_setzero_ps();
1316         fiz1             = _mm_setzero_ps();
1317         fix2             = _mm_setzero_ps();
1318         fiy2             = _mm_setzero_ps();
1319         fiz2             = _mm_setzero_ps();
1320
1321         /* Start inner kernel loop */
1322         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1323         {
1324
1325             /* Get j neighbor index, and coordinate index */
1326             jnrA             = jjnr[jidx];
1327             jnrB             = jjnr[jidx+1];
1328             jnrC             = jjnr[jidx+2];
1329             jnrD             = jjnr[jidx+3];
1330             j_coord_offsetA  = DIM*jnrA;
1331             j_coord_offsetB  = DIM*jnrB;
1332             j_coord_offsetC  = DIM*jnrC;
1333             j_coord_offsetD  = DIM*jnrD;
1334
1335             /* load j atom coordinates */
1336             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1337                                               x+j_coord_offsetC,x+j_coord_offsetD,
1338                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1339
1340             /* Calculate displacement vector */
1341             dx00             = _mm_sub_ps(ix0,jx0);
1342             dy00             = _mm_sub_ps(iy0,jy0);
1343             dz00             = _mm_sub_ps(iz0,jz0);
1344             dx01             = _mm_sub_ps(ix0,jx1);
1345             dy01             = _mm_sub_ps(iy0,jy1);
1346             dz01             = _mm_sub_ps(iz0,jz1);
1347             dx02             = _mm_sub_ps(ix0,jx2);
1348             dy02             = _mm_sub_ps(iy0,jy2);
1349             dz02             = _mm_sub_ps(iz0,jz2);
1350             dx10             = _mm_sub_ps(ix1,jx0);
1351             dy10             = _mm_sub_ps(iy1,jy0);
1352             dz10             = _mm_sub_ps(iz1,jz0);
1353             dx11             = _mm_sub_ps(ix1,jx1);
1354             dy11             = _mm_sub_ps(iy1,jy1);
1355             dz11             = _mm_sub_ps(iz1,jz1);
1356             dx12             = _mm_sub_ps(ix1,jx2);
1357             dy12             = _mm_sub_ps(iy1,jy2);
1358             dz12             = _mm_sub_ps(iz1,jz2);
1359             dx20             = _mm_sub_ps(ix2,jx0);
1360             dy20             = _mm_sub_ps(iy2,jy0);
1361             dz20             = _mm_sub_ps(iz2,jz0);
1362             dx21             = _mm_sub_ps(ix2,jx1);
1363             dy21             = _mm_sub_ps(iy2,jy1);
1364             dz21             = _mm_sub_ps(iz2,jz1);
1365             dx22             = _mm_sub_ps(ix2,jx2);
1366             dy22             = _mm_sub_ps(iy2,jy2);
1367             dz22             = _mm_sub_ps(iz2,jz2);
1368
1369             /* Calculate squared distance and things based on it */
1370             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1371             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1372             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1373             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1374             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1375             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1376             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1377             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1378             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1379
1380             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1381             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1382             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1383             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1384             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1385             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1386             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1387             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1388             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1389
1390             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1391             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1392             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1393             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1394             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1395             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1396             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1397             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1398             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1399
1400             fjx0             = _mm_setzero_ps();
1401             fjy0             = _mm_setzero_ps();
1402             fjz0             = _mm_setzero_ps();
1403             fjx1             = _mm_setzero_ps();
1404             fjy1             = _mm_setzero_ps();
1405             fjz1             = _mm_setzero_ps();
1406             fjx2             = _mm_setzero_ps();
1407             fjy2             = _mm_setzero_ps();
1408             fjz2             = _mm_setzero_ps();
1409
1410             /**************************
1411              * CALCULATE INTERACTIONS *
1412              **************************/
1413
1414             if (gmx_mm_any_lt(rsq00,rcutoff2))
1415             {
1416
1417             /* REACTION-FIELD ELECTROSTATICS */
1418             felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
1419
1420             /* LENNARD-JONES DISPERSION/REPULSION */
1421
1422             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1423             fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1424
1425             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1426
1427             fscal            = _mm_add_ps(felec,fvdw);
1428
1429             fscal            = _mm_and_ps(fscal,cutoff_mask);
1430
1431             /* Calculate temporary vectorial force */
1432             tx               = _mm_mul_ps(fscal,dx00);
1433             ty               = _mm_mul_ps(fscal,dy00);
1434             tz               = _mm_mul_ps(fscal,dz00);
1435
1436             /* Update vectorial force */
1437             fix0             = _mm_add_ps(fix0,tx);
1438             fiy0             = _mm_add_ps(fiy0,ty);
1439             fiz0             = _mm_add_ps(fiz0,tz);
1440
1441             fjx0             = _mm_add_ps(fjx0,tx);
1442             fjy0             = _mm_add_ps(fjy0,ty);
1443             fjz0             = _mm_add_ps(fjz0,tz);
1444             
1445             }
1446
1447             /**************************
1448              * CALCULATE INTERACTIONS *
1449              **************************/
1450
1451             if (gmx_mm_any_lt(rsq01,rcutoff2))
1452             {
1453
1454             /* REACTION-FIELD ELECTROSTATICS */
1455             felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
1456
1457             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
1458
1459             fscal            = felec;
1460
1461             fscal            = _mm_and_ps(fscal,cutoff_mask);
1462
1463             /* Calculate temporary vectorial force */
1464             tx               = _mm_mul_ps(fscal,dx01);
1465             ty               = _mm_mul_ps(fscal,dy01);
1466             tz               = _mm_mul_ps(fscal,dz01);
1467
1468             /* Update vectorial force */
1469             fix0             = _mm_add_ps(fix0,tx);
1470             fiy0             = _mm_add_ps(fiy0,ty);
1471             fiz0             = _mm_add_ps(fiz0,tz);
1472
1473             fjx1             = _mm_add_ps(fjx1,tx);
1474             fjy1             = _mm_add_ps(fjy1,ty);
1475             fjz1             = _mm_add_ps(fjz1,tz);
1476             
1477             }
1478
1479             /**************************
1480              * CALCULATE INTERACTIONS *
1481              **************************/
1482
1483             if (gmx_mm_any_lt(rsq02,rcutoff2))
1484             {
1485
1486             /* REACTION-FIELD ELECTROSTATICS */
1487             felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
1488
1489             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
1490
1491             fscal            = felec;
1492
1493             fscal            = _mm_and_ps(fscal,cutoff_mask);
1494
1495             /* Calculate temporary vectorial force */
1496             tx               = _mm_mul_ps(fscal,dx02);
1497             ty               = _mm_mul_ps(fscal,dy02);
1498             tz               = _mm_mul_ps(fscal,dz02);
1499
1500             /* Update vectorial force */
1501             fix0             = _mm_add_ps(fix0,tx);
1502             fiy0             = _mm_add_ps(fiy0,ty);
1503             fiz0             = _mm_add_ps(fiz0,tz);
1504
1505             fjx2             = _mm_add_ps(fjx2,tx);
1506             fjy2             = _mm_add_ps(fjy2,ty);
1507             fjz2             = _mm_add_ps(fjz2,tz);
1508             
1509             }
1510
1511             /**************************
1512              * CALCULATE INTERACTIONS *
1513              **************************/
1514
1515             if (gmx_mm_any_lt(rsq10,rcutoff2))
1516             {
1517
1518             /* REACTION-FIELD ELECTROSTATICS */
1519             felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
1520
1521             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1522
1523             fscal            = felec;
1524
1525             fscal            = _mm_and_ps(fscal,cutoff_mask);
1526
1527             /* Calculate temporary vectorial force */
1528             tx               = _mm_mul_ps(fscal,dx10);
1529             ty               = _mm_mul_ps(fscal,dy10);
1530             tz               = _mm_mul_ps(fscal,dz10);
1531
1532             /* Update vectorial force */
1533             fix1             = _mm_add_ps(fix1,tx);
1534             fiy1             = _mm_add_ps(fiy1,ty);
1535             fiz1             = _mm_add_ps(fiz1,tz);
1536
1537             fjx0             = _mm_add_ps(fjx0,tx);
1538             fjy0             = _mm_add_ps(fjy0,ty);
1539             fjz0             = _mm_add_ps(fjz0,tz);
1540             
1541             }
1542
1543             /**************************
1544              * CALCULATE INTERACTIONS *
1545              **************************/
1546
1547             if (gmx_mm_any_lt(rsq11,rcutoff2))
1548             {
1549
1550             /* REACTION-FIELD ELECTROSTATICS */
1551             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1552
1553             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1554
1555             fscal            = felec;
1556
1557             fscal            = _mm_and_ps(fscal,cutoff_mask);
1558
1559             /* Calculate temporary vectorial force */
1560             tx               = _mm_mul_ps(fscal,dx11);
1561             ty               = _mm_mul_ps(fscal,dy11);
1562             tz               = _mm_mul_ps(fscal,dz11);
1563
1564             /* Update vectorial force */
1565             fix1             = _mm_add_ps(fix1,tx);
1566             fiy1             = _mm_add_ps(fiy1,ty);
1567             fiz1             = _mm_add_ps(fiz1,tz);
1568
1569             fjx1             = _mm_add_ps(fjx1,tx);
1570             fjy1             = _mm_add_ps(fjy1,ty);
1571             fjz1             = _mm_add_ps(fjz1,tz);
1572             
1573             }
1574
1575             /**************************
1576              * CALCULATE INTERACTIONS *
1577              **************************/
1578
1579             if (gmx_mm_any_lt(rsq12,rcutoff2))
1580             {
1581
1582             /* REACTION-FIELD ELECTROSTATICS */
1583             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1584
1585             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1586
1587             fscal            = felec;
1588
1589             fscal            = _mm_and_ps(fscal,cutoff_mask);
1590
1591             /* Calculate temporary vectorial force */
1592             tx               = _mm_mul_ps(fscal,dx12);
1593             ty               = _mm_mul_ps(fscal,dy12);
1594             tz               = _mm_mul_ps(fscal,dz12);
1595
1596             /* Update vectorial force */
1597             fix1             = _mm_add_ps(fix1,tx);
1598             fiy1             = _mm_add_ps(fiy1,ty);
1599             fiz1             = _mm_add_ps(fiz1,tz);
1600
1601             fjx2             = _mm_add_ps(fjx2,tx);
1602             fjy2             = _mm_add_ps(fjy2,ty);
1603             fjz2             = _mm_add_ps(fjz2,tz);
1604             
1605             }
1606
1607             /**************************
1608              * CALCULATE INTERACTIONS *
1609              **************************/
1610
1611             if (gmx_mm_any_lt(rsq20,rcutoff2))
1612             {
1613
1614             /* REACTION-FIELD ELECTROSTATICS */
1615             felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1616
1617             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1618
1619             fscal            = felec;
1620
1621             fscal            = _mm_and_ps(fscal,cutoff_mask);
1622
1623             /* Calculate temporary vectorial force */
1624             tx               = _mm_mul_ps(fscal,dx20);
1625             ty               = _mm_mul_ps(fscal,dy20);
1626             tz               = _mm_mul_ps(fscal,dz20);
1627
1628             /* Update vectorial force */
1629             fix2             = _mm_add_ps(fix2,tx);
1630             fiy2             = _mm_add_ps(fiy2,ty);
1631             fiz2             = _mm_add_ps(fiz2,tz);
1632
1633             fjx0             = _mm_add_ps(fjx0,tx);
1634             fjy0             = _mm_add_ps(fjy0,ty);
1635             fjz0             = _mm_add_ps(fjz0,tz);
1636             
1637             }
1638
1639             /**************************
1640              * CALCULATE INTERACTIONS *
1641              **************************/
1642
1643             if (gmx_mm_any_lt(rsq21,rcutoff2))
1644             {
1645
1646             /* REACTION-FIELD ELECTROSTATICS */
1647             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1648
1649             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1650
1651             fscal            = felec;
1652
1653             fscal            = _mm_and_ps(fscal,cutoff_mask);
1654
1655             /* Calculate temporary vectorial force */
1656             tx               = _mm_mul_ps(fscal,dx21);
1657             ty               = _mm_mul_ps(fscal,dy21);
1658             tz               = _mm_mul_ps(fscal,dz21);
1659
1660             /* Update vectorial force */
1661             fix2             = _mm_add_ps(fix2,tx);
1662             fiy2             = _mm_add_ps(fiy2,ty);
1663             fiz2             = _mm_add_ps(fiz2,tz);
1664
1665             fjx1             = _mm_add_ps(fjx1,tx);
1666             fjy1             = _mm_add_ps(fjy1,ty);
1667             fjz1             = _mm_add_ps(fjz1,tz);
1668             
1669             }
1670
1671             /**************************
1672              * CALCULATE INTERACTIONS *
1673              **************************/
1674
1675             if (gmx_mm_any_lt(rsq22,rcutoff2))
1676             {
1677
1678             /* REACTION-FIELD ELECTROSTATICS */
1679             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1680
1681             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1682
1683             fscal            = felec;
1684
1685             fscal            = _mm_and_ps(fscal,cutoff_mask);
1686
1687             /* Calculate temporary vectorial force */
1688             tx               = _mm_mul_ps(fscal,dx22);
1689             ty               = _mm_mul_ps(fscal,dy22);
1690             tz               = _mm_mul_ps(fscal,dz22);
1691
1692             /* Update vectorial force */
1693             fix2             = _mm_add_ps(fix2,tx);
1694             fiy2             = _mm_add_ps(fiy2,ty);
1695             fiz2             = _mm_add_ps(fiz2,tz);
1696
1697             fjx2             = _mm_add_ps(fjx2,tx);
1698             fjy2             = _mm_add_ps(fjy2,ty);
1699             fjz2             = _mm_add_ps(fjz2,tz);
1700             
1701             }
1702
1703             fjptrA             = f+j_coord_offsetA;
1704             fjptrB             = f+j_coord_offsetB;
1705             fjptrC             = f+j_coord_offsetC;
1706             fjptrD             = f+j_coord_offsetD;
1707
1708             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1709                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1710
1711             /* Inner loop uses 277 flops */
1712         }
1713
1714         if(jidx<j_index_end)
1715         {
1716
1717             /* Get j neighbor index, and coordinate index */
1718             jnrlistA         = jjnr[jidx];
1719             jnrlistB         = jjnr[jidx+1];
1720             jnrlistC         = jjnr[jidx+2];
1721             jnrlistD         = jjnr[jidx+3];
1722             /* Sign of each element will be negative for non-real atoms.
1723              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1724              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1725              */
1726             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1727             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1728             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1729             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1730             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1731             j_coord_offsetA  = DIM*jnrA;
1732             j_coord_offsetB  = DIM*jnrB;
1733             j_coord_offsetC  = DIM*jnrC;
1734             j_coord_offsetD  = DIM*jnrD;
1735
1736             /* load j atom coordinates */
1737             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1738                                               x+j_coord_offsetC,x+j_coord_offsetD,
1739                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1740
1741             /* Calculate displacement vector */
1742             dx00             = _mm_sub_ps(ix0,jx0);
1743             dy00             = _mm_sub_ps(iy0,jy0);
1744             dz00             = _mm_sub_ps(iz0,jz0);
1745             dx01             = _mm_sub_ps(ix0,jx1);
1746             dy01             = _mm_sub_ps(iy0,jy1);
1747             dz01             = _mm_sub_ps(iz0,jz1);
1748             dx02             = _mm_sub_ps(ix0,jx2);
1749             dy02             = _mm_sub_ps(iy0,jy2);
1750             dz02             = _mm_sub_ps(iz0,jz2);
1751             dx10             = _mm_sub_ps(ix1,jx0);
1752             dy10             = _mm_sub_ps(iy1,jy0);
1753             dz10             = _mm_sub_ps(iz1,jz0);
1754             dx11             = _mm_sub_ps(ix1,jx1);
1755             dy11             = _mm_sub_ps(iy1,jy1);
1756             dz11             = _mm_sub_ps(iz1,jz1);
1757             dx12             = _mm_sub_ps(ix1,jx2);
1758             dy12             = _mm_sub_ps(iy1,jy2);
1759             dz12             = _mm_sub_ps(iz1,jz2);
1760             dx20             = _mm_sub_ps(ix2,jx0);
1761             dy20             = _mm_sub_ps(iy2,jy0);
1762             dz20             = _mm_sub_ps(iz2,jz0);
1763             dx21             = _mm_sub_ps(ix2,jx1);
1764             dy21             = _mm_sub_ps(iy2,jy1);
1765             dz21             = _mm_sub_ps(iz2,jz1);
1766             dx22             = _mm_sub_ps(ix2,jx2);
1767             dy22             = _mm_sub_ps(iy2,jy2);
1768             dz22             = _mm_sub_ps(iz2,jz2);
1769
1770             /* Calculate squared distance and things based on it */
1771             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1772             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1773             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1774             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1775             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1776             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1777             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1778             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1779             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1780
1781             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1782             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1783             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1784             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1785             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1786             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1787             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1788             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1789             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1790
1791             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1792             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1793             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1794             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1795             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1796             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1797             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1798             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1799             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1800
1801             fjx0             = _mm_setzero_ps();
1802             fjy0             = _mm_setzero_ps();
1803             fjz0             = _mm_setzero_ps();
1804             fjx1             = _mm_setzero_ps();
1805             fjy1             = _mm_setzero_ps();
1806             fjz1             = _mm_setzero_ps();
1807             fjx2             = _mm_setzero_ps();
1808             fjy2             = _mm_setzero_ps();
1809             fjz2             = _mm_setzero_ps();
1810
1811             /**************************
1812              * CALCULATE INTERACTIONS *
1813              **************************/
1814
1815             if (gmx_mm_any_lt(rsq00,rcutoff2))
1816             {
1817
1818             /* REACTION-FIELD ELECTROSTATICS */
1819             felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
1820
1821             /* LENNARD-JONES DISPERSION/REPULSION */
1822
1823             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1824             fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1825
1826             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1827
1828             fscal            = _mm_add_ps(felec,fvdw);
1829
1830             fscal            = _mm_and_ps(fscal,cutoff_mask);
1831
1832             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1833
1834             /* Calculate temporary vectorial force */
1835             tx               = _mm_mul_ps(fscal,dx00);
1836             ty               = _mm_mul_ps(fscal,dy00);
1837             tz               = _mm_mul_ps(fscal,dz00);
1838
1839             /* Update vectorial force */
1840             fix0             = _mm_add_ps(fix0,tx);
1841             fiy0             = _mm_add_ps(fiy0,ty);
1842             fiz0             = _mm_add_ps(fiz0,tz);
1843
1844             fjx0             = _mm_add_ps(fjx0,tx);
1845             fjy0             = _mm_add_ps(fjy0,ty);
1846             fjz0             = _mm_add_ps(fjz0,tz);
1847             
1848             }
1849
1850             /**************************
1851              * CALCULATE INTERACTIONS *
1852              **************************/
1853
1854             if (gmx_mm_any_lt(rsq01,rcutoff2))
1855             {
1856
1857             /* REACTION-FIELD ELECTROSTATICS */
1858             felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
1859
1860             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
1861
1862             fscal            = felec;
1863
1864             fscal            = _mm_and_ps(fscal,cutoff_mask);
1865
1866             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1867
1868             /* Calculate temporary vectorial force */
1869             tx               = _mm_mul_ps(fscal,dx01);
1870             ty               = _mm_mul_ps(fscal,dy01);
1871             tz               = _mm_mul_ps(fscal,dz01);
1872
1873             /* Update vectorial force */
1874             fix0             = _mm_add_ps(fix0,tx);
1875             fiy0             = _mm_add_ps(fiy0,ty);
1876             fiz0             = _mm_add_ps(fiz0,tz);
1877
1878             fjx1             = _mm_add_ps(fjx1,tx);
1879             fjy1             = _mm_add_ps(fjy1,ty);
1880             fjz1             = _mm_add_ps(fjz1,tz);
1881             
1882             }
1883
1884             /**************************
1885              * CALCULATE INTERACTIONS *
1886              **************************/
1887
1888             if (gmx_mm_any_lt(rsq02,rcutoff2))
1889             {
1890
1891             /* REACTION-FIELD ELECTROSTATICS */
1892             felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
1893
1894             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
1895
1896             fscal            = felec;
1897
1898             fscal            = _mm_and_ps(fscal,cutoff_mask);
1899
1900             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1901
1902             /* Calculate temporary vectorial force */
1903             tx               = _mm_mul_ps(fscal,dx02);
1904             ty               = _mm_mul_ps(fscal,dy02);
1905             tz               = _mm_mul_ps(fscal,dz02);
1906
1907             /* Update vectorial force */
1908             fix0             = _mm_add_ps(fix0,tx);
1909             fiy0             = _mm_add_ps(fiy0,ty);
1910             fiz0             = _mm_add_ps(fiz0,tz);
1911
1912             fjx2             = _mm_add_ps(fjx2,tx);
1913             fjy2             = _mm_add_ps(fjy2,ty);
1914             fjz2             = _mm_add_ps(fjz2,tz);
1915             
1916             }
1917
1918             /**************************
1919              * CALCULATE INTERACTIONS *
1920              **************************/
1921
1922             if (gmx_mm_any_lt(rsq10,rcutoff2))
1923             {
1924
1925             /* REACTION-FIELD ELECTROSTATICS */
1926             felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
1927
1928             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1929
1930             fscal            = felec;
1931
1932             fscal            = _mm_and_ps(fscal,cutoff_mask);
1933
1934             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1935
1936             /* Calculate temporary vectorial force */
1937             tx               = _mm_mul_ps(fscal,dx10);
1938             ty               = _mm_mul_ps(fscal,dy10);
1939             tz               = _mm_mul_ps(fscal,dz10);
1940
1941             /* Update vectorial force */
1942             fix1             = _mm_add_ps(fix1,tx);
1943             fiy1             = _mm_add_ps(fiy1,ty);
1944             fiz1             = _mm_add_ps(fiz1,tz);
1945
1946             fjx0             = _mm_add_ps(fjx0,tx);
1947             fjy0             = _mm_add_ps(fjy0,ty);
1948             fjz0             = _mm_add_ps(fjz0,tz);
1949             
1950             }
1951
1952             /**************************
1953              * CALCULATE INTERACTIONS *
1954              **************************/
1955
1956             if (gmx_mm_any_lt(rsq11,rcutoff2))
1957             {
1958
1959             /* REACTION-FIELD ELECTROSTATICS */
1960             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1961
1962             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1963
1964             fscal            = felec;
1965
1966             fscal            = _mm_and_ps(fscal,cutoff_mask);
1967
1968             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1969
1970             /* Calculate temporary vectorial force */
1971             tx               = _mm_mul_ps(fscal,dx11);
1972             ty               = _mm_mul_ps(fscal,dy11);
1973             tz               = _mm_mul_ps(fscal,dz11);
1974
1975             /* Update vectorial force */
1976             fix1             = _mm_add_ps(fix1,tx);
1977             fiy1             = _mm_add_ps(fiy1,ty);
1978             fiz1             = _mm_add_ps(fiz1,tz);
1979
1980             fjx1             = _mm_add_ps(fjx1,tx);
1981             fjy1             = _mm_add_ps(fjy1,ty);
1982             fjz1             = _mm_add_ps(fjz1,tz);
1983             
1984             }
1985
1986             /**************************
1987              * CALCULATE INTERACTIONS *
1988              **************************/
1989
1990             if (gmx_mm_any_lt(rsq12,rcutoff2))
1991             {
1992
1993             /* REACTION-FIELD ELECTROSTATICS */
1994             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1995
1996             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1997
1998             fscal            = felec;
1999
2000             fscal            = _mm_and_ps(fscal,cutoff_mask);
2001
2002             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2003
2004             /* Calculate temporary vectorial force */
2005             tx               = _mm_mul_ps(fscal,dx12);
2006             ty               = _mm_mul_ps(fscal,dy12);
2007             tz               = _mm_mul_ps(fscal,dz12);
2008
2009             /* Update vectorial force */
2010             fix1             = _mm_add_ps(fix1,tx);
2011             fiy1             = _mm_add_ps(fiy1,ty);
2012             fiz1             = _mm_add_ps(fiz1,tz);
2013
2014             fjx2             = _mm_add_ps(fjx2,tx);
2015             fjy2             = _mm_add_ps(fjy2,ty);
2016             fjz2             = _mm_add_ps(fjz2,tz);
2017             
2018             }
2019
2020             /**************************
2021              * CALCULATE INTERACTIONS *
2022              **************************/
2023
2024             if (gmx_mm_any_lt(rsq20,rcutoff2))
2025             {
2026
2027             /* REACTION-FIELD ELECTROSTATICS */
2028             felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
2029
2030             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
2031
2032             fscal            = felec;
2033
2034             fscal            = _mm_and_ps(fscal,cutoff_mask);
2035
2036             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2037
2038             /* Calculate temporary vectorial force */
2039             tx               = _mm_mul_ps(fscal,dx20);
2040             ty               = _mm_mul_ps(fscal,dy20);
2041             tz               = _mm_mul_ps(fscal,dz20);
2042
2043             /* Update vectorial force */
2044             fix2             = _mm_add_ps(fix2,tx);
2045             fiy2             = _mm_add_ps(fiy2,ty);
2046             fiz2             = _mm_add_ps(fiz2,tz);
2047
2048             fjx0             = _mm_add_ps(fjx0,tx);
2049             fjy0             = _mm_add_ps(fjy0,ty);
2050             fjz0             = _mm_add_ps(fjz0,tz);
2051             
2052             }
2053
2054             /**************************
2055              * CALCULATE INTERACTIONS *
2056              **************************/
2057
2058             if (gmx_mm_any_lt(rsq21,rcutoff2))
2059             {
2060
2061             /* REACTION-FIELD ELECTROSTATICS */
2062             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
2063
2064             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2065
2066             fscal            = felec;
2067
2068             fscal            = _mm_and_ps(fscal,cutoff_mask);
2069
2070             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2071
2072             /* Calculate temporary vectorial force */
2073             tx               = _mm_mul_ps(fscal,dx21);
2074             ty               = _mm_mul_ps(fscal,dy21);
2075             tz               = _mm_mul_ps(fscal,dz21);
2076
2077             /* Update vectorial force */
2078             fix2             = _mm_add_ps(fix2,tx);
2079             fiy2             = _mm_add_ps(fiy2,ty);
2080             fiz2             = _mm_add_ps(fiz2,tz);
2081
2082             fjx1             = _mm_add_ps(fjx1,tx);
2083             fjy1             = _mm_add_ps(fjy1,ty);
2084             fjz1             = _mm_add_ps(fjz1,tz);
2085             
2086             }
2087
2088             /**************************
2089              * CALCULATE INTERACTIONS *
2090              **************************/
2091
2092             if (gmx_mm_any_lt(rsq22,rcutoff2))
2093             {
2094
2095             /* REACTION-FIELD ELECTROSTATICS */
2096             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
2097
2098             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2099
2100             fscal            = felec;
2101
2102             fscal            = _mm_and_ps(fscal,cutoff_mask);
2103
2104             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2105
2106             /* Calculate temporary vectorial force */
2107             tx               = _mm_mul_ps(fscal,dx22);
2108             ty               = _mm_mul_ps(fscal,dy22);
2109             tz               = _mm_mul_ps(fscal,dz22);
2110
2111             /* Update vectorial force */
2112             fix2             = _mm_add_ps(fix2,tx);
2113             fiy2             = _mm_add_ps(fiy2,ty);
2114             fiz2             = _mm_add_ps(fiz2,tz);
2115
2116             fjx2             = _mm_add_ps(fjx2,tx);
2117             fjy2             = _mm_add_ps(fjy2,ty);
2118             fjz2             = _mm_add_ps(fjz2,tz);
2119             
2120             }
2121
2122             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2123             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2124             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2125             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2126
2127             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2128                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2129
2130             /* Inner loop uses 277 flops */
2131         }
2132
2133         /* End of innermost loop */
2134
2135         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2136                                               f+i_coord_offset,fshift+i_shift_offset);
2137
2138         /* Increment number of inner iterations */
2139         inneriter                  += j_index_end - j_index_start;
2140
2141         /* Outer loop uses 18 flops */
2142     }
2143
2144     /* Increment number of outer iterations */
2145     outeriter        += nri;
2146
2147     /* Update outer/inner flops */
2148
2149     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*277);
2150 }