093d32a297ca5cd35f2afa8f16b30f46f6bd72a6
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_sse2_single.c
1 /*
2  * Note: this file was generated by the Gromacs sse2_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sse2_single
38  * Electrostatics interaction: ReactionField
39  * VdW interaction:            LennardJones
40  * Geometry:                   Water4-Water4
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sse2_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
64     real             rcutoff_scalar;
65     real             *shiftvec,*fshift,*x,*f;
66     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
67     real             scratch[4*DIM];
68     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
69     int              vdwioffset0;
70     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
71     int              vdwioffset1;
72     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
73     int              vdwioffset2;
74     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
75     int              vdwioffset3;
76     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
77     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
78     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
79     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
80     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
81     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
82     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
83     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
84     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
85     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
86     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
87     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
88     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
89     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
90     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
91     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
92     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
93     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
94     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
95     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
96     real             *charge;
97     int              nvdwtype;
98     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
99     int              *vdwtype;
100     real             *vdwparam;
101     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
102     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
103     __m128           dummy_mask,cutoff_mask;
104     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
105     __m128           one     = _mm_set1_ps(1.0);
106     __m128           two     = _mm_set1_ps(2.0);
107     x                = xx[0];
108     f                = ff[0];
109
110     nri              = nlist->nri;
111     iinr             = nlist->iinr;
112     jindex           = nlist->jindex;
113     jjnr             = nlist->jjnr;
114     shiftidx         = nlist->shift;
115     gid              = nlist->gid;
116     shiftvec         = fr->shift_vec[0];
117     fshift           = fr->fshift[0];
118     facel            = _mm_set1_ps(fr->epsfac);
119     charge           = mdatoms->chargeA;
120     krf              = _mm_set1_ps(fr->ic->k_rf);
121     krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
122     crf              = _mm_set1_ps(fr->ic->c_rf);
123     nvdwtype         = fr->ntype;
124     vdwparam         = fr->nbfp;
125     vdwtype          = mdatoms->typeA;
126
127     /* Setup water-specific parameters */
128     inr              = nlist->iinr[0];
129     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
130     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
131     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
132     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
133
134     jq1              = _mm_set1_ps(charge[inr+1]);
135     jq2              = _mm_set1_ps(charge[inr+2]);
136     jq3              = _mm_set1_ps(charge[inr+3]);
137     vdwjidx0A        = 2*vdwtype[inr+0];
138     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
139     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
140     qq11             = _mm_mul_ps(iq1,jq1);
141     qq12             = _mm_mul_ps(iq1,jq2);
142     qq13             = _mm_mul_ps(iq1,jq3);
143     qq21             = _mm_mul_ps(iq2,jq1);
144     qq22             = _mm_mul_ps(iq2,jq2);
145     qq23             = _mm_mul_ps(iq2,jq3);
146     qq31             = _mm_mul_ps(iq3,jq1);
147     qq32             = _mm_mul_ps(iq3,jq2);
148     qq33             = _mm_mul_ps(iq3,jq3);
149
150     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
151     rcutoff_scalar   = fr->rcoulomb;
152     rcutoff          = _mm_set1_ps(rcutoff_scalar);
153     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
154
155     sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
156     rvdw             = _mm_set1_ps(fr->rvdw);
157
158     /* Avoid stupid compiler warnings */
159     jnrA = jnrB = jnrC = jnrD = 0;
160     j_coord_offsetA = 0;
161     j_coord_offsetB = 0;
162     j_coord_offsetC = 0;
163     j_coord_offsetD = 0;
164
165     outeriter        = 0;
166     inneriter        = 0;
167
168     for(iidx=0;iidx<4*DIM;iidx++)
169     {
170         scratch[iidx] = 0.0;
171     }  
172
173     /* Start outer loop over neighborlists */
174     for(iidx=0; iidx<nri; iidx++)
175     {
176         /* Load shift vector for this list */
177         i_shift_offset   = DIM*shiftidx[iidx];
178
179         /* Load limits for loop over neighbors */
180         j_index_start    = jindex[iidx];
181         j_index_end      = jindex[iidx+1];
182
183         /* Get outer coordinate index */
184         inr              = iinr[iidx];
185         i_coord_offset   = DIM*inr;
186
187         /* Load i particle coords and add shift vector */
188         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
189                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
190         
191         fix0             = _mm_setzero_ps();
192         fiy0             = _mm_setzero_ps();
193         fiz0             = _mm_setzero_ps();
194         fix1             = _mm_setzero_ps();
195         fiy1             = _mm_setzero_ps();
196         fiz1             = _mm_setzero_ps();
197         fix2             = _mm_setzero_ps();
198         fiy2             = _mm_setzero_ps();
199         fiz2             = _mm_setzero_ps();
200         fix3             = _mm_setzero_ps();
201         fiy3             = _mm_setzero_ps();
202         fiz3             = _mm_setzero_ps();
203
204         /* Reset potential sums */
205         velecsum         = _mm_setzero_ps();
206         vvdwsum          = _mm_setzero_ps();
207
208         /* Start inner kernel loop */
209         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
210         {
211
212             /* Get j neighbor index, and coordinate index */
213             jnrA             = jjnr[jidx];
214             jnrB             = jjnr[jidx+1];
215             jnrC             = jjnr[jidx+2];
216             jnrD             = jjnr[jidx+3];
217             j_coord_offsetA  = DIM*jnrA;
218             j_coord_offsetB  = DIM*jnrB;
219             j_coord_offsetC  = DIM*jnrC;
220             j_coord_offsetD  = DIM*jnrD;
221
222             /* load j atom coordinates */
223             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
224                                               x+j_coord_offsetC,x+j_coord_offsetD,
225                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
226                                               &jy2,&jz2,&jx3,&jy3,&jz3);
227
228             /* Calculate displacement vector */
229             dx00             = _mm_sub_ps(ix0,jx0);
230             dy00             = _mm_sub_ps(iy0,jy0);
231             dz00             = _mm_sub_ps(iz0,jz0);
232             dx11             = _mm_sub_ps(ix1,jx1);
233             dy11             = _mm_sub_ps(iy1,jy1);
234             dz11             = _mm_sub_ps(iz1,jz1);
235             dx12             = _mm_sub_ps(ix1,jx2);
236             dy12             = _mm_sub_ps(iy1,jy2);
237             dz12             = _mm_sub_ps(iz1,jz2);
238             dx13             = _mm_sub_ps(ix1,jx3);
239             dy13             = _mm_sub_ps(iy1,jy3);
240             dz13             = _mm_sub_ps(iz1,jz3);
241             dx21             = _mm_sub_ps(ix2,jx1);
242             dy21             = _mm_sub_ps(iy2,jy1);
243             dz21             = _mm_sub_ps(iz2,jz1);
244             dx22             = _mm_sub_ps(ix2,jx2);
245             dy22             = _mm_sub_ps(iy2,jy2);
246             dz22             = _mm_sub_ps(iz2,jz2);
247             dx23             = _mm_sub_ps(ix2,jx3);
248             dy23             = _mm_sub_ps(iy2,jy3);
249             dz23             = _mm_sub_ps(iz2,jz3);
250             dx31             = _mm_sub_ps(ix3,jx1);
251             dy31             = _mm_sub_ps(iy3,jy1);
252             dz31             = _mm_sub_ps(iz3,jz1);
253             dx32             = _mm_sub_ps(ix3,jx2);
254             dy32             = _mm_sub_ps(iy3,jy2);
255             dz32             = _mm_sub_ps(iz3,jz2);
256             dx33             = _mm_sub_ps(ix3,jx3);
257             dy33             = _mm_sub_ps(iy3,jy3);
258             dz33             = _mm_sub_ps(iz3,jz3);
259
260             /* Calculate squared distance and things based on it */
261             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
262             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
263             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
264             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
265             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
266             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
267             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
268             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
269             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
270             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
271
272             rinv11           = gmx_mm_invsqrt_ps(rsq11);
273             rinv12           = gmx_mm_invsqrt_ps(rsq12);
274             rinv13           = gmx_mm_invsqrt_ps(rsq13);
275             rinv21           = gmx_mm_invsqrt_ps(rsq21);
276             rinv22           = gmx_mm_invsqrt_ps(rsq22);
277             rinv23           = gmx_mm_invsqrt_ps(rsq23);
278             rinv31           = gmx_mm_invsqrt_ps(rsq31);
279             rinv32           = gmx_mm_invsqrt_ps(rsq32);
280             rinv33           = gmx_mm_invsqrt_ps(rsq33);
281
282             rinvsq00         = gmx_mm_inv_ps(rsq00);
283             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
284             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
285             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
286             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
287             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
288             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
289             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
290             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
291             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
292
293             fjx0             = _mm_setzero_ps();
294             fjy0             = _mm_setzero_ps();
295             fjz0             = _mm_setzero_ps();
296             fjx1             = _mm_setzero_ps();
297             fjy1             = _mm_setzero_ps();
298             fjz1             = _mm_setzero_ps();
299             fjx2             = _mm_setzero_ps();
300             fjy2             = _mm_setzero_ps();
301             fjz2             = _mm_setzero_ps();
302             fjx3             = _mm_setzero_ps();
303             fjy3             = _mm_setzero_ps();
304             fjz3             = _mm_setzero_ps();
305
306             /**************************
307              * CALCULATE INTERACTIONS *
308              **************************/
309
310             if (gmx_mm_any_lt(rsq00,rcutoff2))
311             {
312
313             /* LENNARD-JONES DISPERSION/REPULSION */
314
315             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
316             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
317             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
318             vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
319                                           _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
320             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
321
322             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
323
324             /* Update potential sum for this i atom from the interaction with this j atom. */
325             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
326             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
327
328             fscal            = fvdw;
329
330             fscal            = _mm_and_ps(fscal,cutoff_mask);
331
332             /* Calculate temporary vectorial force */
333             tx               = _mm_mul_ps(fscal,dx00);
334             ty               = _mm_mul_ps(fscal,dy00);
335             tz               = _mm_mul_ps(fscal,dz00);
336
337             /* Update vectorial force */
338             fix0             = _mm_add_ps(fix0,tx);
339             fiy0             = _mm_add_ps(fiy0,ty);
340             fiz0             = _mm_add_ps(fiz0,tz);
341
342             fjx0             = _mm_add_ps(fjx0,tx);
343             fjy0             = _mm_add_ps(fjy0,ty);
344             fjz0             = _mm_add_ps(fjz0,tz);
345             
346             }
347
348             /**************************
349              * CALCULATE INTERACTIONS *
350              **************************/
351
352             if (gmx_mm_any_lt(rsq11,rcutoff2))
353             {
354
355             /* REACTION-FIELD ELECTROSTATICS */
356             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
357             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
358
359             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
360
361             /* Update potential sum for this i atom from the interaction with this j atom. */
362             velec            = _mm_and_ps(velec,cutoff_mask);
363             velecsum         = _mm_add_ps(velecsum,velec);
364
365             fscal            = felec;
366
367             fscal            = _mm_and_ps(fscal,cutoff_mask);
368
369             /* Calculate temporary vectorial force */
370             tx               = _mm_mul_ps(fscal,dx11);
371             ty               = _mm_mul_ps(fscal,dy11);
372             tz               = _mm_mul_ps(fscal,dz11);
373
374             /* Update vectorial force */
375             fix1             = _mm_add_ps(fix1,tx);
376             fiy1             = _mm_add_ps(fiy1,ty);
377             fiz1             = _mm_add_ps(fiz1,tz);
378
379             fjx1             = _mm_add_ps(fjx1,tx);
380             fjy1             = _mm_add_ps(fjy1,ty);
381             fjz1             = _mm_add_ps(fjz1,tz);
382             
383             }
384
385             /**************************
386              * CALCULATE INTERACTIONS *
387              **************************/
388
389             if (gmx_mm_any_lt(rsq12,rcutoff2))
390             {
391
392             /* REACTION-FIELD ELECTROSTATICS */
393             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
394             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
395
396             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
397
398             /* Update potential sum for this i atom from the interaction with this j atom. */
399             velec            = _mm_and_ps(velec,cutoff_mask);
400             velecsum         = _mm_add_ps(velecsum,velec);
401
402             fscal            = felec;
403
404             fscal            = _mm_and_ps(fscal,cutoff_mask);
405
406             /* Calculate temporary vectorial force */
407             tx               = _mm_mul_ps(fscal,dx12);
408             ty               = _mm_mul_ps(fscal,dy12);
409             tz               = _mm_mul_ps(fscal,dz12);
410
411             /* Update vectorial force */
412             fix1             = _mm_add_ps(fix1,tx);
413             fiy1             = _mm_add_ps(fiy1,ty);
414             fiz1             = _mm_add_ps(fiz1,tz);
415
416             fjx2             = _mm_add_ps(fjx2,tx);
417             fjy2             = _mm_add_ps(fjy2,ty);
418             fjz2             = _mm_add_ps(fjz2,tz);
419             
420             }
421
422             /**************************
423              * CALCULATE INTERACTIONS *
424              **************************/
425
426             if (gmx_mm_any_lt(rsq13,rcutoff2))
427             {
428
429             /* REACTION-FIELD ELECTROSTATICS */
430             velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
431             felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
432
433             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
434
435             /* Update potential sum for this i atom from the interaction with this j atom. */
436             velec            = _mm_and_ps(velec,cutoff_mask);
437             velecsum         = _mm_add_ps(velecsum,velec);
438
439             fscal            = felec;
440
441             fscal            = _mm_and_ps(fscal,cutoff_mask);
442
443             /* Calculate temporary vectorial force */
444             tx               = _mm_mul_ps(fscal,dx13);
445             ty               = _mm_mul_ps(fscal,dy13);
446             tz               = _mm_mul_ps(fscal,dz13);
447
448             /* Update vectorial force */
449             fix1             = _mm_add_ps(fix1,tx);
450             fiy1             = _mm_add_ps(fiy1,ty);
451             fiz1             = _mm_add_ps(fiz1,tz);
452
453             fjx3             = _mm_add_ps(fjx3,tx);
454             fjy3             = _mm_add_ps(fjy3,ty);
455             fjz3             = _mm_add_ps(fjz3,tz);
456             
457             }
458
459             /**************************
460              * CALCULATE INTERACTIONS *
461              **************************/
462
463             if (gmx_mm_any_lt(rsq21,rcutoff2))
464             {
465
466             /* REACTION-FIELD ELECTROSTATICS */
467             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
468             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
469
470             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
471
472             /* Update potential sum for this i atom from the interaction with this j atom. */
473             velec            = _mm_and_ps(velec,cutoff_mask);
474             velecsum         = _mm_add_ps(velecsum,velec);
475
476             fscal            = felec;
477
478             fscal            = _mm_and_ps(fscal,cutoff_mask);
479
480             /* Calculate temporary vectorial force */
481             tx               = _mm_mul_ps(fscal,dx21);
482             ty               = _mm_mul_ps(fscal,dy21);
483             tz               = _mm_mul_ps(fscal,dz21);
484
485             /* Update vectorial force */
486             fix2             = _mm_add_ps(fix2,tx);
487             fiy2             = _mm_add_ps(fiy2,ty);
488             fiz2             = _mm_add_ps(fiz2,tz);
489
490             fjx1             = _mm_add_ps(fjx1,tx);
491             fjy1             = _mm_add_ps(fjy1,ty);
492             fjz1             = _mm_add_ps(fjz1,tz);
493             
494             }
495
496             /**************************
497              * CALCULATE INTERACTIONS *
498              **************************/
499
500             if (gmx_mm_any_lt(rsq22,rcutoff2))
501             {
502
503             /* REACTION-FIELD ELECTROSTATICS */
504             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
505             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
506
507             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
508
509             /* Update potential sum for this i atom from the interaction with this j atom. */
510             velec            = _mm_and_ps(velec,cutoff_mask);
511             velecsum         = _mm_add_ps(velecsum,velec);
512
513             fscal            = felec;
514
515             fscal            = _mm_and_ps(fscal,cutoff_mask);
516
517             /* Calculate temporary vectorial force */
518             tx               = _mm_mul_ps(fscal,dx22);
519             ty               = _mm_mul_ps(fscal,dy22);
520             tz               = _mm_mul_ps(fscal,dz22);
521
522             /* Update vectorial force */
523             fix2             = _mm_add_ps(fix2,tx);
524             fiy2             = _mm_add_ps(fiy2,ty);
525             fiz2             = _mm_add_ps(fiz2,tz);
526
527             fjx2             = _mm_add_ps(fjx2,tx);
528             fjy2             = _mm_add_ps(fjy2,ty);
529             fjz2             = _mm_add_ps(fjz2,tz);
530             
531             }
532
533             /**************************
534              * CALCULATE INTERACTIONS *
535              **************************/
536
537             if (gmx_mm_any_lt(rsq23,rcutoff2))
538             {
539
540             /* REACTION-FIELD ELECTROSTATICS */
541             velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
542             felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
543
544             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
545
546             /* Update potential sum for this i atom from the interaction with this j atom. */
547             velec            = _mm_and_ps(velec,cutoff_mask);
548             velecsum         = _mm_add_ps(velecsum,velec);
549
550             fscal            = felec;
551
552             fscal            = _mm_and_ps(fscal,cutoff_mask);
553
554             /* Calculate temporary vectorial force */
555             tx               = _mm_mul_ps(fscal,dx23);
556             ty               = _mm_mul_ps(fscal,dy23);
557             tz               = _mm_mul_ps(fscal,dz23);
558
559             /* Update vectorial force */
560             fix2             = _mm_add_ps(fix2,tx);
561             fiy2             = _mm_add_ps(fiy2,ty);
562             fiz2             = _mm_add_ps(fiz2,tz);
563
564             fjx3             = _mm_add_ps(fjx3,tx);
565             fjy3             = _mm_add_ps(fjy3,ty);
566             fjz3             = _mm_add_ps(fjz3,tz);
567             
568             }
569
570             /**************************
571              * CALCULATE INTERACTIONS *
572              **************************/
573
574             if (gmx_mm_any_lt(rsq31,rcutoff2))
575             {
576
577             /* REACTION-FIELD ELECTROSTATICS */
578             velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
579             felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
580
581             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
582
583             /* Update potential sum for this i atom from the interaction with this j atom. */
584             velec            = _mm_and_ps(velec,cutoff_mask);
585             velecsum         = _mm_add_ps(velecsum,velec);
586
587             fscal            = felec;
588
589             fscal            = _mm_and_ps(fscal,cutoff_mask);
590
591             /* Calculate temporary vectorial force */
592             tx               = _mm_mul_ps(fscal,dx31);
593             ty               = _mm_mul_ps(fscal,dy31);
594             tz               = _mm_mul_ps(fscal,dz31);
595
596             /* Update vectorial force */
597             fix3             = _mm_add_ps(fix3,tx);
598             fiy3             = _mm_add_ps(fiy3,ty);
599             fiz3             = _mm_add_ps(fiz3,tz);
600
601             fjx1             = _mm_add_ps(fjx1,tx);
602             fjy1             = _mm_add_ps(fjy1,ty);
603             fjz1             = _mm_add_ps(fjz1,tz);
604             
605             }
606
607             /**************************
608              * CALCULATE INTERACTIONS *
609              **************************/
610
611             if (gmx_mm_any_lt(rsq32,rcutoff2))
612             {
613
614             /* REACTION-FIELD ELECTROSTATICS */
615             velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
616             felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
617
618             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
619
620             /* Update potential sum for this i atom from the interaction with this j atom. */
621             velec            = _mm_and_ps(velec,cutoff_mask);
622             velecsum         = _mm_add_ps(velecsum,velec);
623
624             fscal            = felec;
625
626             fscal            = _mm_and_ps(fscal,cutoff_mask);
627
628             /* Calculate temporary vectorial force */
629             tx               = _mm_mul_ps(fscal,dx32);
630             ty               = _mm_mul_ps(fscal,dy32);
631             tz               = _mm_mul_ps(fscal,dz32);
632
633             /* Update vectorial force */
634             fix3             = _mm_add_ps(fix3,tx);
635             fiy3             = _mm_add_ps(fiy3,ty);
636             fiz3             = _mm_add_ps(fiz3,tz);
637
638             fjx2             = _mm_add_ps(fjx2,tx);
639             fjy2             = _mm_add_ps(fjy2,ty);
640             fjz2             = _mm_add_ps(fjz2,tz);
641             
642             }
643
644             /**************************
645              * CALCULATE INTERACTIONS *
646              **************************/
647
648             if (gmx_mm_any_lt(rsq33,rcutoff2))
649             {
650
651             /* REACTION-FIELD ELECTROSTATICS */
652             velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
653             felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
654
655             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
656
657             /* Update potential sum for this i atom from the interaction with this j atom. */
658             velec            = _mm_and_ps(velec,cutoff_mask);
659             velecsum         = _mm_add_ps(velecsum,velec);
660
661             fscal            = felec;
662
663             fscal            = _mm_and_ps(fscal,cutoff_mask);
664
665             /* Calculate temporary vectorial force */
666             tx               = _mm_mul_ps(fscal,dx33);
667             ty               = _mm_mul_ps(fscal,dy33);
668             tz               = _mm_mul_ps(fscal,dz33);
669
670             /* Update vectorial force */
671             fix3             = _mm_add_ps(fix3,tx);
672             fiy3             = _mm_add_ps(fiy3,ty);
673             fiz3             = _mm_add_ps(fiz3,tz);
674
675             fjx3             = _mm_add_ps(fjx3,tx);
676             fjy3             = _mm_add_ps(fjy3,ty);
677             fjz3             = _mm_add_ps(fjz3,tz);
678             
679             }
680
681             fjptrA             = f+j_coord_offsetA;
682             fjptrB             = f+j_coord_offsetB;
683             fjptrC             = f+j_coord_offsetC;
684             fjptrD             = f+j_coord_offsetD;
685
686             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
687                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
688                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
689
690             /* Inner loop uses 368 flops */
691         }
692
693         if(jidx<j_index_end)
694         {
695
696             /* Get j neighbor index, and coordinate index */
697             jnrlistA         = jjnr[jidx];
698             jnrlistB         = jjnr[jidx+1];
699             jnrlistC         = jjnr[jidx+2];
700             jnrlistD         = jjnr[jidx+3];
701             /* Sign of each element will be negative for non-real atoms.
702              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
703              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
704              */
705             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
706             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
707             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
708             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
709             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
710             j_coord_offsetA  = DIM*jnrA;
711             j_coord_offsetB  = DIM*jnrB;
712             j_coord_offsetC  = DIM*jnrC;
713             j_coord_offsetD  = DIM*jnrD;
714
715             /* load j atom coordinates */
716             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
717                                               x+j_coord_offsetC,x+j_coord_offsetD,
718                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
719                                               &jy2,&jz2,&jx3,&jy3,&jz3);
720
721             /* Calculate displacement vector */
722             dx00             = _mm_sub_ps(ix0,jx0);
723             dy00             = _mm_sub_ps(iy0,jy0);
724             dz00             = _mm_sub_ps(iz0,jz0);
725             dx11             = _mm_sub_ps(ix1,jx1);
726             dy11             = _mm_sub_ps(iy1,jy1);
727             dz11             = _mm_sub_ps(iz1,jz1);
728             dx12             = _mm_sub_ps(ix1,jx2);
729             dy12             = _mm_sub_ps(iy1,jy2);
730             dz12             = _mm_sub_ps(iz1,jz2);
731             dx13             = _mm_sub_ps(ix1,jx3);
732             dy13             = _mm_sub_ps(iy1,jy3);
733             dz13             = _mm_sub_ps(iz1,jz3);
734             dx21             = _mm_sub_ps(ix2,jx1);
735             dy21             = _mm_sub_ps(iy2,jy1);
736             dz21             = _mm_sub_ps(iz2,jz1);
737             dx22             = _mm_sub_ps(ix2,jx2);
738             dy22             = _mm_sub_ps(iy2,jy2);
739             dz22             = _mm_sub_ps(iz2,jz2);
740             dx23             = _mm_sub_ps(ix2,jx3);
741             dy23             = _mm_sub_ps(iy2,jy3);
742             dz23             = _mm_sub_ps(iz2,jz3);
743             dx31             = _mm_sub_ps(ix3,jx1);
744             dy31             = _mm_sub_ps(iy3,jy1);
745             dz31             = _mm_sub_ps(iz3,jz1);
746             dx32             = _mm_sub_ps(ix3,jx2);
747             dy32             = _mm_sub_ps(iy3,jy2);
748             dz32             = _mm_sub_ps(iz3,jz2);
749             dx33             = _mm_sub_ps(ix3,jx3);
750             dy33             = _mm_sub_ps(iy3,jy3);
751             dz33             = _mm_sub_ps(iz3,jz3);
752
753             /* Calculate squared distance and things based on it */
754             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
755             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
756             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
757             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
758             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
759             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
760             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
761             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
762             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
763             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
764
765             rinv11           = gmx_mm_invsqrt_ps(rsq11);
766             rinv12           = gmx_mm_invsqrt_ps(rsq12);
767             rinv13           = gmx_mm_invsqrt_ps(rsq13);
768             rinv21           = gmx_mm_invsqrt_ps(rsq21);
769             rinv22           = gmx_mm_invsqrt_ps(rsq22);
770             rinv23           = gmx_mm_invsqrt_ps(rsq23);
771             rinv31           = gmx_mm_invsqrt_ps(rsq31);
772             rinv32           = gmx_mm_invsqrt_ps(rsq32);
773             rinv33           = gmx_mm_invsqrt_ps(rsq33);
774
775             rinvsq00         = gmx_mm_inv_ps(rsq00);
776             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
777             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
778             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
779             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
780             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
781             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
782             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
783             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
784             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
785
786             fjx0             = _mm_setzero_ps();
787             fjy0             = _mm_setzero_ps();
788             fjz0             = _mm_setzero_ps();
789             fjx1             = _mm_setzero_ps();
790             fjy1             = _mm_setzero_ps();
791             fjz1             = _mm_setzero_ps();
792             fjx2             = _mm_setzero_ps();
793             fjy2             = _mm_setzero_ps();
794             fjz2             = _mm_setzero_ps();
795             fjx3             = _mm_setzero_ps();
796             fjy3             = _mm_setzero_ps();
797             fjz3             = _mm_setzero_ps();
798
799             /**************************
800              * CALCULATE INTERACTIONS *
801              **************************/
802
803             if (gmx_mm_any_lt(rsq00,rcutoff2))
804             {
805
806             /* LENNARD-JONES DISPERSION/REPULSION */
807
808             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
809             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
810             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
811             vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
812                                           _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
813             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
814
815             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
816
817             /* Update potential sum for this i atom from the interaction with this j atom. */
818             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
819             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
820             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
821
822             fscal            = fvdw;
823
824             fscal            = _mm_and_ps(fscal,cutoff_mask);
825
826             fscal            = _mm_andnot_ps(dummy_mask,fscal);
827
828             /* Calculate temporary vectorial force */
829             tx               = _mm_mul_ps(fscal,dx00);
830             ty               = _mm_mul_ps(fscal,dy00);
831             tz               = _mm_mul_ps(fscal,dz00);
832
833             /* Update vectorial force */
834             fix0             = _mm_add_ps(fix0,tx);
835             fiy0             = _mm_add_ps(fiy0,ty);
836             fiz0             = _mm_add_ps(fiz0,tz);
837
838             fjx0             = _mm_add_ps(fjx0,tx);
839             fjy0             = _mm_add_ps(fjy0,ty);
840             fjz0             = _mm_add_ps(fjz0,tz);
841             
842             }
843
844             /**************************
845              * CALCULATE INTERACTIONS *
846              **************************/
847
848             if (gmx_mm_any_lt(rsq11,rcutoff2))
849             {
850
851             /* REACTION-FIELD ELECTROSTATICS */
852             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
853             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
854
855             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
856
857             /* Update potential sum for this i atom from the interaction with this j atom. */
858             velec            = _mm_and_ps(velec,cutoff_mask);
859             velec            = _mm_andnot_ps(dummy_mask,velec);
860             velecsum         = _mm_add_ps(velecsum,velec);
861
862             fscal            = felec;
863
864             fscal            = _mm_and_ps(fscal,cutoff_mask);
865
866             fscal            = _mm_andnot_ps(dummy_mask,fscal);
867
868             /* Calculate temporary vectorial force */
869             tx               = _mm_mul_ps(fscal,dx11);
870             ty               = _mm_mul_ps(fscal,dy11);
871             tz               = _mm_mul_ps(fscal,dz11);
872
873             /* Update vectorial force */
874             fix1             = _mm_add_ps(fix1,tx);
875             fiy1             = _mm_add_ps(fiy1,ty);
876             fiz1             = _mm_add_ps(fiz1,tz);
877
878             fjx1             = _mm_add_ps(fjx1,tx);
879             fjy1             = _mm_add_ps(fjy1,ty);
880             fjz1             = _mm_add_ps(fjz1,tz);
881             
882             }
883
884             /**************************
885              * CALCULATE INTERACTIONS *
886              **************************/
887
888             if (gmx_mm_any_lt(rsq12,rcutoff2))
889             {
890
891             /* REACTION-FIELD ELECTROSTATICS */
892             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
893             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
894
895             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
896
897             /* Update potential sum for this i atom from the interaction with this j atom. */
898             velec            = _mm_and_ps(velec,cutoff_mask);
899             velec            = _mm_andnot_ps(dummy_mask,velec);
900             velecsum         = _mm_add_ps(velecsum,velec);
901
902             fscal            = felec;
903
904             fscal            = _mm_and_ps(fscal,cutoff_mask);
905
906             fscal            = _mm_andnot_ps(dummy_mask,fscal);
907
908             /* Calculate temporary vectorial force */
909             tx               = _mm_mul_ps(fscal,dx12);
910             ty               = _mm_mul_ps(fscal,dy12);
911             tz               = _mm_mul_ps(fscal,dz12);
912
913             /* Update vectorial force */
914             fix1             = _mm_add_ps(fix1,tx);
915             fiy1             = _mm_add_ps(fiy1,ty);
916             fiz1             = _mm_add_ps(fiz1,tz);
917
918             fjx2             = _mm_add_ps(fjx2,tx);
919             fjy2             = _mm_add_ps(fjy2,ty);
920             fjz2             = _mm_add_ps(fjz2,tz);
921             
922             }
923
924             /**************************
925              * CALCULATE INTERACTIONS *
926              **************************/
927
928             if (gmx_mm_any_lt(rsq13,rcutoff2))
929             {
930
931             /* REACTION-FIELD ELECTROSTATICS */
932             velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
933             felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
934
935             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
936
937             /* Update potential sum for this i atom from the interaction with this j atom. */
938             velec            = _mm_and_ps(velec,cutoff_mask);
939             velec            = _mm_andnot_ps(dummy_mask,velec);
940             velecsum         = _mm_add_ps(velecsum,velec);
941
942             fscal            = felec;
943
944             fscal            = _mm_and_ps(fscal,cutoff_mask);
945
946             fscal            = _mm_andnot_ps(dummy_mask,fscal);
947
948             /* Calculate temporary vectorial force */
949             tx               = _mm_mul_ps(fscal,dx13);
950             ty               = _mm_mul_ps(fscal,dy13);
951             tz               = _mm_mul_ps(fscal,dz13);
952
953             /* Update vectorial force */
954             fix1             = _mm_add_ps(fix1,tx);
955             fiy1             = _mm_add_ps(fiy1,ty);
956             fiz1             = _mm_add_ps(fiz1,tz);
957
958             fjx3             = _mm_add_ps(fjx3,tx);
959             fjy3             = _mm_add_ps(fjy3,ty);
960             fjz3             = _mm_add_ps(fjz3,tz);
961             
962             }
963
964             /**************************
965              * CALCULATE INTERACTIONS *
966              **************************/
967
968             if (gmx_mm_any_lt(rsq21,rcutoff2))
969             {
970
971             /* REACTION-FIELD ELECTROSTATICS */
972             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
973             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
974
975             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
976
977             /* Update potential sum for this i atom from the interaction with this j atom. */
978             velec            = _mm_and_ps(velec,cutoff_mask);
979             velec            = _mm_andnot_ps(dummy_mask,velec);
980             velecsum         = _mm_add_ps(velecsum,velec);
981
982             fscal            = felec;
983
984             fscal            = _mm_and_ps(fscal,cutoff_mask);
985
986             fscal            = _mm_andnot_ps(dummy_mask,fscal);
987
988             /* Calculate temporary vectorial force */
989             tx               = _mm_mul_ps(fscal,dx21);
990             ty               = _mm_mul_ps(fscal,dy21);
991             tz               = _mm_mul_ps(fscal,dz21);
992
993             /* Update vectorial force */
994             fix2             = _mm_add_ps(fix2,tx);
995             fiy2             = _mm_add_ps(fiy2,ty);
996             fiz2             = _mm_add_ps(fiz2,tz);
997
998             fjx1             = _mm_add_ps(fjx1,tx);
999             fjy1             = _mm_add_ps(fjy1,ty);
1000             fjz1             = _mm_add_ps(fjz1,tz);
1001             
1002             }
1003
1004             /**************************
1005              * CALCULATE INTERACTIONS *
1006              **************************/
1007
1008             if (gmx_mm_any_lt(rsq22,rcutoff2))
1009             {
1010
1011             /* REACTION-FIELD ELECTROSTATICS */
1012             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
1013             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1014
1015             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1016
1017             /* Update potential sum for this i atom from the interaction with this j atom. */
1018             velec            = _mm_and_ps(velec,cutoff_mask);
1019             velec            = _mm_andnot_ps(dummy_mask,velec);
1020             velecsum         = _mm_add_ps(velecsum,velec);
1021
1022             fscal            = felec;
1023
1024             fscal            = _mm_and_ps(fscal,cutoff_mask);
1025
1026             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1027
1028             /* Calculate temporary vectorial force */
1029             tx               = _mm_mul_ps(fscal,dx22);
1030             ty               = _mm_mul_ps(fscal,dy22);
1031             tz               = _mm_mul_ps(fscal,dz22);
1032
1033             /* Update vectorial force */
1034             fix2             = _mm_add_ps(fix2,tx);
1035             fiy2             = _mm_add_ps(fiy2,ty);
1036             fiz2             = _mm_add_ps(fiz2,tz);
1037
1038             fjx2             = _mm_add_ps(fjx2,tx);
1039             fjy2             = _mm_add_ps(fjy2,ty);
1040             fjz2             = _mm_add_ps(fjz2,tz);
1041             
1042             }
1043
1044             /**************************
1045              * CALCULATE INTERACTIONS *
1046              **************************/
1047
1048             if (gmx_mm_any_lt(rsq23,rcutoff2))
1049             {
1050
1051             /* REACTION-FIELD ELECTROSTATICS */
1052             velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
1053             felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
1054
1055             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
1056
1057             /* Update potential sum for this i atom from the interaction with this j atom. */
1058             velec            = _mm_and_ps(velec,cutoff_mask);
1059             velec            = _mm_andnot_ps(dummy_mask,velec);
1060             velecsum         = _mm_add_ps(velecsum,velec);
1061
1062             fscal            = felec;
1063
1064             fscal            = _mm_and_ps(fscal,cutoff_mask);
1065
1066             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1067
1068             /* Calculate temporary vectorial force */
1069             tx               = _mm_mul_ps(fscal,dx23);
1070             ty               = _mm_mul_ps(fscal,dy23);
1071             tz               = _mm_mul_ps(fscal,dz23);
1072
1073             /* Update vectorial force */
1074             fix2             = _mm_add_ps(fix2,tx);
1075             fiy2             = _mm_add_ps(fiy2,ty);
1076             fiz2             = _mm_add_ps(fiz2,tz);
1077
1078             fjx3             = _mm_add_ps(fjx3,tx);
1079             fjy3             = _mm_add_ps(fjy3,ty);
1080             fjz3             = _mm_add_ps(fjz3,tz);
1081             
1082             }
1083
1084             /**************************
1085              * CALCULATE INTERACTIONS *
1086              **************************/
1087
1088             if (gmx_mm_any_lt(rsq31,rcutoff2))
1089             {
1090
1091             /* REACTION-FIELD ELECTROSTATICS */
1092             velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
1093             felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
1094
1095             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
1096
1097             /* Update potential sum for this i atom from the interaction with this j atom. */
1098             velec            = _mm_and_ps(velec,cutoff_mask);
1099             velec            = _mm_andnot_ps(dummy_mask,velec);
1100             velecsum         = _mm_add_ps(velecsum,velec);
1101
1102             fscal            = felec;
1103
1104             fscal            = _mm_and_ps(fscal,cutoff_mask);
1105
1106             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1107
1108             /* Calculate temporary vectorial force */
1109             tx               = _mm_mul_ps(fscal,dx31);
1110             ty               = _mm_mul_ps(fscal,dy31);
1111             tz               = _mm_mul_ps(fscal,dz31);
1112
1113             /* Update vectorial force */
1114             fix3             = _mm_add_ps(fix3,tx);
1115             fiy3             = _mm_add_ps(fiy3,ty);
1116             fiz3             = _mm_add_ps(fiz3,tz);
1117
1118             fjx1             = _mm_add_ps(fjx1,tx);
1119             fjy1             = _mm_add_ps(fjy1,ty);
1120             fjz1             = _mm_add_ps(fjz1,tz);
1121             
1122             }
1123
1124             /**************************
1125              * CALCULATE INTERACTIONS *
1126              **************************/
1127
1128             if (gmx_mm_any_lt(rsq32,rcutoff2))
1129             {
1130
1131             /* REACTION-FIELD ELECTROSTATICS */
1132             velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
1133             felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
1134
1135             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
1136
1137             /* Update potential sum for this i atom from the interaction with this j atom. */
1138             velec            = _mm_and_ps(velec,cutoff_mask);
1139             velec            = _mm_andnot_ps(dummy_mask,velec);
1140             velecsum         = _mm_add_ps(velecsum,velec);
1141
1142             fscal            = felec;
1143
1144             fscal            = _mm_and_ps(fscal,cutoff_mask);
1145
1146             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1147
1148             /* Calculate temporary vectorial force */
1149             tx               = _mm_mul_ps(fscal,dx32);
1150             ty               = _mm_mul_ps(fscal,dy32);
1151             tz               = _mm_mul_ps(fscal,dz32);
1152
1153             /* Update vectorial force */
1154             fix3             = _mm_add_ps(fix3,tx);
1155             fiy3             = _mm_add_ps(fiy3,ty);
1156             fiz3             = _mm_add_ps(fiz3,tz);
1157
1158             fjx2             = _mm_add_ps(fjx2,tx);
1159             fjy2             = _mm_add_ps(fjy2,ty);
1160             fjz2             = _mm_add_ps(fjz2,tz);
1161             
1162             }
1163
1164             /**************************
1165              * CALCULATE INTERACTIONS *
1166              **************************/
1167
1168             if (gmx_mm_any_lt(rsq33,rcutoff2))
1169             {
1170
1171             /* REACTION-FIELD ELECTROSTATICS */
1172             velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
1173             felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
1174
1175             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
1176
1177             /* Update potential sum for this i atom from the interaction with this j atom. */
1178             velec            = _mm_and_ps(velec,cutoff_mask);
1179             velec            = _mm_andnot_ps(dummy_mask,velec);
1180             velecsum         = _mm_add_ps(velecsum,velec);
1181
1182             fscal            = felec;
1183
1184             fscal            = _mm_and_ps(fscal,cutoff_mask);
1185
1186             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1187
1188             /* Calculate temporary vectorial force */
1189             tx               = _mm_mul_ps(fscal,dx33);
1190             ty               = _mm_mul_ps(fscal,dy33);
1191             tz               = _mm_mul_ps(fscal,dz33);
1192
1193             /* Update vectorial force */
1194             fix3             = _mm_add_ps(fix3,tx);
1195             fiy3             = _mm_add_ps(fiy3,ty);
1196             fiz3             = _mm_add_ps(fiz3,tz);
1197
1198             fjx3             = _mm_add_ps(fjx3,tx);
1199             fjy3             = _mm_add_ps(fjy3,ty);
1200             fjz3             = _mm_add_ps(fjz3,tz);
1201             
1202             }
1203
1204             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1205             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1206             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1207             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1208
1209             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1210                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1211                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1212
1213             /* Inner loop uses 368 flops */
1214         }
1215
1216         /* End of innermost loop */
1217
1218         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1219                                               f+i_coord_offset,fshift+i_shift_offset);
1220
1221         ggid                        = gid[iidx];
1222         /* Update potential energies */
1223         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1224         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1225
1226         /* Increment number of inner iterations */
1227         inneriter                  += j_index_end - j_index_start;
1228
1229         /* Outer loop uses 26 flops */
1230     }
1231
1232     /* Increment number of outer iterations */
1233     outeriter        += nri;
1234
1235     /* Update outer/inner flops */
1236
1237     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*368);
1238 }
1239 /*
1240  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sse2_single
1241  * Electrostatics interaction: ReactionField
1242  * VdW interaction:            LennardJones
1243  * Geometry:                   Water4-Water4
1244  * Calculate force/pot:        Force
1245  */
1246 void
1247 nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sse2_single
1248                     (t_nblist * gmx_restrict                nlist,
1249                      rvec * gmx_restrict                    xx,
1250                      rvec * gmx_restrict                    ff,
1251                      t_forcerec * gmx_restrict              fr,
1252                      t_mdatoms * gmx_restrict               mdatoms,
1253                      nb_kernel_data_t * gmx_restrict        kernel_data,
1254                      t_nrnb * gmx_restrict                  nrnb)
1255 {
1256     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1257      * just 0 for non-waters.
1258      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1259      * jnr indices corresponding to data put in the four positions in the SIMD register.
1260      */
1261     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1262     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1263     int              jnrA,jnrB,jnrC,jnrD;
1264     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1265     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1266     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1267     real             rcutoff_scalar;
1268     real             *shiftvec,*fshift,*x,*f;
1269     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1270     real             scratch[4*DIM];
1271     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1272     int              vdwioffset0;
1273     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1274     int              vdwioffset1;
1275     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1276     int              vdwioffset2;
1277     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1278     int              vdwioffset3;
1279     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1280     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1281     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1282     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1283     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1284     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1285     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1286     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1287     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1288     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1289     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1290     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1291     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1292     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1293     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1294     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1295     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1296     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1297     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1298     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1299     real             *charge;
1300     int              nvdwtype;
1301     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1302     int              *vdwtype;
1303     real             *vdwparam;
1304     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1305     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1306     __m128           dummy_mask,cutoff_mask;
1307     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1308     __m128           one     = _mm_set1_ps(1.0);
1309     __m128           two     = _mm_set1_ps(2.0);
1310     x                = xx[0];
1311     f                = ff[0];
1312
1313     nri              = nlist->nri;
1314     iinr             = nlist->iinr;
1315     jindex           = nlist->jindex;
1316     jjnr             = nlist->jjnr;
1317     shiftidx         = nlist->shift;
1318     gid              = nlist->gid;
1319     shiftvec         = fr->shift_vec[0];
1320     fshift           = fr->fshift[0];
1321     facel            = _mm_set1_ps(fr->epsfac);
1322     charge           = mdatoms->chargeA;
1323     krf              = _mm_set1_ps(fr->ic->k_rf);
1324     krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
1325     crf              = _mm_set1_ps(fr->ic->c_rf);
1326     nvdwtype         = fr->ntype;
1327     vdwparam         = fr->nbfp;
1328     vdwtype          = mdatoms->typeA;
1329
1330     /* Setup water-specific parameters */
1331     inr              = nlist->iinr[0];
1332     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1333     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1334     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1335     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1336
1337     jq1              = _mm_set1_ps(charge[inr+1]);
1338     jq2              = _mm_set1_ps(charge[inr+2]);
1339     jq3              = _mm_set1_ps(charge[inr+3]);
1340     vdwjidx0A        = 2*vdwtype[inr+0];
1341     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1342     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1343     qq11             = _mm_mul_ps(iq1,jq1);
1344     qq12             = _mm_mul_ps(iq1,jq2);
1345     qq13             = _mm_mul_ps(iq1,jq3);
1346     qq21             = _mm_mul_ps(iq2,jq1);
1347     qq22             = _mm_mul_ps(iq2,jq2);
1348     qq23             = _mm_mul_ps(iq2,jq3);
1349     qq31             = _mm_mul_ps(iq3,jq1);
1350     qq32             = _mm_mul_ps(iq3,jq2);
1351     qq33             = _mm_mul_ps(iq3,jq3);
1352
1353     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1354     rcutoff_scalar   = fr->rcoulomb;
1355     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1356     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1357
1358     sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
1359     rvdw             = _mm_set1_ps(fr->rvdw);
1360
1361     /* Avoid stupid compiler warnings */
1362     jnrA = jnrB = jnrC = jnrD = 0;
1363     j_coord_offsetA = 0;
1364     j_coord_offsetB = 0;
1365     j_coord_offsetC = 0;
1366     j_coord_offsetD = 0;
1367
1368     outeriter        = 0;
1369     inneriter        = 0;
1370
1371     for(iidx=0;iidx<4*DIM;iidx++)
1372     {
1373         scratch[iidx] = 0.0;
1374     }  
1375
1376     /* Start outer loop over neighborlists */
1377     for(iidx=0; iidx<nri; iidx++)
1378     {
1379         /* Load shift vector for this list */
1380         i_shift_offset   = DIM*shiftidx[iidx];
1381
1382         /* Load limits for loop over neighbors */
1383         j_index_start    = jindex[iidx];
1384         j_index_end      = jindex[iidx+1];
1385
1386         /* Get outer coordinate index */
1387         inr              = iinr[iidx];
1388         i_coord_offset   = DIM*inr;
1389
1390         /* Load i particle coords and add shift vector */
1391         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1392                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1393         
1394         fix0             = _mm_setzero_ps();
1395         fiy0             = _mm_setzero_ps();
1396         fiz0             = _mm_setzero_ps();
1397         fix1             = _mm_setzero_ps();
1398         fiy1             = _mm_setzero_ps();
1399         fiz1             = _mm_setzero_ps();
1400         fix2             = _mm_setzero_ps();
1401         fiy2             = _mm_setzero_ps();
1402         fiz2             = _mm_setzero_ps();
1403         fix3             = _mm_setzero_ps();
1404         fiy3             = _mm_setzero_ps();
1405         fiz3             = _mm_setzero_ps();
1406
1407         /* Start inner kernel loop */
1408         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1409         {
1410
1411             /* Get j neighbor index, and coordinate index */
1412             jnrA             = jjnr[jidx];
1413             jnrB             = jjnr[jidx+1];
1414             jnrC             = jjnr[jidx+2];
1415             jnrD             = jjnr[jidx+3];
1416             j_coord_offsetA  = DIM*jnrA;
1417             j_coord_offsetB  = DIM*jnrB;
1418             j_coord_offsetC  = DIM*jnrC;
1419             j_coord_offsetD  = DIM*jnrD;
1420
1421             /* load j atom coordinates */
1422             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1423                                               x+j_coord_offsetC,x+j_coord_offsetD,
1424                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1425                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1426
1427             /* Calculate displacement vector */
1428             dx00             = _mm_sub_ps(ix0,jx0);
1429             dy00             = _mm_sub_ps(iy0,jy0);
1430             dz00             = _mm_sub_ps(iz0,jz0);
1431             dx11             = _mm_sub_ps(ix1,jx1);
1432             dy11             = _mm_sub_ps(iy1,jy1);
1433             dz11             = _mm_sub_ps(iz1,jz1);
1434             dx12             = _mm_sub_ps(ix1,jx2);
1435             dy12             = _mm_sub_ps(iy1,jy2);
1436             dz12             = _mm_sub_ps(iz1,jz2);
1437             dx13             = _mm_sub_ps(ix1,jx3);
1438             dy13             = _mm_sub_ps(iy1,jy3);
1439             dz13             = _mm_sub_ps(iz1,jz3);
1440             dx21             = _mm_sub_ps(ix2,jx1);
1441             dy21             = _mm_sub_ps(iy2,jy1);
1442             dz21             = _mm_sub_ps(iz2,jz1);
1443             dx22             = _mm_sub_ps(ix2,jx2);
1444             dy22             = _mm_sub_ps(iy2,jy2);
1445             dz22             = _mm_sub_ps(iz2,jz2);
1446             dx23             = _mm_sub_ps(ix2,jx3);
1447             dy23             = _mm_sub_ps(iy2,jy3);
1448             dz23             = _mm_sub_ps(iz2,jz3);
1449             dx31             = _mm_sub_ps(ix3,jx1);
1450             dy31             = _mm_sub_ps(iy3,jy1);
1451             dz31             = _mm_sub_ps(iz3,jz1);
1452             dx32             = _mm_sub_ps(ix3,jx2);
1453             dy32             = _mm_sub_ps(iy3,jy2);
1454             dz32             = _mm_sub_ps(iz3,jz2);
1455             dx33             = _mm_sub_ps(ix3,jx3);
1456             dy33             = _mm_sub_ps(iy3,jy3);
1457             dz33             = _mm_sub_ps(iz3,jz3);
1458
1459             /* Calculate squared distance and things based on it */
1460             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1461             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1462             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1463             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1464             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1465             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1466             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1467             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1468             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1469             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1470
1471             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1472             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1473             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1474             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1475             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1476             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1477             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1478             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1479             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1480
1481             rinvsq00         = gmx_mm_inv_ps(rsq00);
1482             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1483             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1484             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1485             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1486             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1487             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1488             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1489             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1490             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1491
1492             fjx0             = _mm_setzero_ps();
1493             fjy0             = _mm_setzero_ps();
1494             fjz0             = _mm_setzero_ps();
1495             fjx1             = _mm_setzero_ps();
1496             fjy1             = _mm_setzero_ps();
1497             fjz1             = _mm_setzero_ps();
1498             fjx2             = _mm_setzero_ps();
1499             fjy2             = _mm_setzero_ps();
1500             fjz2             = _mm_setzero_ps();
1501             fjx3             = _mm_setzero_ps();
1502             fjy3             = _mm_setzero_ps();
1503             fjz3             = _mm_setzero_ps();
1504
1505             /**************************
1506              * CALCULATE INTERACTIONS *
1507              **************************/
1508
1509             if (gmx_mm_any_lt(rsq00,rcutoff2))
1510             {
1511
1512             /* LENNARD-JONES DISPERSION/REPULSION */
1513
1514             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1515             fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1516
1517             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1518
1519             fscal            = fvdw;
1520
1521             fscal            = _mm_and_ps(fscal,cutoff_mask);
1522
1523             /* Calculate temporary vectorial force */
1524             tx               = _mm_mul_ps(fscal,dx00);
1525             ty               = _mm_mul_ps(fscal,dy00);
1526             tz               = _mm_mul_ps(fscal,dz00);
1527
1528             /* Update vectorial force */
1529             fix0             = _mm_add_ps(fix0,tx);
1530             fiy0             = _mm_add_ps(fiy0,ty);
1531             fiz0             = _mm_add_ps(fiz0,tz);
1532
1533             fjx0             = _mm_add_ps(fjx0,tx);
1534             fjy0             = _mm_add_ps(fjy0,ty);
1535             fjz0             = _mm_add_ps(fjz0,tz);
1536             
1537             }
1538
1539             /**************************
1540              * CALCULATE INTERACTIONS *
1541              **************************/
1542
1543             if (gmx_mm_any_lt(rsq11,rcutoff2))
1544             {
1545
1546             /* REACTION-FIELD ELECTROSTATICS */
1547             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1548
1549             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1550
1551             fscal            = felec;
1552
1553             fscal            = _mm_and_ps(fscal,cutoff_mask);
1554
1555             /* Calculate temporary vectorial force */
1556             tx               = _mm_mul_ps(fscal,dx11);
1557             ty               = _mm_mul_ps(fscal,dy11);
1558             tz               = _mm_mul_ps(fscal,dz11);
1559
1560             /* Update vectorial force */
1561             fix1             = _mm_add_ps(fix1,tx);
1562             fiy1             = _mm_add_ps(fiy1,ty);
1563             fiz1             = _mm_add_ps(fiz1,tz);
1564
1565             fjx1             = _mm_add_ps(fjx1,tx);
1566             fjy1             = _mm_add_ps(fjy1,ty);
1567             fjz1             = _mm_add_ps(fjz1,tz);
1568             
1569             }
1570
1571             /**************************
1572              * CALCULATE INTERACTIONS *
1573              **************************/
1574
1575             if (gmx_mm_any_lt(rsq12,rcutoff2))
1576             {
1577
1578             /* REACTION-FIELD ELECTROSTATICS */
1579             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1580
1581             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1582
1583             fscal            = felec;
1584
1585             fscal            = _mm_and_ps(fscal,cutoff_mask);
1586
1587             /* Calculate temporary vectorial force */
1588             tx               = _mm_mul_ps(fscal,dx12);
1589             ty               = _mm_mul_ps(fscal,dy12);
1590             tz               = _mm_mul_ps(fscal,dz12);
1591
1592             /* Update vectorial force */
1593             fix1             = _mm_add_ps(fix1,tx);
1594             fiy1             = _mm_add_ps(fiy1,ty);
1595             fiz1             = _mm_add_ps(fiz1,tz);
1596
1597             fjx2             = _mm_add_ps(fjx2,tx);
1598             fjy2             = _mm_add_ps(fjy2,ty);
1599             fjz2             = _mm_add_ps(fjz2,tz);
1600             
1601             }
1602
1603             /**************************
1604              * CALCULATE INTERACTIONS *
1605              **************************/
1606
1607             if (gmx_mm_any_lt(rsq13,rcutoff2))
1608             {
1609
1610             /* REACTION-FIELD ELECTROSTATICS */
1611             felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
1612
1613             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
1614
1615             fscal            = felec;
1616
1617             fscal            = _mm_and_ps(fscal,cutoff_mask);
1618
1619             /* Calculate temporary vectorial force */
1620             tx               = _mm_mul_ps(fscal,dx13);
1621             ty               = _mm_mul_ps(fscal,dy13);
1622             tz               = _mm_mul_ps(fscal,dz13);
1623
1624             /* Update vectorial force */
1625             fix1             = _mm_add_ps(fix1,tx);
1626             fiy1             = _mm_add_ps(fiy1,ty);
1627             fiz1             = _mm_add_ps(fiz1,tz);
1628
1629             fjx3             = _mm_add_ps(fjx3,tx);
1630             fjy3             = _mm_add_ps(fjy3,ty);
1631             fjz3             = _mm_add_ps(fjz3,tz);
1632             
1633             }
1634
1635             /**************************
1636              * CALCULATE INTERACTIONS *
1637              **************************/
1638
1639             if (gmx_mm_any_lt(rsq21,rcutoff2))
1640             {
1641
1642             /* REACTION-FIELD ELECTROSTATICS */
1643             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1644
1645             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1646
1647             fscal            = felec;
1648
1649             fscal            = _mm_and_ps(fscal,cutoff_mask);
1650
1651             /* Calculate temporary vectorial force */
1652             tx               = _mm_mul_ps(fscal,dx21);
1653             ty               = _mm_mul_ps(fscal,dy21);
1654             tz               = _mm_mul_ps(fscal,dz21);
1655
1656             /* Update vectorial force */
1657             fix2             = _mm_add_ps(fix2,tx);
1658             fiy2             = _mm_add_ps(fiy2,ty);
1659             fiz2             = _mm_add_ps(fiz2,tz);
1660
1661             fjx1             = _mm_add_ps(fjx1,tx);
1662             fjy1             = _mm_add_ps(fjy1,ty);
1663             fjz1             = _mm_add_ps(fjz1,tz);
1664             
1665             }
1666
1667             /**************************
1668              * CALCULATE INTERACTIONS *
1669              **************************/
1670
1671             if (gmx_mm_any_lt(rsq22,rcutoff2))
1672             {
1673
1674             /* REACTION-FIELD ELECTROSTATICS */
1675             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1676
1677             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1678
1679             fscal            = felec;
1680
1681             fscal            = _mm_and_ps(fscal,cutoff_mask);
1682
1683             /* Calculate temporary vectorial force */
1684             tx               = _mm_mul_ps(fscal,dx22);
1685             ty               = _mm_mul_ps(fscal,dy22);
1686             tz               = _mm_mul_ps(fscal,dz22);
1687
1688             /* Update vectorial force */
1689             fix2             = _mm_add_ps(fix2,tx);
1690             fiy2             = _mm_add_ps(fiy2,ty);
1691             fiz2             = _mm_add_ps(fiz2,tz);
1692
1693             fjx2             = _mm_add_ps(fjx2,tx);
1694             fjy2             = _mm_add_ps(fjy2,ty);
1695             fjz2             = _mm_add_ps(fjz2,tz);
1696             
1697             }
1698
1699             /**************************
1700              * CALCULATE INTERACTIONS *
1701              **************************/
1702
1703             if (gmx_mm_any_lt(rsq23,rcutoff2))
1704             {
1705
1706             /* REACTION-FIELD ELECTROSTATICS */
1707             felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
1708
1709             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
1710
1711             fscal            = felec;
1712
1713             fscal            = _mm_and_ps(fscal,cutoff_mask);
1714
1715             /* Calculate temporary vectorial force */
1716             tx               = _mm_mul_ps(fscal,dx23);
1717             ty               = _mm_mul_ps(fscal,dy23);
1718             tz               = _mm_mul_ps(fscal,dz23);
1719
1720             /* Update vectorial force */
1721             fix2             = _mm_add_ps(fix2,tx);
1722             fiy2             = _mm_add_ps(fiy2,ty);
1723             fiz2             = _mm_add_ps(fiz2,tz);
1724
1725             fjx3             = _mm_add_ps(fjx3,tx);
1726             fjy3             = _mm_add_ps(fjy3,ty);
1727             fjz3             = _mm_add_ps(fjz3,tz);
1728             
1729             }
1730
1731             /**************************
1732              * CALCULATE INTERACTIONS *
1733              **************************/
1734
1735             if (gmx_mm_any_lt(rsq31,rcutoff2))
1736             {
1737
1738             /* REACTION-FIELD ELECTROSTATICS */
1739             felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
1740
1741             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
1742
1743             fscal            = felec;
1744
1745             fscal            = _mm_and_ps(fscal,cutoff_mask);
1746
1747             /* Calculate temporary vectorial force */
1748             tx               = _mm_mul_ps(fscal,dx31);
1749             ty               = _mm_mul_ps(fscal,dy31);
1750             tz               = _mm_mul_ps(fscal,dz31);
1751
1752             /* Update vectorial force */
1753             fix3             = _mm_add_ps(fix3,tx);
1754             fiy3             = _mm_add_ps(fiy3,ty);
1755             fiz3             = _mm_add_ps(fiz3,tz);
1756
1757             fjx1             = _mm_add_ps(fjx1,tx);
1758             fjy1             = _mm_add_ps(fjy1,ty);
1759             fjz1             = _mm_add_ps(fjz1,tz);
1760             
1761             }
1762
1763             /**************************
1764              * CALCULATE INTERACTIONS *
1765              **************************/
1766
1767             if (gmx_mm_any_lt(rsq32,rcutoff2))
1768             {
1769
1770             /* REACTION-FIELD ELECTROSTATICS */
1771             felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
1772
1773             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
1774
1775             fscal            = felec;
1776
1777             fscal            = _mm_and_ps(fscal,cutoff_mask);
1778
1779             /* Calculate temporary vectorial force */
1780             tx               = _mm_mul_ps(fscal,dx32);
1781             ty               = _mm_mul_ps(fscal,dy32);
1782             tz               = _mm_mul_ps(fscal,dz32);
1783
1784             /* Update vectorial force */
1785             fix3             = _mm_add_ps(fix3,tx);
1786             fiy3             = _mm_add_ps(fiy3,ty);
1787             fiz3             = _mm_add_ps(fiz3,tz);
1788
1789             fjx2             = _mm_add_ps(fjx2,tx);
1790             fjy2             = _mm_add_ps(fjy2,ty);
1791             fjz2             = _mm_add_ps(fjz2,tz);
1792             
1793             }
1794
1795             /**************************
1796              * CALCULATE INTERACTIONS *
1797              **************************/
1798
1799             if (gmx_mm_any_lt(rsq33,rcutoff2))
1800             {
1801
1802             /* REACTION-FIELD ELECTROSTATICS */
1803             felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
1804
1805             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
1806
1807             fscal            = felec;
1808
1809             fscal            = _mm_and_ps(fscal,cutoff_mask);
1810
1811             /* Calculate temporary vectorial force */
1812             tx               = _mm_mul_ps(fscal,dx33);
1813             ty               = _mm_mul_ps(fscal,dy33);
1814             tz               = _mm_mul_ps(fscal,dz33);
1815
1816             /* Update vectorial force */
1817             fix3             = _mm_add_ps(fix3,tx);
1818             fiy3             = _mm_add_ps(fiy3,ty);
1819             fiz3             = _mm_add_ps(fiz3,tz);
1820
1821             fjx3             = _mm_add_ps(fjx3,tx);
1822             fjy3             = _mm_add_ps(fjy3,ty);
1823             fjz3             = _mm_add_ps(fjz3,tz);
1824             
1825             }
1826
1827             fjptrA             = f+j_coord_offsetA;
1828             fjptrB             = f+j_coord_offsetB;
1829             fjptrC             = f+j_coord_offsetC;
1830             fjptrD             = f+j_coord_offsetD;
1831
1832             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1833                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1834                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1835
1836             /* Inner loop uses 303 flops */
1837         }
1838
1839         if(jidx<j_index_end)
1840         {
1841
1842             /* Get j neighbor index, and coordinate index */
1843             jnrlistA         = jjnr[jidx];
1844             jnrlistB         = jjnr[jidx+1];
1845             jnrlistC         = jjnr[jidx+2];
1846             jnrlistD         = jjnr[jidx+3];
1847             /* Sign of each element will be negative for non-real atoms.
1848              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1849              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1850              */
1851             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1852             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1853             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1854             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1855             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1856             j_coord_offsetA  = DIM*jnrA;
1857             j_coord_offsetB  = DIM*jnrB;
1858             j_coord_offsetC  = DIM*jnrC;
1859             j_coord_offsetD  = DIM*jnrD;
1860
1861             /* load j atom coordinates */
1862             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1863                                               x+j_coord_offsetC,x+j_coord_offsetD,
1864                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1865                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1866
1867             /* Calculate displacement vector */
1868             dx00             = _mm_sub_ps(ix0,jx0);
1869             dy00             = _mm_sub_ps(iy0,jy0);
1870             dz00             = _mm_sub_ps(iz0,jz0);
1871             dx11             = _mm_sub_ps(ix1,jx1);
1872             dy11             = _mm_sub_ps(iy1,jy1);
1873             dz11             = _mm_sub_ps(iz1,jz1);
1874             dx12             = _mm_sub_ps(ix1,jx2);
1875             dy12             = _mm_sub_ps(iy1,jy2);
1876             dz12             = _mm_sub_ps(iz1,jz2);
1877             dx13             = _mm_sub_ps(ix1,jx3);
1878             dy13             = _mm_sub_ps(iy1,jy3);
1879             dz13             = _mm_sub_ps(iz1,jz3);
1880             dx21             = _mm_sub_ps(ix2,jx1);
1881             dy21             = _mm_sub_ps(iy2,jy1);
1882             dz21             = _mm_sub_ps(iz2,jz1);
1883             dx22             = _mm_sub_ps(ix2,jx2);
1884             dy22             = _mm_sub_ps(iy2,jy2);
1885             dz22             = _mm_sub_ps(iz2,jz2);
1886             dx23             = _mm_sub_ps(ix2,jx3);
1887             dy23             = _mm_sub_ps(iy2,jy3);
1888             dz23             = _mm_sub_ps(iz2,jz3);
1889             dx31             = _mm_sub_ps(ix3,jx1);
1890             dy31             = _mm_sub_ps(iy3,jy1);
1891             dz31             = _mm_sub_ps(iz3,jz1);
1892             dx32             = _mm_sub_ps(ix3,jx2);
1893             dy32             = _mm_sub_ps(iy3,jy2);
1894             dz32             = _mm_sub_ps(iz3,jz2);
1895             dx33             = _mm_sub_ps(ix3,jx3);
1896             dy33             = _mm_sub_ps(iy3,jy3);
1897             dz33             = _mm_sub_ps(iz3,jz3);
1898
1899             /* Calculate squared distance and things based on it */
1900             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1901             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1902             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1903             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1904             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1905             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1906             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1907             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1908             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1909             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1910
1911             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1912             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1913             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1914             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1915             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1916             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1917             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1918             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1919             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1920
1921             rinvsq00         = gmx_mm_inv_ps(rsq00);
1922             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1923             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1924             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1925             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1926             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1927             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1928             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1929             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1930             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1931
1932             fjx0             = _mm_setzero_ps();
1933             fjy0             = _mm_setzero_ps();
1934             fjz0             = _mm_setzero_ps();
1935             fjx1             = _mm_setzero_ps();
1936             fjy1             = _mm_setzero_ps();
1937             fjz1             = _mm_setzero_ps();
1938             fjx2             = _mm_setzero_ps();
1939             fjy2             = _mm_setzero_ps();
1940             fjz2             = _mm_setzero_ps();
1941             fjx3             = _mm_setzero_ps();
1942             fjy3             = _mm_setzero_ps();
1943             fjz3             = _mm_setzero_ps();
1944
1945             /**************************
1946              * CALCULATE INTERACTIONS *
1947              **************************/
1948
1949             if (gmx_mm_any_lt(rsq00,rcutoff2))
1950             {
1951
1952             /* LENNARD-JONES DISPERSION/REPULSION */
1953
1954             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1955             fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1956
1957             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1958
1959             fscal            = fvdw;
1960
1961             fscal            = _mm_and_ps(fscal,cutoff_mask);
1962
1963             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1964
1965             /* Calculate temporary vectorial force */
1966             tx               = _mm_mul_ps(fscal,dx00);
1967             ty               = _mm_mul_ps(fscal,dy00);
1968             tz               = _mm_mul_ps(fscal,dz00);
1969
1970             /* Update vectorial force */
1971             fix0             = _mm_add_ps(fix0,tx);
1972             fiy0             = _mm_add_ps(fiy0,ty);
1973             fiz0             = _mm_add_ps(fiz0,tz);
1974
1975             fjx0             = _mm_add_ps(fjx0,tx);
1976             fjy0             = _mm_add_ps(fjy0,ty);
1977             fjz0             = _mm_add_ps(fjz0,tz);
1978             
1979             }
1980
1981             /**************************
1982              * CALCULATE INTERACTIONS *
1983              **************************/
1984
1985             if (gmx_mm_any_lt(rsq11,rcutoff2))
1986             {
1987
1988             /* REACTION-FIELD ELECTROSTATICS */
1989             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1990
1991             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1992
1993             fscal            = felec;
1994
1995             fscal            = _mm_and_ps(fscal,cutoff_mask);
1996
1997             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1998
1999             /* Calculate temporary vectorial force */
2000             tx               = _mm_mul_ps(fscal,dx11);
2001             ty               = _mm_mul_ps(fscal,dy11);
2002             tz               = _mm_mul_ps(fscal,dz11);
2003
2004             /* Update vectorial force */
2005             fix1             = _mm_add_ps(fix1,tx);
2006             fiy1             = _mm_add_ps(fiy1,ty);
2007             fiz1             = _mm_add_ps(fiz1,tz);
2008
2009             fjx1             = _mm_add_ps(fjx1,tx);
2010             fjy1             = _mm_add_ps(fjy1,ty);
2011             fjz1             = _mm_add_ps(fjz1,tz);
2012             
2013             }
2014
2015             /**************************
2016              * CALCULATE INTERACTIONS *
2017              **************************/
2018
2019             if (gmx_mm_any_lt(rsq12,rcutoff2))
2020             {
2021
2022             /* REACTION-FIELD ELECTROSTATICS */
2023             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
2024
2025             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2026
2027             fscal            = felec;
2028
2029             fscal            = _mm_and_ps(fscal,cutoff_mask);
2030
2031             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2032
2033             /* Calculate temporary vectorial force */
2034             tx               = _mm_mul_ps(fscal,dx12);
2035             ty               = _mm_mul_ps(fscal,dy12);
2036             tz               = _mm_mul_ps(fscal,dz12);
2037
2038             /* Update vectorial force */
2039             fix1             = _mm_add_ps(fix1,tx);
2040             fiy1             = _mm_add_ps(fiy1,ty);
2041             fiz1             = _mm_add_ps(fiz1,tz);
2042
2043             fjx2             = _mm_add_ps(fjx2,tx);
2044             fjy2             = _mm_add_ps(fjy2,ty);
2045             fjz2             = _mm_add_ps(fjz2,tz);
2046             
2047             }
2048
2049             /**************************
2050              * CALCULATE INTERACTIONS *
2051              **************************/
2052
2053             if (gmx_mm_any_lt(rsq13,rcutoff2))
2054             {
2055
2056             /* REACTION-FIELD ELECTROSTATICS */
2057             felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
2058
2059             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
2060
2061             fscal            = felec;
2062
2063             fscal            = _mm_and_ps(fscal,cutoff_mask);
2064
2065             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2066
2067             /* Calculate temporary vectorial force */
2068             tx               = _mm_mul_ps(fscal,dx13);
2069             ty               = _mm_mul_ps(fscal,dy13);
2070             tz               = _mm_mul_ps(fscal,dz13);
2071
2072             /* Update vectorial force */
2073             fix1             = _mm_add_ps(fix1,tx);
2074             fiy1             = _mm_add_ps(fiy1,ty);
2075             fiz1             = _mm_add_ps(fiz1,tz);
2076
2077             fjx3             = _mm_add_ps(fjx3,tx);
2078             fjy3             = _mm_add_ps(fjy3,ty);
2079             fjz3             = _mm_add_ps(fjz3,tz);
2080             
2081             }
2082
2083             /**************************
2084              * CALCULATE INTERACTIONS *
2085              **************************/
2086
2087             if (gmx_mm_any_lt(rsq21,rcutoff2))
2088             {
2089
2090             /* REACTION-FIELD ELECTROSTATICS */
2091             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
2092
2093             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2094
2095             fscal            = felec;
2096
2097             fscal            = _mm_and_ps(fscal,cutoff_mask);
2098
2099             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2100
2101             /* Calculate temporary vectorial force */
2102             tx               = _mm_mul_ps(fscal,dx21);
2103             ty               = _mm_mul_ps(fscal,dy21);
2104             tz               = _mm_mul_ps(fscal,dz21);
2105
2106             /* Update vectorial force */
2107             fix2             = _mm_add_ps(fix2,tx);
2108             fiy2             = _mm_add_ps(fiy2,ty);
2109             fiz2             = _mm_add_ps(fiz2,tz);
2110
2111             fjx1             = _mm_add_ps(fjx1,tx);
2112             fjy1             = _mm_add_ps(fjy1,ty);
2113             fjz1             = _mm_add_ps(fjz1,tz);
2114             
2115             }
2116
2117             /**************************
2118              * CALCULATE INTERACTIONS *
2119              **************************/
2120
2121             if (gmx_mm_any_lt(rsq22,rcutoff2))
2122             {
2123
2124             /* REACTION-FIELD ELECTROSTATICS */
2125             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
2126
2127             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2128
2129             fscal            = felec;
2130
2131             fscal            = _mm_and_ps(fscal,cutoff_mask);
2132
2133             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2134
2135             /* Calculate temporary vectorial force */
2136             tx               = _mm_mul_ps(fscal,dx22);
2137             ty               = _mm_mul_ps(fscal,dy22);
2138             tz               = _mm_mul_ps(fscal,dz22);
2139
2140             /* Update vectorial force */
2141             fix2             = _mm_add_ps(fix2,tx);
2142             fiy2             = _mm_add_ps(fiy2,ty);
2143             fiz2             = _mm_add_ps(fiz2,tz);
2144
2145             fjx2             = _mm_add_ps(fjx2,tx);
2146             fjy2             = _mm_add_ps(fjy2,ty);
2147             fjz2             = _mm_add_ps(fjz2,tz);
2148             
2149             }
2150
2151             /**************************
2152              * CALCULATE INTERACTIONS *
2153              **************************/
2154
2155             if (gmx_mm_any_lt(rsq23,rcutoff2))
2156             {
2157
2158             /* REACTION-FIELD ELECTROSTATICS */
2159             felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
2160
2161             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
2162
2163             fscal            = felec;
2164
2165             fscal            = _mm_and_ps(fscal,cutoff_mask);
2166
2167             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2168
2169             /* Calculate temporary vectorial force */
2170             tx               = _mm_mul_ps(fscal,dx23);
2171             ty               = _mm_mul_ps(fscal,dy23);
2172             tz               = _mm_mul_ps(fscal,dz23);
2173
2174             /* Update vectorial force */
2175             fix2             = _mm_add_ps(fix2,tx);
2176             fiy2             = _mm_add_ps(fiy2,ty);
2177             fiz2             = _mm_add_ps(fiz2,tz);
2178
2179             fjx3             = _mm_add_ps(fjx3,tx);
2180             fjy3             = _mm_add_ps(fjy3,ty);
2181             fjz3             = _mm_add_ps(fjz3,tz);
2182             
2183             }
2184
2185             /**************************
2186              * CALCULATE INTERACTIONS *
2187              **************************/
2188
2189             if (gmx_mm_any_lt(rsq31,rcutoff2))
2190             {
2191
2192             /* REACTION-FIELD ELECTROSTATICS */
2193             felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
2194
2195             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
2196
2197             fscal            = felec;
2198
2199             fscal            = _mm_and_ps(fscal,cutoff_mask);
2200
2201             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2202
2203             /* Calculate temporary vectorial force */
2204             tx               = _mm_mul_ps(fscal,dx31);
2205             ty               = _mm_mul_ps(fscal,dy31);
2206             tz               = _mm_mul_ps(fscal,dz31);
2207
2208             /* Update vectorial force */
2209             fix3             = _mm_add_ps(fix3,tx);
2210             fiy3             = _mm_add_ps(fiy3,ty);
2211             fiz3             = _mm_add_ps(fiz3,tz);
2212
2213             fjx1             = _mm_add_ps(fjx1,tx);
2214             fjy1             = _mm_add_ps(fjy1,ty);
2215             fjz1             = _mm_add_ps(fjz1,tz);
2216             
2217             }
2218
2219             /**************************
2220              * CALCULATE INTERACTIONS *
2221              **************************/
2222
2223             if (gmx_mm_any_lt(rsq32,rcutoff2))
2224             {
2225
2226             /* REACTION-FIELD ELECTROSTATICS */
2227             felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
2228
2229             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
2230
2231             fscal            = felec;
2232
2233             fscal            = _mm_and_ps(fscal,cutoff_mask);
2234
2235             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2236
2237             /* Calculate temporary vectorial force */
2238             tx               = _mm_mul_ps(fscal,dx32);
2239             ty               = _mm_mul_ps(fscal,dy32);
2240             tz               = _mm_mul_ps(fscal,dz32);
2241
2242             /* Update vectorial force */
2243             fix3             = _mm_add_ps(fix3,tx);
2244             fiy3             = _mm_add_ps(fiy3,ty);
2245             fiz3             = _mm_add_ps(fiz3,tz);
2246
2247             fjx2             = _mm_add_ps(fjx2,tx);
2248             fjy2             = _mm_add_ps(fjy2,ty);
2249             fjz2             = _mm_add_ps(fjz2,tz);
2250             
2251             }
2252
2253             /**************************
2254              * CALCULATE INTERACTIONS *
2255              **************************/
2256
2257             if (gmx_mm_any_lt(rsq33,rcutoff2))
2258             {
2259
2260             /* REACTION-FIELD ELECTROSTATICS */
2261             felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
2262
2263             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
2264
2265             fscal            = felec;
2266
2267             fscal            = _mm_and_ps(fscal,cutoff_mask);
2268
2269             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2270
2271             /* Calculate temporary vectorial force */
2272             tx               = _mm_mul_ps(fscal,dx33);
2273             ty               = _mm_mul_ps(fscal,dy33);
2274             tz               = _mm_mul_ps(fscal,dz33);
2275
2276             /* Update vectorial force */
2277             fix3             = _mm_add_ps(fix3,tx);
2278             fiy3             = _mm_add_ps(fiy3,ty);
2279             fiz3             = _mm_add_ps(fiz3,tz);
2280
2281             fjx3             = _mm_add_ps(fjx3,tx);
2282             fjy3             = _mm_add_ps(fjy3,ty);
2283             fjz3             = _mm_add_ps(fjz3,tz);
2284             
2285             }
2286
2287             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2288             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2289             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2290             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2291
2292             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2293                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2294                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2295
2296             /* Inner loop uses 303 flops */
2297         }
2298
2299         /* End of innermost loop */
2300
2301         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2302                                               f+i_coord_offset,fshift+i_shift_offset);
2303
2304         /* Increment number of inner iterations */
2305         inneriter                  += j_index_end - j_index_start;
2306
2307         /* Outer loop uses 24 flops */
2308     }
2309
2310     /* Increment number of outer iterations */
2311     outeriter        += nri;
2312
2313     /* Update outer/inner flops */
2314
2315     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*303);
2316 }