made errors during GPU detection non-fatal
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_avx_128_fma_single / nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_avx_128_fma_single.c
1 /*
2  * Note: this file was generated by the Gromacs avx_128_fma_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_avx_128_fma_single.h"
34 #include "kernelutil_x86_avx_128_fma_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_avx_128_fma_single
38  * Electrostatics interaction: Ewald
39  * VdW interaction:            LennardJones
40  * Geometry:                   Water4-Water4
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_avx_128_fma_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
64     real             rcutoff_scalar;
65     real             *shiftvec,*fshift,*x,*f;
66     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
67     real             scratch[4*DIM];
68     __m128           fscal,rcutoff,rcutoff2,jidxall;
69     int              vdwioffset0;
70     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
71     int              vdwioffset1;
72     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
73     int              vdwioffset2;
74     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
75     int              vdwioffset3;
76     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
77     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
78     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
79     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
80     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
81     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
82     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
83     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
84     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
85     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
86     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
87     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
88     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
89     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
90     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
91     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
92     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
93     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
94     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
95     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
96     real             *charge;
97     int              nvdwtype;
98     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
99     int              *vdwtype;
100     real             *vdwparam;
101     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
102     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
103     __m128i          ewitab;
104     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
105     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
106     real             *ewtab;
107     __m128           dummy_mask,cutoff_mask;
108     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
109     __m128           one     = _mm_set1_ps(1.0);
110     __m128           two     = _mm_set1_ps(2.0);
111     x                = xx[0];
112     f                = ff[0];
113
114     nri              = nlist->nri;
115     iinr             = nlist->iinr;
116     jindex           = nlist->jindex;
117     jjnr             = nlist->jjnr;
118     shiftidx         = nlist->shift;
119     gid              = nlist->gid;
120     shiftvec         = fr->shift_vec[0];
121     fshift           = fr->fshift[0];
122     facel            = _mm_set1_ps(fr->epsfac);
123     charge           = mdatoms->chargeA;
124     nvdwtype         = fr->ntype;
125     vdwparam         = fr->nbfp;
126     vdwtype          = mdatoms->typeA;
127
128     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
129     beta             = _mm_set1_ps(fr->ic->ewaldcoeff);
130     beta2            = _mm_mul_ps(beta,beta);
131     beta3            = _mm_mul_ps(beta,beta2);
132     ewtab            = fr->ic->tabq_coul_FDV0;
133     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
134     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
135
136     /* Setup water-specific parameters */
137     inr              = nlist->iinr[0];
138     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
139     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
140     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
141     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
142
143     jq1              = _mm_set1_ps(charge[inr+1]);
144     jq2              = _mm_set1_ps(charge[inr+2]);
145     jq3              = _mm_set1_ps(charge[inr+3]);
146     vdwjidx0A        = 2*vdwtype[inr+0];
147     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
148     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
149     qq11             = _mm_mul_ps(iq1,jq1);
150     qq12             = _mm_mul_ps(iq1,jq2);
151     qq13             = _mm_mul_ps(iq1,jq3);
152     qq21             = _mm_mul_ps(iq2,jq1);
153     qq22             = _mm_mul_ps(iq2,jq2);
154     qq23             = _mm_mul_ps(iq2,jq3);
155     qq31             = _mm_mul_ps(iq3,jq1);
156     qq32             = _mm_mul_ps(iq3,jq2);
157     qq33             = _mm_mul_ps(iq3,jq3);
158
159     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
160     rcutoff_scalar   = fr->rcoulomb;
161     rcutoff          = _mm_set1_ps(rcutoff_scalar);
162     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
163
164     sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
165     rvdw             = _mm_set1_ps(fr->rvdw);
166
167     /* Avoid stupid compiler warnings */
168     jnrA = jnrB = jnrC = jnrD = 0;
169     j_coord_offsetA = 0;
170     j_coord_offsetB = 0;
171     j_coord_offsetC = 0;
172     j_coord_offsetD = 0;
173
174     outeriter        = 0;
175     inneriter        = 0;
176
177     for(iidx=0;iidx<4*DIM;iidx++)
178     {
179         scratch[iidx] = 0.0;
180     }
181
182     /* Start outer loop over neighborlists */
183     for(iidx=0; iidx<nri; iidx++)
184     {
185         /* Load shift vector for this list */
186         i_shift_offset   = DIM*shiftidx[iidx];
187
188         /* Load limits for loop over neighbors */
189         j_index_start    = jindex[iidx];
190         j_index_end      = jindex[iidx+1];
191
192         /* Get outer coordinate index */
193         inr              = iinr[iidx];
194         i_coord_offset   = DIM*inr;
195
196         /* Load i particle coords and add shift vector */
197         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
198                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
199
200         fix0             = _mm_setzero_ps();
201         fiy0             = _mm_setzero_ps();
202         fiz0             = _mm_setzero_ps();
203         fix1             = _mm_setzero_ps();
204         fiy1             = _mm_setzero_ps();
205         fiz1             = _mm_setzero_ps();
206         fix2             = _mm_setzero_ps();
207         fiy2             = _mm_setzero_ps();
208         fiz2             = _mm_setzero_ps();
209         fix3             = _mm_setzero_ps();
210         fiy3             = _mm_setzero_ps();
211         fiz3             = _mm_setzero_ps();
212
213         /* Reset potential sums */
214         velecsum         = _mm_setzero_ps();
215         vvdwsum          = _mm_setzero_ps();
216
217         /* Start inner kernel loop */
218         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
219         {
220
221             /* Get j neighbor index, and coordinate index */
222             jnrA             = jjnr[jidx];
223             jnrB             = jjnr[jidx+1];
224             jnrC             = jjnr[jidx+2];
225             jnrD             = jjnr[jidx+3];
226             j_coord_offsetA  = DIM*jnrA;
227             j_coord_offsetB  = DIM*jnrB;
228             j_coord_offsetC  = DIM*jnrC;
229             j_coord_offsetD  = DIM*jnrD;
230
231             /* load j atom coordinates */
232             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
233                                               x+j_coord_offsetC,x+j_coord_offsetD,
234                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
235                                               &jy2,&jz2,&jx3,&jy3,&jz3);
236
237             /* Calculate displacement vector */
238             dx00             = _mm_sub_ps(ix0,jx0);
239             dy00             = _mm_sub_ps(iy0,jy0);
240             dz00             = _mm_sub_ps(iz0,jz0);
241             dx11             = _mm_sub_ps(ix1,jx1);
242             dy11             = _mm_sub_ps(iy1,jy1);
243             dz11             = _mm_sub_ps(iz1,jz1);
244             dx12             = _mm_sub_ps(ix1,jx2);
245             dy12             = _mm_sub_ps(iy1,jy2);
246             dz12             = _mm_sub_ps(iz1,jz2);
247             dx13             = _mm_sub_ps(ix1,jx3);
248             dy13             = _mm_sub_ps(iy1,jy3);
249             dz13             = _mm_sub_ps(iz1,jz3);
250             dx21             = _mm_sub_ps(ix2,jx1);
251             dy21             = _mm_sub_ps(iy2,jy1);
252             dz21             = _mm_sub_ps(iz2,jz1);
253             dx22             = _mm_sub_ps(ix2,jx2);
254             dy22             = _mm_sub_ps(iy2,jy2);
255             dz22             = _mm_sub_ps(iz2,jz2);
256             dx23             = _mm_sub_ps(ix2,jx3);
257             dy23             = _mm_sub_ps(iy2,jy3);
258             dz23             = _mm_sub_ps(iz2,jz3);
259             dx31             = _mm_sub_ps(ix3,jx1);
260             dy31             = _mm_sub_ps(iy3,jy1);
261             dz31             = _mm_sub_ps(iz3,jz1);
262             dx32             = _mm_sub_ps(ix3,jx2);
263             dy32             = _mm_sub_ps(iy3,jy2);
264             dz32             = _mm_sub_ps(iz3,jz2);
265             dx33             = _mm_sub_ps(ix3,jx3);
266             dy33             = _mm_sub_ps(iy3,jy3);
267             dz33             = _mm_sub_ps(iz3,jz3);
268
269             /* Calculate squared distance and things based on it */
270             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
271             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
272             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
273             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
274             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
275             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
276             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
277             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
278             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
279             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
280
281             rinv11           = gmx_mm_invsqrt_ps(rsq11);
282             rinv12           = gmx_mm_invsqrt_ps(rsq12);
283             rinv13           = gmx_mm_invsqrt_ps(rsq13);
284             rinv21           = gmx_mm_invsqrt_ps(rsq21);
285             rinv22           = gmx_mm_invsqrt_ps(rsq22);
286             rinv23           = gmx_mm_invsqrt_ps(rsq23);
287             rinv31           = gmx_mm_invsqrt_ps(rsq31);
288             rinv32           = gmx_mm_invsqrt_ps(rsq32);
289             rinv33           = gmx_mm_invsqrt_ps(rsq33);
290
291             rinvsq00         = gmx_mm_inv_ps(rsq00);
292             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
293             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
294             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
295             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
296             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
297             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
298             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
299             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
300             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
301
302             fjx0             = _mm_setzero_ps();
303             fjy0             = _mm_setzero_ps();
304             fjz0             = _mm_setzero_ps();
305             fjx1             = _mm_setzero_ps();
306             fjy1             = _mm_setzero_ps();
307             fjz1             = _mm_setzero_ps();
308             fjx2             = _mm_setzero_ps();
309             fjy2             = _mm_setzero_ps();
310             fjz2             = _mm_setzero_ps();
311             fjx3             = _mm_setzero_ps();
312             fjy3             = _mm_setzero_ps();
313             fjz3             = _mm_setzero_ps();
314
315             /**************************
316              * CALCULATE INTERACTIONS *
317              **************************/
318
319             if (gmx_mm_any_lt(rsq00,rcutoff2))
320             {
321
322             /* LENNARD-JONES DISPERSION/REPULSION */
323
324             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
325             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
326             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
327             vvdw             = _mm_msub_ps(_mm_nmacc_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
328                                           _mm_mul_ps( _mm_nmacc_ps(c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
329             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
330
331             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
332
333             /* Update potential sum for this i atom from the interaction with this j atom. */
334             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
335             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
336
337             fscal            = fvdw;
338
339             fscal            = _mm_and_ps(fscal,cutoff_mask);
340
341              /* Update vectorial force */
342             fix0             = _mm_macc_ps(dx00,fscal,fix0);
343             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
344             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
345
346             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
347             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
348             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
349
350             }
351
352             /**************************
353              * CALCULATE INTERACTIONS *
354              **************************/
355
356             if (gmx_mm_any_lt(rsq11,rcutoff2))
357             {
358
359             r11              = _mm_mul_ps(rsq11,rinv11);
360
361             /* EWALD ELECTROSTATICS */
362
363             /* Analytical PME correction */
364             zeta2            = _mm_mul_ps(beta2,rsq11);
365             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
366             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
367             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
368             felec            = _mm_mul_ps(qq11,felec);
369             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
370             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv11,sh_ewald));
371             velec            = _mm_mul_ps(qq11,velec);
372
373             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
374
375             /* Update potential sum for this i atom from the interaction with this j atom. */
376             velec            = _mm_and_ps(velec,cutoff_mask);
377             velecsum         = _mm_add_ps(velecsum,velec);
378
379             fscal            = felec;
380
381             fscal            = _mm_and_ps(fscal,cutoff_mask);
382
383              /* Update vectorial force */
384             fix1             = _mm_macc_ps(dx11,fscal,fix1);
385             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
386             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
387
388             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
389             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
390             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
391
392             }
393
394             /**************************
395              * CALCULATE INTERACTIONS *
396              **************************/
397
398             if (gmx_mm_any_lt(rsq12,rcutoff2))
399             {
400
401             r12              = _mm_mul_ps(rsq12,rinv12);
402
403             /* EWALD ELECTROSTATICS */
404
405             /* Analytical PME correction */
406             zeta2            = _mm_mul_ps(beta2,rsq12);
407             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
408             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
409             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
410             felec            = _mm_mul_ps(qq12,felec);
411             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
412             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv12,sh_ewald));
413             velec            = _mm_mul_ps(qq12,velec);
414
415             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
416
417             /* Update potential sum for this i atom from the interaction with this j atom. */
418             velec            = _mm_and_ps(velec,cutoff_mask);
419             velecsum         = _mm_add_ps(velecsum,velec);
420
421             fscal            = felec;
422
423             fscal            = _mm_and_ps(fscal,cutoff_mask);
424
425              /* Update vectorial force */
426             fix1             = _mm_macc_ps(dx12,fscal,fix1);
427             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
428             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
429
430             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
431             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
432             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
433
434             }
435
436             /**************************
437              * CALCULATE INTERACTIONS *
438              **************************/
439
440             if (gmx_mm_any_lt(rsq13,rcutoff2))
441             {
442
443             r13              = _mm_mul_ps(rsq13,rinv13);
444
445             /* EWALD ELECTROSTATICS */
446
447             /* Analytical PME correction */
448             zeta2            = _mm_mul_ps(beta2,rsq13);
449             rinv3            = _mm_mul_ps(rinvsq13,rinv13);
450             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
451             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
452             felec            = _mm_mul_ps(qq13,felec);
453             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
454             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv13,sh_ewald));
455             velec            = _mm_mul_ps(qq13,velec);
456
457             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
458
459             /* Update potential sum for this i atom from the interaction with this j atom. */
460             velec            = _mm_and_ps(velec,cutoff_mask);
461             velecsum         = _mm_add_ps(velecsum,velec);
462
463             fscal            = felec;
464
465             fscal            = _mm_and_ps(fscal,cutoff_mask);
466
467              /* Update vectorial force */
468             fix1             = _mm_macc_ps(dx13,fscal,fix1);
469             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
470             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
471
472             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
473             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
474             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
475
476             }
477
478             /**************************
479              * CALCULATE INTERACTIONS *
480              **************************/
481
482             if (gmx_mm_any_lt(rsq21,rcutoff2))
483             {
484
485             r21              = _mm_mul_ps(rsq21,rinv21);
486
487             /* EWALD ELECTROSTATICS */
488
489             /* Analytical PME correction */
490             zeta2            = _mm_mul_ps(beta2,rsq21);
491             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
492             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
493             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
494             felec            = _mm_mul_ps(qq21,felec);
495             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
496             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv21,sh_ewald));
497             velec            = _mm_mul_ps(qq21,velec);
498
499             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
500
501             /* Update potential sum for this i atom from the interaction with this j atom. */
502             velec            = _mm_and_ps(velec,cutoff_mask);
503             velecsum         = _mm_add_ps(velecsum,velec);
504
505             fscal            = felec;
506
507             fscal            = _mm_and_ps(fscal,cutoff_mask);
508
509              /* Update vectorial force */
510             fix2             = _mm_macc_ps(dx21,fscal,fix2);
511             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
512             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
513
514             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
515             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
516             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
517
518             }
519
520             /**************************
521              * CALCULATE INTERACTIONS *
522              **************************/
523
524             if (gmx_mm_any_lt(rsq22,rcutoff2))
525             {
526
527             r22              = _mm_mul_ps(rsq22,rinv22);
528
529             /* EWALD ELECTROSTATICS */
530
531             /* Analytical PME correction */
532             zeta2            = _mm_mul_ps(beta2,rsq22);
533             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
534             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
535             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
536             felec            = _mm_mul_ps(qq22,felec);
537             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
538             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv22,sh_ewald));
539             velec            = _mm_mul_ps(qq22,velec);
540
541             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
542
543             /* Update potential sum for this i atom from the interaction with this j atom. */
544             velec            = _mm_and_ps(velec,cutoff_mask);
545             velecsum         = _mm_add_ps(velecsum,velec);
546
547             fscal            = felec;
548
549             fscal            = _mm_and_ps(fscal,cutoff_mask);
550
551              /* Update vectorial force */
552             fix2             = _mm_macc_ps(dx22,fscal,fix2);
553             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
554             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
555
556             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
557             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
558             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
559
560             }
561
562             /**************************
563              * CALCULATE INTERACTIONS *
564              **************************/
565
566             if (gmx_mm_any_lt(rsq23,rcutoff2))
567             {
568
569             r23              = _mm_mul_ps(rsq23,rinv23);
570
571             /* EWALD ELECTROSTATICS */
572
573             /* Analytical PME correction */
574             zeta2            = _mm_mul_ps(beta2,rsq23);
575             rinv3            = _mm_mul_ps(rinvsq23,rinv23);
576             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
577             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
578             felec            = _mm_mul_ps(qq23,felec);
579             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
580             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv23,sh_ewald));
581             velec            = _mm_mul_ps(qq23,velec);
582
583             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
584
585             /* Update potential sum for this i atom from the interaction with this j atom. */
586             velec            = _mm_and_ps(velec,cutoff_mask);
587             velecsum         = _mm_add_ps(velecsum,velec);
588
589             fscal            = felec;
590
591             fscal            = _mm_and_ps(fscal,cutoff_mask);
592
593              /* Update vectorial force */
594             fix2             = _mm_macc_ps(dx23,fscal,fix2);
595             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
596             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
597
598             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
599             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
600             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
601
602             }
603
604             /**************************
605              * CALCULATE INTERACTIONS *
606              **************************/
607
608             if (gmx_mm_any_lt(rsq31,rcutoff2))
609             {
610
611             r31              = _mm_mul_ps(rsq31,rinv31);
612
613             /* EWALD ELECTROSTATICS */
614
615             /* Analytical PME correction */
616             zeta2            = _mm_mul_ps(beta2,rsq31);
617             rinv3            = _mm_mul_ps(rinvsq31,rinv31);
618             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
619             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
620             felec            = _mm_mul_ps(qq31,felec);
621             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
622             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv31,sh_ewald));
623             velec            = _mm_mul_ps(qq31,velec);
624
625             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
626
627             /* Update potential sum for this i atom from the interaction with this j atom. */
628             velec            = _mm_and_ps(velec,cutoff_mask);
629             velecsum         = _mm_add_ps(velecsum,velec);
630
631             fscal            = felec;
632
633             fscal            = _mm_and_ps(fscal,cutoff_mask);
634
635              /* Update vectorial force */
636             fix3             = _mm_macc_ps(dx31,fscal,fix3);
637             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
638             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
639
640             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
641             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
642             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
643
644             }
645
646             /**************************
647              * CALCULATE INTERACTIONS *
648              **************************/
649
650             if (gmx_mm_any_lt(rsq32,rcutoff2))
651             {
652
653             r32              = _mm_mul_ps(rsq32,rinv32);
654
655             /* EWALD ELECTROSTATICS */
656
657             /* Analytical PME correction */
658             zeta2            = _mm_mul_ps(beta2,rsq32);
659             rinv3            = _mm_mul_ps(rinvsq32,rinv32);
660             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
661             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
662             felec            = _mm_mul_ps(qq32,felec);
663             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
664             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv32,sh_ewald));
665             velec            = _mm_mul_ps(qq32,velec);
666
667             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
668
669             /* Update potential sum for this i atom from the interaction with this j atom. */
670             velec            = _mm_and_ps(velec,cutoff_mask);
671             velecsum         = _mm_add_ps(velecsum,velec);
672
673             fscal            = felec;
674
675             fscal            = _mm_and_ps(fscal,cutoff_mask);
676
677              /* Update vectorial force */
678             fix3             = _mm_macc_ps(dx32,fscal,fix3);
679             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
680             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
681
682             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
683             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
684             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
685
686             }
687
688             /**************************
689              * CALCULATE INTERACTIONS *
690              **************************/
691
692             if (gmx_mm_any_lt(rsq33,rcutoff2))
693             {
694
695             r33              = _mm_mul_ps(rsq33,rinv33);
696
697             /* EWALD ELECTROSTATICS */
698
699             /* Analytical PME correction */
700             zeta2            = _mm_mul_ps(beta2,rsq33);
701             rinv3            = _mm_mul_ps(rinvsq33,rinv33);
702             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
703             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
704             felec            = _mm_mul_ps(qq33,felec);
705             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
706             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv33,sh_ewald));
707             velec            = _mm_mul_ps(qq33,velec);
708
709             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
710
711             /* Update potential sum for this i atom from the interaction with this j atom. */
712             velec            = _mm_and_ps(velec,cutoff_mask);
713             velecsum         = _mm_add_ps(velecsum,velec);
714
715             fscal            = felec;
716
717             fscal            = _mm_and_ps(fscal,cutoff_mask);
718
719              /* Update vectorial force */
720             fix3             = _mm_macc_ps(dx33,fscal,fix3);
721             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
722             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
723
724             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
725             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
726             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
727
728             }
729
730             fjptrA             = f+j_coord_offsetA;
731             fjptrB             = f+j_coord_offsetB;
732             fjptrC             = f+j_coord_offsetC;
733             fjptrD             = f+j_coord_offsetD;
734
735             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
736                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
737                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
738
739             /* Inner loop uses 344 flops */
740         }
741
742         if(jidx<j_index_end)
743         {
744
745             /* Get j neighbor index, and coordinate index */
746             jnrlistA         = jjnr[jidx];
747             jnrlistB         = jjnr[jidx+1];
748             jnrlistC         = jjnr[jidx+2];
749             jnrlistD         = jjnr[jidx+3];
750             /* Sign of each element will be negative for non-real atoms.
751              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
752              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
753              */
754             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
755             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
756             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
757             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
758             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
759             j_coord_offsetA  = DIM*jnrA;
760             j_coord_offsetB  = DIM*jnrB;
761             j_coord_offsetC  = DIM*jnrC;
762             j_coord_offsetD  = DIM*jnrD;
763
764             /* load j atom coordinates */
765             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
766                                               x+j_coord_offsetC,x+j_coord_offsetD,
767                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
768                                               &jy2,&jz2,&jx3,&jy3,&jz3);
769
770             /* Calculate displacement vector */
771             dx00             = _mm_sub_ps(ix0,jx0);
772             dy00             = _mm_sub_ps(iy0,jy0);
773             dz00             = _mm_sub_ps(iz0,jz0);
774             dx11             = _mm_sub_ps(ix1,jx1);
775             dy11             = _mm_sub_ps(iy1,jy1);
776             dz11             = _mm_sub_ps(iz1,jz1);
777             dx12             = _mm_sub_ps(ix1,jx2);
778             dy12             = _mm_sub_ps(iy1,jy2);
779             dz12             = _mm_sub_ps(iz1,jz2);
780             dx13             = _mm_sub_ps(ix1,jx3);
781             dy13             = _mm_sub_ps(iy1,jy3);
782             dz13             = _mm_sub_ps(iz1,jz3);
783             dx21             = _mm_sub_ps(ix2,jx1);
784             dy21             = _mm_sub_ps(iy2,jy1);
785             dz21             = _mm_sub_ps(iz2,jz1);
786             dx22             = _mm_sub_ps(ix2,jx2);
787             dy22             = _mm_sub_ps(iy2,jy2);
788             dz22             = _mm_sub_ps(iz2,jz2);
789             dx23             = _mm_sub_ps(ix2,jx3);
790             dy23             = _mm_sub_ps(iy2,jy3);
791             dz23             = _mm_sub_ps(iz2,jz3);
792             dx31             = _mm_sub_ps(ix3,jx1);
793             dy31             = _mm_sub_ps(iy3,jy1);
794             dz31             = _mm_sub_ps(iz3,jz1);
795             dx32             = _mm_sub_ps(ix3,jx2);
796             dy32             = _mm_sub_ps(iy3,jy2);
797             dz32             = _mm_sub_ps(iz3,jz2);
798             dx33             = _mm_sub_ps(ix3,jx3);
799             dy33             = _mm_sub_ps(iy3,jy3);
800             dz33             = _mm_sub_ps(iz3,jz3);
801
802             /* Calculate squared distance and things based on it */
803             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
804             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
805             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
806             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
807             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
808             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
809             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
810             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
811             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
812             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
813
814             rinv11           = gmx_mm_invsqrt_ps(rsq11);
815             rinv12           = gmx_mm_invsqrt_ps(rsq12);
816             rinv13           = gmx_mm_invsqrt_ps(rsq13);
817             rinv21           = gmx_mm_invsqrt_ps(rsq21);
818             rinv22           = gmx_mm_invsqrt_ps(rsq22);
819             rinv23           = gmx_mm_invsqrt_ps(rsq23);
820             rinv31           = gmx_mm_invsqrt_ps(rsq31);
821             rinv32           = gmx_mm_invsqrt_ps(rsq32);
822             rinv33           = gmx_mm_invsqrt_ps(rsq33);
823
824             rinvsq00         = gmx_mm_inv_ps(rsq00);
825             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
826             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
827             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
828             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
829             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
830             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
831             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
832             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
833             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
834
835             fjx0             = _mm_setzero_ps();
836             fjy0             = _mm_setzero_ps();
837             fjz0             = _mm_setzero_ps();
838             fjx1             = _mm_setzero_ps();
839             fjy1             = _mm_setzero_ps();
840             fjz1             = _mm_setzero_ps();
841             fjx2             = _mm_setzero_ps();
842             fjy2             = _mm_setzero_ps();
843             fjz2             = _mm_setzero_ps();
844             fjx3             = _mm_setzero_ps();
845             fjy3             = _mm_setzero_ps();
846             fjz3             = _mm_setzero_ps();
847
848             /**************************
849              * CALCULATE INTERACTIONS *
850              **************************/
851
852             if (gmx_mm_any_lt(rsq00,rcutoff2))
853             {
854
855             /* LENNARD-JONES DISPERSION/REPULSION */
856
857             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
858             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
859             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
860             vvdw             = _mm_msub_ps(_mm_nmacc_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
861                                           _mm_mul_ps( _mm_nmacc_ps(c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
862             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
863
864             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
865
866             /* Update potential sum for this i atom from the interaction with this j atom. */
867             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
868             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
869             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
870
871             fscal            = fvdw;
872
873             fscal            = _mm_and_ps(fscal,cutoff_mask);
874
875             fscal            = _mm_andnot_ps(dummy_mask,fscal);
876
877              /* Update vectorial force */
878             fix0             = _mm_macc_ps(dx00,fscal,fix0);
879             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
880             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
881
882             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
883             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
884             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
885
886             }
887
888             /**************************
889              * CALCULATE INTERACTIONS *
890              **************************/
891
892             if (gmx_mm_any_lt(rsq11,rcutoff2))
893             {
894
895             r11              = _mm_mul_ps(rsq11,rinv11);
896             r11              = _mm_andnot_ps(dummy_mask,r11);
897
898             /* EWALD ELECTROSTATICS */
899
900             /* Analytical PME correction */
901             zeta2            = _mm_mul_ps(beta2,rsq11);
902             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
903             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
904             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
905             felec            = _mm_mul_ps(qq11,felec);
906             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
907             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv11,sh_ewald));
908             velec            = _mm_mul_ps(qq11,velec);
909
910             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
911
912             /* Update potential sum for this i atom from the interaction with this j atom. */
913             velec            = _mm_and_ps(velec,cutoff_mask);
914             velec            = _mm_andnot_ps(dummy_mask,velec);
915             velecsum         = _mm_add_ps(velecsum,velec);
916
917             fscal            = felec;
918
919             fscal            = _mm_and_ps(fscal,cutoff_mask);
920
921             fscal            = _mm_andnot_ps(dummy_mask,fscal);
922
923              /* Update vectorial force */
924             fix1             = _mm_macc_ps(dx11,fscal,fix1);
925             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
926             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
927
928             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
929             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
930             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
931
932             }
933
934             /**************************
935              * CALCULATE INTERACTIONS *
936              **************************/
937
938             if (gmx_mm_any_lt(rsq12,rcutoff2))
939             {
940
941             r12              = _mm_mul_ps(rsq12,rinv12);
942             r12              = _mm_andnot_ps(dummy_mask,r12);
943
944             /* EWALD ELECTROSTATICS */
945
946             /* Analytical PME correction */
947             zeta2            = _mm_mul_ps(beta2,rsq12);
948             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
949             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
950             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
951             felec            = _mm_mul_ps(qq12,felec);
952             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
953             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv12,sh_ewald));
954             velec            = _mm_mul_ps(qq12,velec);
955
956             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
957
958             /* Update potential sum for this i atom from the interaction with this j atom. */
959             velec            = _mm_and_ps(velec,cutoff_mask);
960             velec            = _mm_andnot_ps(dummy_mask,velec);
961             velecsum         = _mm_add_ps(velecsum,velec);
962
963             fscal            = felec;
964
965             fscal            = _mm_and_ps(fscal,cutoff_mask);
966
967             fscal            = _mm_andnot_ps(dummy_mask,fscal);
968
969              /* Update vectorial force */
970             fix1             = _mm_macc_ps(dx12,fscal,fix1);
971             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
972             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
973
974             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
975             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
976             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
977
978             }
979
980             /**************************
981              * CALCULATE INTERACTIONS *
982              **************************/
983
984             if (gmx_mm_any_lt(rsq13,rcutoff2))
985             {
986
987             r13              = _mm_mul_ps(rsq13,rinv13);
988             r13              = _mm_andnot_ps(dummy_mask,r13);
989
990             /* EWALD ELECTROSTATICS */
991
992             /* Analytical PME correction */
993             zeta2            = _mm_mul_ps(beta2,rsq13);
994             rinv3            = _mm_mul_ps(rinvsq13,rinv13);
995             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
996             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
997             felec            = _mm_mul_ps(qq13,felec);
998             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
999             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv13,sh_ewald));
1000             velec            = _mm_mul_ps(qq13,velec);
1001
1002             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
1003
1004             /* Update potential sum for this i atom from the interaction with this j atom. */
1005             velec            = _mm_and_ps(velec,cutoff_mask);
1006             velec            = _mm_andnot_ps(dummy_mask,velec);
1007             velecsum         = _mm_add_ps(velecsum,velec);
1008
1009             fscal            = felec;
1010
1011             fscal            = _mm_and_ps(fscal,cutoff_mask);
1012
1013             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1014
1015              /* Update vectorial force */
1016             fix1             = _mm_macc_ps(dx13,fscal,fix1);
1017             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
1018             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
1019
1020             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
1021             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
1022             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
1023
1024             }
1025
1026             /**************************
1027              * CALCULATE INTERACTIONS *
1028              **************************/
1029
1030             if (gmx_mm_any_lt(rsq21,rcutoff2))
1031             {
1032
1033             r21              = _mm_mul_ps(rsq21,rinv21);
1034             r21              = _mm_andnot_ps(dummy_mask,r21);
1035
1036             /* EWALD ELECTROSTATICS */
1037
1038             /* Analytical PME correction */
1039             zeta2            = _mm_mul_ps(beta2,rsq21);
1040             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
1041             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1042             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1043             felec            = _mm_mul_ps(qq21,felec);
1044             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1045             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv21,sh_ewald));
1046             velec            = _mm_mul_ps(qq21,velec);
1047
1048             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1049
1050             /* Update potential sum for this i atom from the interaction with this j atom. */
1051             velec            = _mm_and_ps(velec,cutoff_mask);
1052             velec            = _mm_andnot_ps(dummy_mask,velec);
1053             velecsum         = _mm_add_ps(velecsum,velec);
1054
1055             fscal            = felec;
1056
1057             fscal            = _mm_and_ps(fscal,cutoff_mask);
1058
1059             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1060
1061              /* Update vectorial force */
1062             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1063             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1064             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1065
1066             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1067             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1068             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1069
1070             }
1071
1072             /**************************
1073              * CALCULATE INTERACTIONS *
1074              **************************/
1075
1076             if (gmx_mm_any_lt(rsq22,rcutoff2))
1077             {
1078
1079             r22              = _mm_mul_ps(rsq22,rinv22);
1080             r22              = _mm_andnot_ps(dummy_mask,r22);
1081
1082             /* EWALD ELECTROSTATICS */
1083
1084             /* Analytical PME correction */
1085             zeta2            = _mm_mul_ps(beta2,rsq22);
1086             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
1087             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1088             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1089             felec            = _mm_mul_ps(qq22,felec);
1090             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1091             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv22,sh_ewald));
1092             velec            = _mm_mul_ps(qq22,velec);
1093
1094             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1095
1096             /* Update potential sum for this i atom from the interaction with this j atom. */
1097             velec            = _mm_and_ps(velec,cutoff_mask);
1098             velec            = _mm_andnot_ps(dummy_mask,velec);
1099             velecsum         = _mm_add_ps(velecsum,velec);
1100
1101             fscal            = felec;
1102
1103             fscal            = _mm_and_ps(fscal,cutoff_mask);
1104
1105             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1106
1107              /* Update vectorial force */
1108             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1109             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1110             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1111
1112             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1113             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1114             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1115
1116             }
1117
1118             /**************************
1119              * CALCULATE INTERACTIONS *
1120              **************************/
1121
1122             if (gmx_mm_any_lt(rsq23,rcutoff2))
1123             {
1124
1125             r23              = _mm_mul_ps(rsq23,rinv23);
1126             r23              = _mm_andnot_ps(dummy_mask,r23);
1127
1128             /* EWALD ELECTROSTATICS */
1129
1130             /* Analytical PME correction */
1131             zeta2            = _mm_mul_ps(beta2,rsq23);
1132             rinv3            = _mm_mul_ps(rinvsq23,rinv23);
1133             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1134             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1135             felec            = _mm_mul_ps(qq23,felec);
1136             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1137             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv23,sh_ewald));
1138             velec            = _mm_mul_ps(qq23,velec);
1139
1140             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
1141
1142             /* Update potential sum for this i atom from the interaction with this j atom. */
1143             velec            = _mm_and_ps(velec,cutoff_mask);
1144             velec            = _mm_andnot_ps(dummy_mask,velec);
1145             velecsum         = _mm_add_ps(velecsum,velec);
1146
1147             fscal            = felec;
1148
1149             fscal            = _mm_and_ps(fscal,cutoff_mask);
1150
1151             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1152
1153              /* Update vectorial force */
1154             fix2             = _mm_macc_ps(dx23,fscal,fix2);
1155             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
1156             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
1157
1158             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
1159             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
1160             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
1161
1162             }
1163
1164             /**************************
1165              * CALCULATE INTERACTIONS *
1166              **************************/
1167
1168             if (gmx_mm_any_lt(rsq31,rcutoff2))
1169             {
1170
1171             r31              = _mm_mul_ps(rsq31,rinv31);
1172             r31              = _mm_andnot_ps(dummy_mask,r31);
1173
1174             /* EWALD ELECTROSTATICS */
1175
1176             /* Analytical PME correction */
1177             zeta2            = _mm_mul_ps(beta2,rsq31);
1178             rinv3            = _mm_mul_ps(rinvsq31,rinv31);
1179             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1180             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1181             felec            = _mm_mul_ps(qq31,felec);
1182             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1183             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv31,sh_ewald));
1184             velec            = _mm_mul_ps(qq31,velec);
1185
1186             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
1187
1188             /* Update potential sum for this i atom from the interaction with this j atom. */
1189             velec            = _mm_and_ps(velec,cutoff_mask);
1190             velec            = _mm_andnot_ps(dummy_mask,velec);
1191             velecsum         = _mm_add_ps(velecsum,velec);
1192
1193             fscal            = felec;
1194
1195             fscal            = _mm_and_ps(fscal,cutoff_mask);
1196
1197             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1198
1199              /* Update vectorial force */
1200             fix3             = _mm_macc_ps(dx31,fscal,fix3);
1201             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
1202             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
1203
1204             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
1205             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
1206             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
1207
1208             }
1209
1210             /**************************
1211              * CALCULATE INTERACTIONS *
1212              **************************/
1213
1214             if (gmx_mm_any_lt(rsq32,rcutoff2))
1215             {
1216
1217             r32              = _mm_mul_ps(rsq32,rinv32);
1218             r32              = _mm_andnot_ps(dummy_mask,r32);
1219
1220             /* EWALD ELECTROSTATICS */
1221
1222             /* Analytical PME correction */
1223             zeta2            = _mm_mul_ps(beta2,rsq32);
1224             rinv3            = _mm_mul_ps(rinvsq32,rinv32);
1225             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1226             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1227             felec            = _mm_mul_ps(qq32,felec);
1228             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1229             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv32,sh_ewald));
1230             velec            = _mm_mul_ps(qq32,velec);
1231
1232             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
1233
1234             /* Update potential sum for this i atom from the interaction with this j atom. */
1235             velec            = _mm_and_ps(velec,cutoff_mask);
1236             velec            = _mm_andnot_ps(dummy_mask,velec);
1237             velecsum         = _mm_add_ps(velecsum,velec);
1238
1239             fscal            = felec;
1240
1241             fscal            = _mm_and_ps(fscal,cutoff_mask);
1242
1243             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1244
1245              /* Update vectorial force */
1246             fix3             = _mm_macc_ps(dx32,fscal,fix3);
1247             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
1248             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
1249
1250             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
1251             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
1252             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
1253
1254             }
1255
1256             /**************************
1257              * CALCULATE INTERACTIONS *
1258              **************************/
1259
1260             if (gmx_mm_any_lt(rsq33,rcutoff2))
1261             {
1262
1263             r33              = _mm_mul_ps(rsq33,rinv33);
1264             r33              = _mm_andnot_ps(dummy_mask,r33);
1265
1266             /* EWALD ELECTROSTATICS */
1267
1268             /* Analytical PME correction */
1269             zeta2            = _mm_mul_ps(beta2,rsq33);
1270             rinv3            = _mm_mul_ps(rinvsq33,rinv33);
1271             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1272             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1273             felec            = _mm_mul_ps(qq33,felec);
1274             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1275             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv33,sh_ewald));
1276             velec            = _mm_mul_ps(qq33,velec);
1277
1278             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
1279
1280             /* Update potential sum for this i atom from the interaction with this j atom. */
1281             velec            = _mm_and_ps(velec,cutoff_mask);
1282             velec            = _mm_andnot_ps(dummy_mask,velec);
1283             velecsum         = _mm_add_ps(velecsum,velec);
1284
1285             fscal            = felec;
1286
1287             fscal            = _mm_and_ps(fscal,cutoff_mask);
1288
1289             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1290
1291              /* Update vectorial force */
1292             fix3             = _mm_macc_ps(dx33,fscal,fix3);
1293             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
1294             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
1295
1296             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
1297             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
1298             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
1299
1300             }
1301
1302             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1303             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1304             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1305             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1306
1307             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1308                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1309                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1310
1311             /* Inner loop uses 353 flops */
1312         }
1313
1314         /* End of innermost loop */
1315
1316         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1317                                               f+i_coord_offset,fshift+i_shift_offset);
1318
1319         ggid                        = gid[iidx];
1320         /* Update potential energies */
1321         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1322         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1323
1324         /* Increment number of inner iterations */
1325         inneriter                  += j_index_end - j_index_start;
1326
1327         /* Outer loop uses 26 flops */
1328     }
1329
1330     /* Increment number of outer iterations */
1331     outeriter        += nri;
1332
1333     /* Update outer/inner flops */
1334
1335     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*353);
1336 }
1337 /*
1338  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_avx_128_fma_single
1339  * Electrostatics interaction: Ewald
1340  * VdW interaction:            LennardJones
1341  * Geometry:                   Water4-Water4
1342  * Calculate force/pot:        Force
1343  */
1344 void
1345 nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_avx_128_fma_single
1346                     (t_nblist * gmx_restrict                nlist,
1347                      rvec * gmx_restrict                    xx,
1348                      rvec * gmx_restrict                    ff,
1349                      t_forcerec * gmx_restrict              fr,
1350                      t_mdatoms * gmx_restrict               mdatoms,
1351                      nb_kernel_data_t * gmx_restrict        kernel_data,
1352                      t_nrnb * gmx_restrict                  nrnb)
1353 {
1354     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1355      * just 0 for non-waters.
1356      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1357      * jnr indices corresponding to data put in the four positions in the SIMD register.
1358      */
1359     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1360     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1361     int              jnrA,jnrB,jnrC,jnrD;
1362     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1363     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1364     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1365     real             rcutoff_scalar;
1366     real             *shiftvec,*fshift,*x,*f;
1367     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1368     real             scratch[4*DIM];
1369     __m128           fscal,rcutoff,rcutoff2,jidxall;
1370     int              vdwioffset0;
1371     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1372     int              vdwioffset1;
1373     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1374     int              vdwioffset2;
1375     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1376     int              vdwioffset3;
1377     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1378     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1379     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1380     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1381     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1382     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1383     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1384     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1385     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1386     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1387     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1388     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1389     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1390     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1391     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1392     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1393     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1394     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1395     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1396     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1397     real             *charge;
1398     int              nvdwtype;
1399     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1400     int              *vdwtype;
1401     real             *vdwparam;
1402     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1403     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1404     __m128i          ewitab;
1405     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1406     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1407     real             *ewtab;
1408     __m128           dummy_mask,cutoff_mask;
1409     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1410     __m128           one     = _mm_set1_ps(1.0);
1411     __m128           two     = _mm_set1_ps(2.0);
1412     x                = xx[0];
1413     f                = ff[0];
1414
1415     nri              = nlist->nri;
1416     iinr             = nlist->iinr;
1417     jindex           = nlist->jindex;
1418     jjnr             = nlist->jjnr;
1419     shiftidx         = nlist->shift;
1420     gid              = nlist->gid;
1421     shiftvec         = fr->shift_vec[0];
1422     fshift           = fr->fshift[0];
1423     facel            = _mm_set1_ps(fr->epsfac);
1424     charge           = mdatoms->chargeA;
1425     nvdwtype         = fr->ntype;
1426     vdwparam         = fr->nbfp;
1427     vdwtype          = mdatoms->typeA;
1428
1429     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1430     beta             = _mm_set1_ps(fr->ic->ewaldcoeff);
1431     beta2            = _mm_mul_ps(beta,beta);
1432     beta3            = _mm_mul_ps(beta,beta2);
1433     ewtab            = fr->ic->tabq_coul_F;
1434     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1435     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1436
1437     /* Setup water-specific parameters */
1438     inr              = nlist->iinr[0];
1439     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1440     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1441     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1442     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1443
1444     jq1              = _mm_set1_ps(charge[inr+1]);
1445     jq2              = _mm_set1_ps(charge[inr+2]);
1446     jq3              = _mm_set1_ps(charge[inr+3]);
1447     vdwjidx0A        = 2*vdwtype[inr+0];
1448     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1449     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1450     qq11             = _mm_mul_ps(iq1,jq1);
1451     qq12             = _mm_mul_ps(iq1,jq2);
1452     qq13             = _mm_mul_ps(iq1,jq3);
1453     qq21             = _mm_mul_ps(iq2,jq1);
1454     qq22             = _mm_mul_ps(iq2,jq2);
1455     qq23             = _mm_mul_ps(iq2,jq3);
1456     qq31             = _mm_mul_ps(iq3,jq1);
1457     qq32             = _mm_mul_ps(iq3,jq2);
1458     qq33             = _mm_mul_ps(iq3,jq3);
1459
1460     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1461     rcutoff_scalar   = fr->rcoulomb;
1462     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1463     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1464
1465     sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
1466     rvdw             = _mm_set1_ps(fr->rvdw);
1467
1468     /* Avoid stupid compiler warnings */
1469     jnrA = jnrB = jnrC = jnrD = 0;
1470     j_coord_offsetA = 0;
1471     j_coord_offsetB = 0;
1472     j_coord_offsetC = 0;
1473     j_coord_offsetD = 0;
1474
1475     outeriter        = 0;
1476     inneriter        = 0;
1477
1478     for(iidx=0;iidx<4*DIM;iidx++)
1479     {
1480         scratch[iidx] = 0.0;
1481     }
1482
1483     /* Start outer loop over neighborlists */
1484     for(iidx=0; iidx<nri; iidx++)
1485     {
1486         /* Load shift vector for this list */
1487         i_shift_offset   = DIM*shiftidx[iidx];
1488
1489         /* Load limits for loop over neighbors */
1490         j_index_start    = jindex[iidx];
1491         j_index_end      = jindex[iidx+1];
1492
1493         /* Get outer coordinate index */
1494         inr              = iinr[iidx];
1495         i_coord_offset   = DIM*inr;
1496
1497         /* Load i particle coords and add shift vector */
1498         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1499                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1500
1501         fix0             = _mm_setzero_ps();
1502         fiy0             = _mm_setzero_ps();
1503         fiz0             = _mm_setzero_ps();
1504         fix1             = _mm_setzero_ps();
1505         fiy1             = _mm_setzero_ps();
1506         fiz1             = _mm_setzero_ps();
1507         fix2             = _mm_setzero_ps();
1508         fiy2             = _mm_setzero_ps();
1509         fiz2             = _mm_setzero_ps();
1510         fix3             = _mm_setzero_ps();
1511         fiy3             = _mm_setzero_ps();
1512         fiz3             = _mm_setzero_ps();
1513
1514         /* Start inner kernel loop */
1515         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1516         {
1517
1518             /* Get j neighbor index, and coordinate index */
1519             jnrA             = jjnr[jidx];
1520             jnrB             = jjnr[jidx+1];
1521             jnrC             = jjnr[jidx+2];
1522             jnrD             = jjnr[jidx+3];
1523             j_coord_offsetA  = DIM*jnrA;
1524             j_coord_offsetB  = DIM*jnrB;
1525             j_coord_offsetC  = DIM*jnrC;
1526             j_coord_offsetD  = DIM*jnrD;
1527
1528             /* load j atom coordinates */
1529             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1530                                               x+j_coord_offsetC,x+j_coord_offsetD,
1531                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1532                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1533
1534             /* Calculate displacement vector */
1535             dx00             = _mm_sub_ps(ix0,jx0);
1536             dy00             = _mm_sub_ps(iy0,jy0);
1537             dz00             = _mm_sub_ps(iz0,jz0);
1538             dx11             = _mm_sub_ps(ix1,jx1);
1539             dy11             = _mm_sub_ps(iy1,jy1);
1540             dz11             = _mm_sub_ps(iz1,jz1);
1541             dx12             = _mm_sub_ps(ix1,jx2);
1542             dy12             = _mm_sub_ps(iy1,jy2);
1543             dz12             = _mm_sub_ps(iz1,jz2);
1544             dx13             = _mm_sub_ps(ix1,jx3);
1545             dy13             = _mm_sub_ps(iy1,jy3);
1546             dz13             = _mm_sub_ps(iz1,jz3);
1547             dx21             = _mm_sub_ps(ix2,jx1);
1548             dy21             = _mm_sub_ps(iy2,jy1);
1549             dz21             = _mm_sub_ps(iz2,jz1);
1550             dx22             = _mm_sub_ps(ix2,jx2);
1551             dy22             = _mm_sub_ps(iy2,jy2);
1552             dz22             = _mm_sub_ps(iz2,jz2);
1553             dx23             = _mm_sub_ps(ix2,jx3);
1554             dy23             = _mm_sub_ps(iy2,jy3);
1555             dz23             = _mm_sub_ps(iz2,jz3);
1556             dx31             = _mm_sub_ps(ix3,jx1);
1557             dy31             = _mm_sub_ps(iy3,jy1);
1558             dz31             = _mm_sub_ps(iz3,jz1);
1559             dx32             = _mm_sub_ps(ix3,jx2);
1560             dy32             = _mm_sub_ps(iy3,jy2);
1561             dz32             = _mm_sub_ps(iz3,jz2);
1562             dx33             = _mm_sub_ps(ix3,jx3);
1563             dy33             = _mm_sub_ps(iy3,jy3);
1564             dz33             = _mm_sub_ps(iz3,jz3);
1565
1566             /* Calculate squared distance and things based on it */
1567             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1568             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1569             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1570             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1571             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1572             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1573             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1574             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1575             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1576             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1577
1578             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1579             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1580             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1581             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1582             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1583             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1584             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1585             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1586             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1587
1588             rinvsq00         = gmx_mm_inv_ps(rsq00);
1589             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1590             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1591             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1592             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1593             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1594             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1595             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1596             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1597             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1598
1599             fjx0             = _mm_setzero_ps();
1600             fjy0             = _mm_setzero_ps();
1601             fjz0             = _mm_setzero_ps();
1602             fjx1             = _mm_setzero_ps();
1603             fjy1             = _mm_setzero_ps();
1604             fjz1             = _mm_setzero_ps();
1605             fjx2             = _mm_setzero_ps();
1606             fjy2             = _mm_setzero_ps();
1607             fjz2             = _mm_setzero_ps();
1608             fjx3             = _mm_setzero_ps();
1609             fjy3             = _mm_setzero_ps();
1610             fjz3             = _mm_setzero_ps();
1611
1612             /**************************
1613              * CALCULATE INTERACTIONS *
1614              **************************/
1615
1616             if (gmx_mm_any_lt(rsq00,rcutoff2))
1617             {
1618
1619             /* LENNARD-JONES DISPERSION/REPULSION */
1620
1621             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1622             fvdw             = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1623
1624             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1625
1626             fscal            = fvdw;
1627
1628             fscal            = _mm_and_ps(fscal,cutoff_mask);
1629
1630              /* Update vectorial force */
1631             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1632             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1633             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1634
1635             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1636             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1637             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1638
1639             }
1640
1641             /**************************
1642              * CALCULATE INTERACTIONS *
1643              **************************/
1644
1645             if (gmx_mm_any_lt(rsq11,rcutoff2))
1646             {
1647
1648             r11              = _mm_mul_ps(rsq11,rinv11);
1649
1650             /* EWALD ELECTROSTATICS */
1651
1652             /* Analytical PME correction */
1653             zeta2            = _mm_mul_ps(beta2,rsq11);
1654             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
1655             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1656             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1657             felec            = _mm_mul_ps(qq11,felec);
1658
1659             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1660
1661             fscal            = felec;
1662
1663             fscal            = _mm_and_ps(fscal,cutoff_mask);
1664
1665              /* Update vectorial force */
1666             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1667             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1668             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1669
1670             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1671             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1672             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1673
1674             }
1675
1676             /**************************
1677              * CALCULATE INTERACTIONS *
1678              **************************/
1679
1680             if (gmx_mm_any_lt(rsq12,rcutoff2))
1681             {
1682
1683             r12              = _mm_mul_ps(rsq12,rinv12);
1684
1685             /* EWALD ELECTROSTATICS */
1686
1687             /* Analytical PME correction */
1688             zeta2            = _mm_mul_ps(beta2,rsq12);
1689             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
1690             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1691             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1692             felec            = _mm_mul_ps(qq12,felec);
1693
1694             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1695
1696             fscal            = felec;
1697
1698             fscal            = _mm_and_ps(fscal,cutoff_mask);
1699
1700              /* Update vectorial force */
1701             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1702             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1703             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1704
1705             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1706             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1707             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1708
1709             }
1710
1711             /**************************
1712              * CALCULATE INTERACTIONS *
1713              **************************/
1714
1715             if (gmx_mm_any_lt(rsq13,rcutoff2))
1716             {
1717
1718             r13              = _mm_mul_ps(rsq13,rinv13);
1719
1720             /* EWALD ELECTROSTATICS */
1721
1722             /* Analytical PME correction */
1723             zeta2            = _mm_mul_ps(beta2,rsq13);
1724             rinv3            = _mm_mul_ps(rinvsq13,rinv13);
1725             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1726             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1727             felec            = _mm_mul_ps(qq13,felec);
1728
1729             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
1730
1731             fscal            = felec;
1732
1733             fscal            = _mm_and_ps(fscal,cutoff_mask);
1734
1735              /* Update vectorial force */
1736             fix1             = _mm_macc_ps(dx13,fscal,fix1);
1737             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
1738             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
1739
1740             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
1741             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
1742             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
1743
1744             }
1745
1746             /**************************
1747              * CALCULATE INTERACTIONS *
1748              **************************/
1749
1750             if (gmx_mm_any_lt(rsq21,rcutoff2))
1751             {
1752
1753             r21              = _mm_mul_ps(rsq21,rinv21);
1754
1755             /* EWALD ELECTROSTATICS */
1756
1757             /* Analytical PME correction */
1758             zeta2            = _mm_mul_ps(beta2,rsq21);
1759             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
1760             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1761             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1762             felec            = _mm_mul_ps(qq21,felec);
1763
1764             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1765
1766             fscal            = felec;
1767
1768             fscal            = _mm_and_ps(fscal,cutoff_mask);
1769
1770              /* Update vectorial force */
1771             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1772             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1773             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1774
1775             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1776             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1777             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1778
1779             }
1780
1781             /**************************
1782              * CALCULATE INTERACTIONS *
1783              **************************/
1784
1785             if (gmx_mm_any_lt(rsq22,rcutoff2))
1786             {
1787
1788             r22              = _mm_mul_ps(rsq22,rinv22);
1789
1790             /* EWALD ELECTROSTATICS */
1791
1792             /* Analytical PME correction */
1793             zeta2            = _mm_mul_ps(beta2,rsq22);
1794             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
1795             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1796             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1797             felec            = _mm_mul_ps(qq22,felec);
1798
1799             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1800
1801             fscal            = felec;
1802
1803             fscal            = _mm_and_ps(fscal,cutoff_mask);
1804
1805              /* Update vectorial force */
1806             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1807             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1808             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1809
1810             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1811             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1812             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1813
1814             }
1815
1816             /**************************
1817              * CALCULATE INTERACTIONS *
1818              **************************/
1819
1820             if (gmx_mm_any_lt(rsq23,rcutoff2))
1821             {
1822
1823             r23              = _mm_mul_ps(rsq23,rinv23);
1824
1825             /* EWALD ELECTROSTATICS */
1826
1827             /* Analytical PME correction */
1828             zeta2            = _mm_mul_ps(beta2,rsq23);
1829             rinv3            = _mm_mul_ps(rinvsq23,rinv23);
1830             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1831             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1832             felec            = _mm_mul_ps(qq23,felec);
1833
1834             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
1835
1836             fscal            = felec;
1837
1838             fscal            = _mm_and_ps(fscal,cutoff_mask);
1839
1840              /* Update vectorial force */
1841             fix2             = _mm_macc_ps(dx23,fscal,fix2);
1842             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
1843             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
1844
1845             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
1846             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
1847             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
1848
1849             }
1850
1851             /**************************
1852              * CALCULATE INTERACTIONS *
1853              **************************/
1854
1855             if (gmx_mm_any_lt(rsq31,rcutoff2))
1856             {
1857
1858             r31              = _mm_mul_ps(rsq31,rinv31);
1859
1860             /* EWALD ELECTROSTATICS */
1861
1862             /* Analytical PME correction */
1863             zeta2            = _mm_mul_ps(beta2,rsq31);
1864             rinv3            = _mm_mul_ps(rinvsq31,rinv31);
1865             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1866             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1867             felec            = _mm_mul_ps(qq31,felec);
1868
1869             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
1870
1871             fscal            = felec;
1872
1873             fscal            = _mm_and_ps(fscal,cutoff_mask);
1874
1875              /* Update vectorial force */
1876             fix3             = _mm_macc_ps(dx31,fscal,fix3);
1877             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
1878             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
1879
1880             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
1881             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
1882             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
1883
1884             }
1885
1886             /**************************
1887              * CALCULATE INTERACTIONS *
1888              **************************/
1889
1890             if (gmx_mm_any_lt(rsq32,rcutoff2))
1891             {
1892
1893             r32              = _mm_mul_ps(rsq32,rinv32);
1894
1895             /* EWALD ELECTROSTATICS */
1896
1897             /* Analytical PME correction */
1898             zeta2            = _mm_mul_ps(beta2,rsq32);
1899             rinv3            = _mm_mul_ps(rinvsq32,rinv32);
1900             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1901             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1902             felec            = _mm_mul_ps(qq32,felec);
1903
1904             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
1905
1906             fscal            = felec;
1907
1908             fscal            = _mm_and_ps(fscal,cutoff_mask);
1909
1910              /* Update vectorial force */
1911             fix3             = _mm_macc_ps(dx32,fscal,fix3);
1912             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
1913             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
1914
1915             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
1916             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
1917             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
1918
1919             }
1920
1921             /**************************
1922              * CALCULATE INTERACTIONS *
1923              **************************/
1924
1925             if (gmx_mm_any_lt(rsq33,rcutoff2))
1926             {
1927
1928             r33              = _mm_mul_ps(rsq33,rinv33);
1929
1930             /* EWALD ELECTROSTATICS */
1931
1932             /* Analytical PME correction */
1933             zeta2            = _mm_mul_ps(beta2,rsq33);
1934             rinv3            = _mm_mul_ps(rinvsq33,rinv33);
1935             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1936             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1937             felec            = _mm_mul_ps(qq33,felec);
1938
1939             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
1940
1941             fscal            = felec;
1942
1943             fscal            = _mm_and_ps(fscal,cutoff_mask);
1944
1945              /* Update vectorial force */
1946             fix3             = _mm_macc_ps(dx33,fscal,fix3);
1947             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
1948             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
1949
1950             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
1951             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
1952             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
1953
1954             }
1955
1956             fjptrA             = f+j_coord_offsetA;
1957             fjptrB             = f+j_coord_offsetB;
1958             fjptrC             = f+j_coord_offsetC;
1959             fjptrD             = f+j_coord_offsetD;
1960
1961             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1962                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1963                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1964
1965             /* Inner loop uses 315 flops */
1966         }
1967
1968         if(jidx<j_index_end)
1969         {
1970
1971             /* Get j neighbor index, and coordinate index */
1972             jnrlistA         = jjnr[jidx];
1973             jnrlistB         = jjnr[jidx+1];
1974             jnrlistC         = jjnr[jidx+2];
1975             jnrlistD         = jjnr[jidx+3];
1976             /* Sign of each element will be negative for non-real atoms.
1977              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1978              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1979              */
1980             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1981             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1982             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1983             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1984             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1985             j_coord_offsetA  = DIM*jnrA;
1986             j_coord_offsetB  = DIM*jnrB;
1987             j_coord_offsetC  = DIM*jnrC;
1988             j_coord_offsetD  = DIM*jnrD;
1989
1990             /* load j atom coordinates */
1991             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1992                                               x+j_coord_offsetC,x+j_coord_offsetD,
1993                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1994                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1995
1996             /* Calculate displacement vector */
1997             dx00             = _mm_sub_ps(ix0,jx0);
1998             dy00             = _mm_sub_ps(iy0,jy0);
1999             dz00             = _mm_sub_ps(iz0,jz0);
2000             dx11             = _mm_sub_ps(ix1,jx1);
2001             dy11             = _mm_sub_ps(iy1,jy1);
2002             dz11             = _mm_sub_ps(iz1,jz1);
2003             dx12             = _mm_sub_ps(ix1,jx2);
2004             dy12             = _mm_sub_ps(iy1,jy2);
2005             dz12             = _mm_sub_ps(iz1,jz2);
2006             dx13             = _mm_sub_ps(ix1,jx3);
2007             dy13             = _mm_sub_ps(iy1,jy3);
2008             dz13             = _mm_sub_ps(iz1,jz3);
2009             dx21             = _mm_sub_ps(ix2,jx1);
2010             dy21             = _mm_sub_ps(iy2,jy1);
2011             dz21             = _mm_sub_ps(iz2,jz1);
2012             dx22             = _mm_sub_ps(ix2,jx2);
2013             dy22             = _mm_sub_ps(iy2,jy2);
2014             dz22             = _mm_sub_ps(iz2,jz2);
2015             dx23             = _mm_sub_ps(ix2,jx3);
2016             dy23             = _mm_sub_ps(iy2,jy3);
2017             dz23             = _mm_sub_ps(iz2,jz3);
2018             dx31             = _mm_sub_ps(ix3,jx1);
2019             dy31             = _mm_sub_ps(iy3,jy1);
2020             dz31             = _mm_sub_ps(iz3,jz1);
2021             dx32             = _mm_sub_ps(ix3,jx2);
2022             dy32             = _mm_sub_ps(iy3,jy2);
2023             dz32             = _mm_sub_ps(iz3,jz2);
2024             dx33             = _mm_sub_ps(ix3,jx3);
2025             dy33             = _mm_sub_ps(iy3,jy3);
2026             dz33             = _mm_sub_ps(iz3,jz3);
2027
2028             /* Calculate squared distance and things based on it */
2029             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2030             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2031             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2032             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2033             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2034             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2035             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2036             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2037             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2038             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2039
2040             rinv11           = gmx_mm_invsqrt_ps(rsq11);
2041             rinv12           = gmx_mm_invsqrt_ps(rsq12);
2042             rinv13           = gmx_mm_invsqrt_ps(rsq13);
2043             rinv21           = gmx_mm_invsqrt_ps(rsq21);
2044             rinv22           = gmx_mm_invsqrt_ps(rsq22);
2045             rinv23           = gmx_mm_invsqrt_ps(rsq23);
2046             rinv31           = gmx_mm_invsqrt_ps(rsq31);
2047             rinv32           = gmx_mm_invsqrt_ps(rsq32);
2048             rinv33           = gmx_mm_invsqrt_ps(rsq33);
2049
2050             rinvsq00         = gmx_mm_inv_ps(rsq00);
2051             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
2052             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
2053             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
2054             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
2055             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
2056             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
2057             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
2058             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
2059             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
2060
2061             fjx0             = _mm_setzero_ps();
2062             fjy0             = _mm_setzero_ps();
2063             fjz0             = _mm_setzero_ps();
2064             fjx1             = _mm_setzero_ps();
2065             fjy1             = _mm_setzero_ps();
2066             fjz1             = _mm_setzero_ps();
2067             fjx2             = _mm_setzero_ps();
2068             fjy2             = _mm_setzero_ps();
2069             fjz2             = _mm_setzero_ps();
2070             fjx3             = _mm_setzero_ps();
2071             fjy3             = _mm_setzero_ps();
2072             fjz3             = _mm_setzero_ps();
2073
2074             /**************************
2075              * CALCULATE INTERACTIONS *
2076              **************************/
2077
2078             if (gmx_mm_any_lt(rsq00,rcutoff2))
2079             {
2080
2081             /* LENNARD-JONES DISPERSION/REPULSION */
2082
2083             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2084             fvdw             = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
2085
2086             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
2087
2088             fscal            = fvdw;
2089
2090             fscal            = _mm_and_ps(fscal,cutoff_mask);
2091
2092             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2093
2094              /* Update vectorial force */
2095             fix0             = _mm_macc_ps(dx00,fscal,fix0);
2096             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
2097             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
2098
2099             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
2100             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
2101             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
2102
2103             }
2104
2105             /**************************
2106              * CALCULATE INTERACTIONS *
2107              **************************/
2108
2109             if (gmx_mm_any_lt(rsq11,rcutoff2))
2110             {
2111
2112             r11              = _mm_mul_ps(rsq11,rinv11);
2113             r11              = _mm_andnot_ps(dummy_mask,r11);
2114
2115             /* EWALD ELECTROSTATICS */
2116
2117             /* Analytical PME correction */
2118             zeta2            = _mm_mul_ps(beta2,rsq11);
2119             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
2120             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2121             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2122             felec            = _mm_mul_ps(qq11,felec);
2123
2124             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2125
2126             fscal            = felec;
2127
2128             fscal            = _mm_and_ps(fscal,cutoff_mask);
2129
2130             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2131
2132              /* Update vectorial force */
2133             fix1             = _mm_macc_ps(dx11,fscal,fix1);
2134             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
2135             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
2136
2137             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
2138             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
2139             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
2140
2141             }
2142
2143             /**************************
2144              * CALCULATE INTERACTIONS *
2145              **************************/
2146
2147             if (gmx_mm_any_lt(rsq12,rcutoff2))
2148             {
2149
2150             r12              = _mm_mul_ps(rsq12,rinv12);
2151             r12              = _mm_andnot_ps(dummy_mask,r12);
2152
2153             /* EWALD ELECTROSTATICS */
2154
2155             /* Analytical PME correction */
2156             zeta2            = _mm_mul_ps(beta2,rsq12);
2157             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
2158             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2159             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2160             felec            = _mm_mul_ps(qq12,felec);
2161
2162             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2163
2164             fscal            = felec;
2165
2166             fscal            = _mm_and_ps(fscal,cutoff_mask);
2167
2168             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2169
2170              /* Update vectorial force */
2171             fix1             = _mm_macc_ps(dx12,fscal,fix1);
2172             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
2173             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
2174
2175             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
2176             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
2177             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
2178
2179             }
2180
2181             /**************************
2182              * CALCULATE INTERACTIONS *
2183              **************************/
2184
2185             if (gmx_mm_any_lt(rsq13,rcutoff2))
2186             {
2187
2188             r13              = _mm_mul_ps(rsq13,rinv13);
2189             r13              = _mm_andnot_ps(dummy_mask,r13);
2190
2191             /* EWALD ELECTROSTATICS */
2192
2193             /* Analytical PME correction */
2194             zeta2            = _mm_mul_ps(beta2,rsq13);
2195             rinv3            = _mm_mul_ps(rinvsq13,rinv13);
2196             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2197             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2198             felec            = _mm_mul_ps(qq13,felec);
2199
2200             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
2201
2202             fscal            = felec;
2203
2204             fscal            = _mm_and_ps(fscal,cutoff_mask);
2205
2206             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2207
2208              /* Update vectorial force */
2209             fix1             = _mm_macc_ps(dx13,fscal,fix1);
2210             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
2211             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
2212
2213             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
2214             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
2215             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
2216
2217             }
2218
2219             /**************************
2220              * CALCULATE INTERACTIONS *
2221              **************************/
2222
2223             if (gmx_mm_any_lt(rsq21,rcutoff2))
2224             {
2225
2226             r21              = _mm_mul_ps(rsq21,rinv21);
2227             r21              = _mm_andnot_ps(dummy_mask,r21);
2228
2229             /* EWALD ELECTROSTATICS */
2230
2231             /* Analytical PME correction */
2232             zeta2            = _mm_mul_ps(beta2,rsq21);
2233             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
2234             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2235             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2236             felec            = _mm_mul_ps(qq21,felec);
2237
2238             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2239
2240             fscal            = felec;
2241
2242             fscal            = _mm_and_ps(fscal,cutoff_mask);
2243
2244             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2245
2246              /* Update vectorial force */
2247             fix2             = _mm_macc_ps(dx21,fscal,fix2);
2248             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
2249             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
2250
2251             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
2252             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
2253             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
2254
2255             }
2256
2257             /**************************
2258              * CALCULATE INTERACTIONS *
2259              **************************/
2260
2261             if (gmx_mm_any_lt(rsq22,rcutoff2))
2262             {
2263
2264             r22              = _mm_mul_ps(rsq22,rinv22);
2265             r22              = _mm_andnot_ps(dummy_mask,r22);
2266
2267             /* EWALD ELECTROSTATICS */
2268
2269             /* Analytical PME correction */
2270             zeta2            = _mm_mul_ps(beta2,rsq22);
2271             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
2272             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2273             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2274             felec            = _mm_mul_ps(qq22,felec);
2275
2276             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2277
2278             fscal            = felec;
2279
2280             fscal            = _mm_and_ps(fscal,cutoff_mask);
2281
2282             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2283
2284              /* Update vectorial force */
2285             fix2             = _mm_macc_ps(dx22,fscal,fix2);
2286             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
2287             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
2288
2289             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
2290             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
2291             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
2292
2293             }
2294
2295             /**************************
2296              * CALCULATE INTERACTIONS *
2297              **************************/
2298
2299             if (gmx_mm_any_lt(rsq23,rcutoff2))
2300             {
2301
2302             r23              = _mm_mul_ps(rsq23,rinv23);
2303             r23              = _mm_andnot_ps(dummy_mask,r23);
2304
2305             /* EWALD ELECTROSTATICS */
2306
2307             /* Analytical PME correction */
2308             zeta2            = _mm_mul_ps(beta2,rsq23);
2309             rinv3            = _mm_mul_ps(rinvsq23,rinv23);
2310             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2311             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2312             felec            = _mm_mul_ps(qq23,felec);
2313
2314             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
2315
2316             fscal            = felec;
2317
2318             fscal            = _mm_and_ps(fscal,cutoff_mask);
2319
2320             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2321
2322              /* Update vectorial force */
2323             fix2             = _mm_macc_ps(dx23,fscal,fix2);
2324             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
2325             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
2326
2327             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
2328             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
2329             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
2330
2331             }
2332
2333             /**************************
2334              * CALCULATE INTERACTIONS *
2335              **************************/
2336
2337             if (gmx_mm_any_lt(rsq31,rcutoff2))
2338             {
2339
2340             r31              = _mm_mul_ps(rsq31,rinv31);
2341             r31              = _mm_andnot_ps(dummy_mask,r31);
2342
2343             /* EWALD ELECTROSTATICS */
2344
2345             /* Analytical PME correction */
2346             zeta2            = _mm_mul_ps(beta2,rsq31);
2347             rinv3            = _mm_mul_ps(rinvsq31,rinv31);
2348             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2349             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2350             felec            = _mm_mul_ps(qq31,felec);
2351
2352             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
2353
2354             fscal            = felec;
2355
2356             fscal            = _mm_and_ps(fscal,cutoff_mask);
2357
2358             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2359
2360              /* Update vectorial force */
2361             fix3             = _mm_macc_ps(dx31,fscal,fix3);
2362             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
2363             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
2364
2365             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
2366             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
2367             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
2368
2369             }
2370
2371             /**************************
2372              * CALCULATE INTERACTIONS *
2373              **************************/
2374
2375             if (gmx_mm_any_lt(rsq32,rcutoff2))
2376             {
2377
2378             r32              = _mm_mul_ps(rsq32,rinv32);
2379             r32              = _mm_andnot_ps(dummy_mask,r32);
2380
2381             /* EWALD ELECTROSTATICS */
2382
2383             /* Analytical PME correction */
2384             zeta2            = _mm_mul_ps(beta2,rsq32);
2385             rinv3            = _mm_mul_ps(rinvsq32,rinv32);
2386             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2387             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2388             felec            = _mm_mul_ps(qq32,felec);
2389
2390             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
2391
2392             fscal            = felec;
2393
2394             fscal            = _mm_and_ps(fscal,cutoff_mask);
2395
2396             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2397
2398              /* Update vectorial force */
2399             fix3             = _mm_macc_ps(dx32,fscal,fix3);
2400             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
2401             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
2402
2403             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
2404             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
2405             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
2406
2407             }
2408
2409             /**************************
2410              * CALCULATE INTERACTIONS *
2411              **************************/
2412
2413             if (gmx_mm_any_lt(rsq33,rcutoff2))
2414             {
2415
2416             r33              = _mm_mul_ps(rsq33,rinv33);
2417             r33              = _mm_andnot_ps(dummy_mask,r33);
2418
2419             /* EWALD ELECTROSTATICS */
2420
2421             /* Analytical PME correction */
2422             zeta2            = _mm_mul_ps(beta2,rsq33);
2423             rinv3            = _mm_mul_ps(rinvsq33,rinv33);
2424             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2425             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2426             felec            = _mm_mul_ps(qq33,felec);
2427
2428             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
2429
2430             fscal            = felec;
2431
2432             fscal            = _mm_and_ps(fscal,cutoff_mask);
2433
2434             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2435
2436              /* Update vectorial force */
2437             fix3             = _mm_macc_ps(dx33,fscal,fix3);
2438             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
2439             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
2440
2441             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
2442             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
2443             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
2444
2445             }
2446
2447             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2448             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2449             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2450             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2451
2452             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2453                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2454                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2455
2456             /* Inner loop uses 324 flops */
2457         }
2458
2459         /* End of innermost loop */
2460
2461         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2462                                               f+i_coord_offset,fshift+i_shift_offset);
2463
2464         /* Increment number of inner iterations */
2465         inneriter                  += j_index_end - j_index_start;
2466
2467         /* Outer loop uses 24 flops */
2468     }
2469
2470     /* Increment number of outer iterations */
2471     outeriter        += nri;
2472
2473     /* Update outer/inner flops */
2474
2475     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*324);
2476 }