88b85456961424b509cb5055d12403da975435b3
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_avx_128_fma_single / nb_kernel_ElecCoul_VdwLJ_GeomW4W4_avx_128_fma_single.c
1 /*
2  * Note: this file was generated by the Gromacs avx_128_fma_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_avx_128_fma_single.h"
34 #include "kernelutil_x86_avx_128_fma_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_avx_128_fma_single
38  * Electrostatics interaction: Coulomb
39  * VdW interaction:            LennardJones
40  * Geometry:                   Water4-Water4
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_avx_128_fma_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
64     real             rcutoff_scalar;
65     real             *shiftvec,*fshift,*x,*f;
66     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
67     real             scratch[4*DIM];
68     __m128           fscal,rcutoff,rcutoff2,jidxall;
69     int              vdwioffset0;
70     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
71     int              vdwioffset1;
72     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
73     int              vdwioffset2;
74     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
75     int              vdwioffset3;
76     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
77     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
78     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
79     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
80     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
81     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
82     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
83     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
84     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
85     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
86     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
87     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
88     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
89     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
90     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
91     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
92     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
93     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
94     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
95     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
96     real             *charge;
97     int              nvdwtype;
98     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
99     int              *vdwtype;
100     real             *vdwparam;
101     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
102     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
103     __m128           dummy_mask,cutoff_mask;
104     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
105     __m128           one     = _mm_set1_ps(1.0);
106     __m128           two     = _mm_set1_ps(2.0);
107     x                = xx[0];
108     f                = ff[0];
109
110     nri              = nlist->nri;
111     iinr             = nlist->iinr;
112     jindex           = nlist->jindex;
113     jjnr             = nlist->jjnr;
114     shiftidx         = nlist->shift;
115     gid              = nlist->gid;
116     shiftvec         = fr->shift_vec[0];
117     fshift           = fr->fshift[0];
118     facel            = _mm_set1_ps(fr->epsfac);
119     charge           = mdatoms->chargeA;
120     nvdwtype         = fr->ntype;
121     vdwparam         = fr->nbfp;
122     vdwtype          = mdatoms->typeA;
123
124     /* Setup water-specific parameters */
125     inr              = nlist->iinr[0];
126     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
127     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
128     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
129     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
130
131     jq1              = _mm_set1_ps(charge[inr+1]);
132     jq2              = _mm_set1_ps(charge[inr+2]);
133     jq3              = _mm_set1_ps(charge[inr+3]);
134     vdwjidx0A        = 2*vdwtype[inr+0];
135     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
136     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
137     qq11             = _mm_mul_ps(iq1,jq1);
138     qq12             = _mm_mul_ps(iq1,jq2);
139     qq13             = _mm_mul_ps(iq1,jq3);
140     qq21             = _mm_mul_ps(iq2,jq1);
141     qq22             = _mm_mul_ps(iq2,jq2);
142     qq23             = _mm_mul_ps(iq2,jq3);
143     qq31             = _mm_mul_ps(iq3,jq1);
144     qq32             = _mm_mul_ps(iq3,jq2);
145     qq33             = _mm_mul_ps(iq3,jq3);
146
147     /* Avoid stupid compiler warnings */
148     jnrA = jnrB = jnrC = jnrD = 0;
149     j_coord_offsetA = 0;
150     j_coord_offsetB = 0;
151     j_coord_offsetC = 0;
152     j_coord_offsetD = 0;
153
154     outeriter        = 0;
155     inneriter        = 0;
156
157     for(iidx=0;iidx<4*DIM;iidx++)
158     {
159         scratch[iidx] = 0.0;
160     }
161
162     /* Start outer loop over neighborlists */
163     for(iidx=0; iidx<nri; iidx++)
164     {
165         /* Load shift vector for this list */
166         i_shift_offset   = DIM*shiftidx[iidx];
167
168         /* Load limits for loop over neighbors */
169         j_index_start    = jindex[iidx];
170         j_index_end      = jindex[iidx+1];
171
172         /* Get outer coordinate index */
173         inr              = iinr[iidx];
174         i_coord_offset   = DIM*inr;
175
176         /* Load i particle coords and add shift vector */
177         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
178                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
179
180         fix0             = _mm_setzero_ps();
181         fiy0             = _mm_setzero_ps();
182         fiz0             = _mm_setzero_ps();
183         fix1             = _mm_setzero_ps();
184         fiy1             = _mm_setzero_ps();
185         fiz1             = _mm_setzero_ps();
186         fix2             = _mm_setzero_ps();
187         fiy2             = _mm_setzero_ps();
188         fiz2             = _mm_setzero_ps();
189         fix3             = _mm_setzero_ps();
190         fiy3             = _mm_setzero_ps();
191         fiz3             = _mm_setzero_ps();
192
193         /* Reset potential sums */
194         velecsum         = _mm_setzero_ps();
195         vvdwsum          = _mm_setzero_ps();
196
197         /* Start inner kernel loop */
198         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
199         {
200
201             /* Get j neighbor index, and coordinate index */
202             jnrA             = jjnr[jidx];
203             jnrB             = jjnr[jidx+1];
204             jnrC             = jjnr[jidx+2];
205             jnrD             = jjnr[jidx+3];
206             j_coord_offsetA  = DIM*jnrA;
207             j_coord_offsetB  = DIM*jnrB;
208             j_coord_offsetC  = DIM*jnrC;
209             j_coord_offsetD  = DIM*jnrD;
210
211             /* load j atom coordinates */
212             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
213                                               x+j_coord_offsetC,x+j_coord_offsetD,
214                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
215                                               &jy2,&jz2,&jx3,&jy3,&jz3);
216
217             /* Calculate displacement vector */
218             dx00             = _mm_sub_ps(ix0,jx0);
219             dy00             = _mm_sub_ps(iy0,jy0);
220             dz00             = _mm_sub_ps(iz0,jz0);
221             dx11             = _mm_sub_ps(ix1,jx1);
222             dy11             = _mm_sub_ps(iy1,jy1);
223             dz11             = _mm_sub_ps(iz1,jz1);
224             dx12             = _mm_sub_ps(ix1,jx2);
225             dy12             = _mm_sub_ps(iy1,jy2);
226             dz12             = _mm_sub_ps(iz1,jz2);
227             dx13             = _mm_sub_ps(ix1,jx3);
228             dy13             = _mm_sub_ps(iy1,jy3);
229             dz13             = _mm_sub_ps(iz1,jz3);
230             dx21             = _mm_sub_ps(ix2,jx1);
231             dy21             = _mm_sub_ps(iy2,jy1);
232             dz21             = _mm_sub_ps(iz2,jz1);
233             dx22             = _mm_sub_ps(ix2,jx2);
234             dy22             = _mm_sub_ps(iy2,jy2);
235             dz22             = _mm_sub_ps(iz2,jz2);
236             dx23             = _mm_sub_ps(ix2,jx3);
237             dy23             = _mm_sub_ps(iy2,jy3);
238             dz23             = _mm_sub_ps(iz2,jz3);
239             dx31             = _mm_sub_ps(ix3,jx1);
240             dy31             = _mm_sub_ps(iy3,jy1);
241             dz31             = _mm_sub_ps(iz3,jz1);
242             dx32             = _mm_sub_ps(ix3,jx2);
243             dy32             = _mm_sub_ps(iy3,jy2);
244             dz32             = _mm_sub_ps(iz3,jz2);
245             dx33             = _mm_sub_ps(ix3,jx3);
246             dy33             = _mm_sub_ps(iy3,jy3);
247             dz33             = _mm_sub_ps(iz3,jz3);
248
249             /* Calculate squared distance and things based on it */
250             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
251             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
252             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
253             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
254             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
255             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
256             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
257             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
258             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
259             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
260
261             rinv11           = gmx_mm_invsqrt_ps(rsq11);
262             rinv12           = gmx_mm_invsqrt_ps(rsq12);
263             rinv13           = gmx_mm_invsqrt_ps(rsq13);
264             rinv21           = gmx_mm_invsqrt_ps(rsq21);
265             rinv22           = gmx_mm_invsqrt_ps(rsq22);
266             rinv23           = gmx_mm_invsqrt_ps(rsq23);
267             rinv31           = gmx_mm_invsqrt_ps(rsq31);
268             rinv32           = gmx_mm_invsqrt_ps(rsq32);
269             rinv33           = gmx_mm_invsqrt_ps(rsq33);
270
271             rinvsq00         = gmx_mm_inv_ps(rsq00);
272             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
273             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
274             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
275             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
276             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
277             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
278             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
279             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
280             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
281
282             fjx0             = _mm_setzero_ps();
283             fjy0             = _mm_setzero_ps();
284             fjz0             = _mm_setzero_ps();
285             fjx1             = _mm_setzero_ps();
286             fjy1             = _mm_setzero_ps();
287             fjz1             = _mm_setzero_ps();
288             fjx2             = _mm_setzero_ps();
289             fjy2             = _mm_setzero_ps();
290             fjz2             = _mm_setzero_ps();
291             fjx3             = _mm_setzero_ps();
292             fjy3             = _mm_setzero_ps();
293             fjz3             = _mm_setzero_ps();
294
295             /**************************
296              * CALCULATE INTERACTIONS *
297              **************************/
298
299             /* LENNARD-JONES DISPERSION/REPULSION */
300
301             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
302             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
303             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
304             vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
305             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
306
307             /* Update potential sum for this i atom from the interaction with this j atom. */
308             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
309
310             fscal            = fvdw;
311
312              /* Update vectorial force */
313             fix0             = _mm_macc_ps(dx00,fscal,fix0);
314             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
315             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
316
317             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
318             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
319             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
320
321             /**************************
322              * CALCULATE INTERACTIONS *
323              **************************/
324
325             /* COULOMB ELECTROSTATICS */
326             velec            = _mm_mul_ps(qq11,rinv11);
327             felec            = _mm_mul_ps(velec,rinvsq11);
328
329             /* Update potential sum for this i atom from the interaction with this j atom. */
330             velecsum         = _mm_add_ps(velecsum,velec);
331
332             fscal            = felec;
333
334              /* Update vectorial force */
335             fix1             = _mm_macc_ps(dx11,fscal,fix1);
336             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
337             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
338
339             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
340             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
341             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
342
343             /**************************
344              * CALCULATE INTERACTIONS *
345              **************************/
346
347             /* COULOMB ELECTROSTATICS */
348             velec            = _mm_mul_ps(qq12,rinv12);
349             felec            = _mm_mul_ps(velec,rinvsq12);
350
351             /* Update potential sum for this i atom from the interaction with this j atom. */
352             velecsum         = _mm_add_ps(velecsum,velec);
353
354             fscal            = felec;
355
356              /* Update vectorial force */
357             fix1             = _mm_macc_ps(dx12,fscal,fix1);
358             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
359             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
360
361             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
362             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
363             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
364
365             /**************************
366              * CALCULATE INTERACTIONS *
367              **************************/
368
369             /* COULOMB ELECTROSTATICS */
370             velec            = _mm_mul_ps(qq13,rinv13);
371             felec            = _mm_mul_ps(velec,rinvsq13);
372
373             /* Update potential sum for this i atom from the interaction with this j atom. */
374             velecsum         = _mm_add_ps(velecsum,velec);
375
376             fscal            = felec;
377
378              /* Update vectorial force */
379             fix1             = _mm_macc_ps(dx13,fscal,fix1);
380             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
381             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
382
383             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
384             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
385             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
386
387             /**************************
388              * CALCULATE INTERACTIONS *
389              **************************/
390
391             /* COULOMB ELECTROSTATICS */
392             velec            = _mm_mul_ps(qq21,rinv21);
393             felec            = _mm_mul_ps(velec,rinvsq21);
394
395             /* Update potential sum for this i atom from the interaction with this j atom. */
396             velecsum         = _mm_add_ps(velecsum,velec);
397
398             fscal            = felec;
399
400              /* Update vectorial force */
401             fix2             = _mm_macc_ps(dx21,fscal,fix2);
402             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
403             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
404
405             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
406             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
407             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
408
409             /**************************
410              * CALCULATE INTERACTIONS *
411              **************************/
412
413             /* COULOMB ELECTROSTATICS */
414             velec            = _mm_mul_ps(qq22,rinv22);
415             felec            = _mm_mul_ps(velec,rinvsq22);
416
417             /* Update potential sum for this i atom from the interaction with this j atom. */
418             velecsum         = _mm_add_ps(velecsum,velec);
419
420             fscal            = felec;
421
422              /* Update vectorial force */
423             fix2             = _mm_macc_ps(dx22,fscal,fix2);
424             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
425             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
426
427             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
428             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
429             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
430
431             /**************************
432              * CALCULATE INTERACTIONS *
433              **************************/
434
435             /* COULOMB ELECTROSTATICS */
436             velec            = _mm_mul_ps(qq23,rinv23);
437             felec            = _mm_mul_ps(velec,rinvsq23);
438
439             /* Update potential sum for this i atom from the interaction with this j atom. */
440             velecsum         = _mm_add_ps(velecsum,velec);
441
442             fscal            = felec;
443
444              /* Update vectorial force */
445             fix2             = _mm_macc_ps(dx23,fscal,fix2);
446             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
447             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
448
449             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
450             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
451             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
452
453             /**************************
454              * CALCULATE INTERACTIONS *
455              **************************/
456
457             /* COULOMB ELECTROSTATICS */
458             velec            = _mm_mul_ps(qq31,rinv31);
459             felec            = _mm_mul_ps(velec,rinvsq31);
460
461             /* Update potential sum for this i atom from the interaction with this j atom. */
462             velecsum         = _mm_add_ps(velecsum,velec);
463
464             fscal            = felec;
465
466              /* Update vectorial force */
467             fix3             = _mm_macc_ps(dx31,fscal,fix3);
468             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
469             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
470
471             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
472             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
473             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
474
475             /**************************
476              * CALCULATE INTERACTIONS *
477              **************************/
478
479             /* COULOMB ELECTROSTATICS */
480             velec            = _mm_mul_ps(qq32,rinv32);
481             felec            = _mm_mul_ps(velec,rinvsq32);
482
483             /* Update potential sum for this i atom from the interaction with this j atom. */
484             velecsum         = _mm_add_ps(velecsum,velec);
485
486             fscal            = felec;
487
488              /* Update vectorial force */
489             fix3             = _mm_macc_ps(dx32,fscal,fix3);
490             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
491             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
492
493             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
494             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
495             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
496
497             /**************************
498              * CALCULATE INTERACTIONS *
499              **************************/
500
501             /* COULOMB ELECTROSTATICS */
502             velec            = _mm_mul_ps(qq33,rinv33);
503             felec            = _mm_mul_ps(velec,rinvsq33);
504
505             /* Update potential sum for this i atom from the interaction with this j atom. */
506             velecsum         = _mm_add_ps(velecsum,velec);
507
508             fscal            = felec;
509
510              /* Update vectorial force */
511             fix3             = _mm_macc_ps(dx33,fscal,fix3);
512             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
513             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
514
515             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
516             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
517             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
518
519             fjptrA             = f+j_coord_offsetA;
520             fjptrB             = f+j_coord_offsetB;
521             fjptrC             = f+j_coord_offsetC;
522             fjptrD             = f+j_coord_offsetD;
523
524             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
525                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
526                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
527
528             /* Inner loop uses 317 flops */
529         }
530
531         if(jidx<j_index_end)
532         {
533
534             /* Get j neighbor index, and coordinate index */
535             jnrlistA         = jjnr[jidx];
536             jnrlistB         = jjnr[jidx+1];
537             jnrlistC         = jjnr[jidx+2];
538             jnrlistD         = jjnr[jidx+3];
539             /* Sign of each element will be negative for non-real atoms.
540              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
541              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
542              */
543             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
544             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
545             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
546             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
547             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
548             j_coord_offsetA  = DIM*jnrA;
549             j_coord_offsetB  = DIM*jnrB;
550             j_coord_offsetC  = DIM*jnrC;
551             j_coord_offsetD  = DIM*jnrD;
552
553             /* load j atom coordinates */
554             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
555                                               x+j_coord_offsetC,x+j_coord_offsetD,
556                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
557                                               &jy2,&jz2,&jx3,&jy3,&jz3);
558
559             /* Calculate displacement vector */
560             dx00             = _mm_sub_ps(ix0,jx0);
561             dy00             = _mm_sub_ps(iy0,jy0);
562             dz00             = _mm_sub_ps(iz0,jz0);
563             dx11             = _mm_sub_ps(ix1,jx1);
564             dy11             = _mm_sub_ps(iy1,jy1);
565             dz11             = _mm_sub_ps(iz1,jz1);
566             dx12             = _mm_sub_ps(ix1,jx2);
567             dy12             = _mm_sub_ps(iy1,jy2);
568             dz12             = _mm_sub_ps(iz1,jz2);
569             dx13             = _mm_sub_ps(ix1,jx3);
570             dy13             = _mm_sub_ps(iy1,jy3);
571             dz13             = _mm_sub_ps(iz1,jz3);
572             dx21             = _mm_sub_ps(ix2,jx1);
573             dy21             = _mm_sub_ps(iy2,jy1);
574             dz21             = _mm_sub_ps(iz2,jz1);
575             dx22             = _mm_sub_ps(ix2,jx2);
576             dy22             = _mm_sub_ps(iy2,jy2);
577             dz22             = _mm_sub_ps(iz2,jz2);
578             dx23             = _mm_sub_ps(ix2,jx3);
579             dy23             = _mm_sub_ps(iy2,jy3);
580             dz23             = _mm_sub_ps(iz2,jz3);
581             dx31             = _mm_sub_ps(ix3,jx1);
582             dy31             = _mm_sub_ps(iy3,jy1);
583             dz31             = _mm_sub_ps(iz3,jz1);
584             dx32             = _mm_sub_ps(ix3,jx2);
585             dy32             = _mm_sub_ps(iy3,jy2);
586             dz32             = _mm_sub_ps(iz3,jz2);
587             dx33             = _mm_sub_ps(ix3,jx3);
588             dy33             = _mm_sub_ps(iy3,jy3);
589             dz33             = _mm_sub_ps(iz3,jz3);
590
591             /* Calculate squared distance and things based on it */
592             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
593             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
594             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
595             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
596             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
597             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
598             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
599             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
600             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
601             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
602
603             rinv11           = gmx_mm_invsqrt_ps(rsq11);
604             rinv12           = gmx_mm_invsqrt_ps(rsq12);
605             rinv13           = gmx_mm_invsqrt_ps(rsq13);
606             rinv21           = gmx_mm_invsqrt_ps(rsq21);
607             rinv22           = gmx_mm_invsqrt_ps(rsq22);
608             rinv23           = gmx_mm_invsqrt_ps(rsq23);
609             rinv31           = gmx_mm_invsqrt_ps(rsq31);
610             rinv32           = gmx_mm_invsqrt_ps(rsq32);
611             rinv33           = gmx_mm_invsqrt_ps(rsq33);
612
613             rinvsq00         = gmx_mm_inv_ps(rsq00);
614             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
615             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
616             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
617             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
618             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
619             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
620             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
621             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
622             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
623
624             fjx0             = _mm_setzero_ps();
625             fjy0             = _mm_setzero_ps();
626             fjz0             = _mm_setzero_ps();
627             fjx1             = _mm_setzero_ps();
628             fjy1             = _mm_setzero_ps();
629             fjz1             = _mm_setzero_ps();
630             fjx2             = _mm_setzero_ps();
631             fjy2             = _mm_setzero_ps();
632             fjz2             = _mm_setzero_ps();
633             fjx3             = _mm_setzero_ps();
634             fjy3             = _mm_setzero_ps();
635             fjz3             = _mm_setzero_ps();
636
637             /**************************
638              * CALCULATE INTERACTIONS *
639              **************************/
640
641             /* LENNARD-JONES DISPERSION/REPULSION */
642
643             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
644             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
645             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
646             vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
647             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
648
649             /* Update potential sum for this i atom from the interaction with this j atom. */
650             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
651             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
652
653             fscal            = fvdw;
654
655             fscal            = _mm_andnot_ps(dummy_mask,fscal);
656
657              /* Update vectorial force */
658             fix0             = _mm_macc_ps(dx00,fscal,fix0);
659             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
660             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
661
662             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
663             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
664             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
665
666             /**************************
667              * CALCULATE INTERACTIONS *
668              **************************/
669
670             /* COULOMB ELECTROSTATICS */
671             velec            = _mm_mul_ps(qq11,rinv11);
672             felec            = _mm_mul_ps(velec,rinvsq11);
673
674             /* Update potential sum for this i atom from the interaction with this j atom. */
675             velec            = _mm_andnot_ps(dummy_mask,velec);
676             velecsum         = _mm_add_ps(velecsum,velec);
677
678             fscal            = felec;
679
680             fscal            = _mm_andnot_ps(dummy_mask,fscal);
681
682              /* Update vectorial force */
683             fix1             = _mm_macc_ps(dx11,fscal,fix1);
684             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
685             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
686
687             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
688             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
689             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
690
691             /**************************
692              * CALCULATE INTERACTIONS *
693              **************************/
694
695             /* COULOMB ELECTROSTATICS */
696             velec            = _mm_mul_ps(qq12,rinv12);
697             felec            = _mm_mul_ps(velec,rinvsq12);
698
699             /* Update potential sum for this i atom from the interaction with this j atom. */
700             velec            = _mm_andnot_ps(dummy_mask,velec);
701             velecsum         = _mm_add_ps(velecsum,velec);
702
703             fscal            = felec;
704
705             fscal            = _mm_andnot_ps(dummy_mask,fscal);
706
707              /* Update vectorial force */
708             fix1             = _mm_macc_ps(dx12,fscal,fix1);
709             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
710             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
711
712             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
713             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
714             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
715
716             /**************************
717              * CALCULATE INTERACTIONS *
718              **************************/
719
720             /* COULOMB ELECTROSTATICS */
721             velec            = _mm_mul_ps(qq13,rinv13);
722             felec            = _mm_mul_ps(velec,rinvsq13);
723
724             /* Update potential sum for this i atom from the interaction with this j atom. */
725             velec            = _mm_andnot_ps(dummy_mask,velec);
726             velecsum         = _mm_add_ps(velecsum,velec);
727
728             fscal            = felec;
729
730             fscal            = _mm_andnot_ps(dummy_mask,fscal);
731
732              /* Update vectorial force */
733             fix1             = _mm_macc_ps(dx13,fscal,fix1);
734             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
735             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
736
737             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
738             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
739             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
740
741             /**************************
742              * CALCULATE INTERACTIONS *
743              **************************/
744
745             /* COULOMB ELECTROSTATICS */
746             velec            = _mm_mul_ps(qq21,rinv21);
747             felec            = _mm_mul_ps(velec,rinvsq21);
748
749             /* Update potential sum for this i atom from the interaction with this j atom. */
750             velec            = _mm_andnot_ps(dummy_mask,velec);
751             velecsum         = _mm_add_ps(velecsum,velec);
752
753             fscal            = felec;
754
755             fscal            = _mm_andnot_ps(dummy_mask,fscal);
756
757              /* Update vectorial force */
758             fix2             = _mm_macc_ps(dx21,fscal,fix2);
759             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
760             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
761
762             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
763             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
764             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
765
766             /**************************
767              * CALCULATE INTERACTIONS *
768              **************************/
769
770             /* COULOMB ELECTROSTATICS */
771             velec            = _mm_mul_ps(qq22,rinv22);
772             felec            = _mm_mul_ps(velec,rinvsq22);
773
774             /* Update potential sum for this i atom from the interaction with this j atom. */
775             velec            = _mm_andnot_ps(dummy_mask,velec);
776             velecsum         = _mm_add_ps(velecsum,velec);
777
778             fscal            = felec;
779
780             fscal            = _mm_andnot_ps(dummy_mask,fscal);
781
782              /* Update vectorial force */
783             fix2             = _mm_macc_ps(dx22,fscal,fix2);
784             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
785             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
786
787             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
788             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
789             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
790
791             /**************************
792              * CALCULATE INTERACTIONS *
793              **************************/
794
795             /* COULOMB ELECTROSTATICS */
796             velec            = _mm_mul_ps(qq23,rinv23);
797             felec            = _mm_mul_ps(velec,rinvsq23);
798
799             /* Update potential sum for this i atom from the interaction with this j atom. */
800             velec            = _mm_andnot_ps(dummy_mask,velec);
801             velecsum         = _mm_add_ps(velecsum,velec);
802
803             fscal            = felec;
804
805             fscal            = _mm_andnot_ps(dummy_mask,fscal);
806
807              /* Update vectorial force */
808             fix2             = _mm_macc_ps(dx23,fscal,fix2);
809             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
810             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
811
812             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
813             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
814             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
815
816             /**************************
817              * CALCULATE INTERACTIONS *
818              **************************/
819
820             /* COULOMB ELECTROSTATICS */
821             velec            = _mm_mul_ps(qq31,rinv31);
822             felec            = _mm_mul_ps(velec,rinvsq31);
823
824             /* Update potential sum for this i atom from the interaction with this j atom. */
825             velec            = _mm_andnot_ps(dummy_mask,velec);
826             velecsum         = _mm_add_ps(velecsum,velec);
827
828             fscal            = felec;
829
830             fscal            = _mm_andnot_ps(dummy_mask,fscal);
831
832              /* Update vectorial force */
833             fix3             = _mm_macc_ps(dx31,fscal,fix3);
834             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
835             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
836
837             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
838             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
839             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
840
841             /**************************
842              * CALCULATE INTERACTIONS *
843              **************************/
844
845             /* COULOMB ELECTROSTATICS */
846             velec            = _mm_mul_ps(qq32,rinv32);
847             felec            = _mm_mul_ps(velec,rinvsq32);
848
849             /* Update potential sum for this i atom from the interaction with this j atom. */
850             velec            = _mm_andnot_ps(dummy_mask,velec);
851             velecsum         = _mm_add_ps(velecsum,velec);
852
853             fscal            = felec;
854
855             fscal            = _mm_andnot_ps(dummy_mask,fscal);
856
857              /* Update vectorial force */
858             fix3             = _mm_macc_ps(dx32,fscal,fix3);
859             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
860             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
861
862             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
863             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
864             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
865
866             /**************************
867              * CALCULATE INTERACTIONS *
868              **************************/
869
870             /* COULOMB ELECTROSTATICS */
871             velec            = _mm_mul_ps(qq33,rinv33);
872             felec            = _mm_mul_ps(velec,rinvsq33);
873
874             /* Update potential sum for this i atom from the interaction with this j atom. */
875             velec            = _mm_andnot_ps(dummy_mask,velec);
876             velecsum         = _mm_add_ps(velecsum,velec);
877
878             fscal            = felec;
879
880             fscal            = _mm_andnot_ps(dummy_mask,fscal);
881
882              /* Update vectorial force */
883             fix3             = _mm_macc_ps(dx33,fscal,fix3);
884             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
885             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
886
887             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
888             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
889             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
890
891             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
892             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
893             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
894             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
895
896             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
897                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
898                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
899
900             /* Inner loop uses 317 flops */
901         }
902
903         /* End of innermost loop */
904
905         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
906                                               f+i_coord_offset,fshift+i_shift_offset);
907
908         ggid                        = gid[iidx];
909         /* Update potential energies */
910         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
911         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
912
913         /* Increment number of inner iterations */
914         inneriter                  += j_index_end - j_index_start;
915
916         /* Outer loop uses 26 flops */
917     }
918
919     /* Increment number of outer iterations */
920     outeriter        += nri;
921
922     /* Update outer/inner flops */
923
924     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*317);
925 }
926 /*
927  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_avx_128_fma_single
928  * Electrostatics interaction: Coulomb
929  * VdW interaction:            LennardJones
930  * Geometry:                   Water4-Water4
931  * Calculate force/pot:        Force
932  */
933 void
934 nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_avx_128_fma_single
935                     (t_nblist * gmx_restrict                nlist,
936                      rvec * gmx_restrict                    xx,
937                      rvec * gmx_restrict                    ff,
938                      t_forcerec * gmx_restrict              fr,
939                      t_mdatoms * gmx_restrict               mdatoms,
940                      nb_kernel_data_t * gmx_restrict        kernel_data,
941                      t_nrnb * gmx_restrict                  nrnb)
942 {
943     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
944      * just 0 for non-waters.
945      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
946      * jnr indices corresponding to data put in the four positions in the SIMD register.
947      */
948     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
949     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
950     int              jnrA,jnrB,jnrC,jnrD;
951     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
952     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
953     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
954     real             rcutoff_scalar;
955     real             *shiftvec,*fshift,*x,*f;
956     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
957     real             scratch[4*DIM];
958     __m128           fscal,rcutoff,rcutoff2,jidxall;
959     int              vdwioffset0;
960     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
961     int              vdwioffset1;
962     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
963     int              vdwioffset2;
964     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
965     int              vdwioffset3;
966     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
967     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
968     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
969     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
970     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
971     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
972     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
973     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
974     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
975     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
976     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
977     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
978     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
979     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
980     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
981     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
982     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
983     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
984     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
985     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
986     real             *charge;
987     int              nvdwtype;
988     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
989     int              *vdwtype;
990     real             *vdwparam;
991     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
992     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
993     __m128           dummy_mask,cutoff_mask;
994     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
995     __m128           one     = _mm_set1_ps(1.0);
996     __m128           two     = _mm_set1_ps(2.0);
997     x                = xx[0];
998     f                = ff[0];
999
1000     nri              = nlist->nri;
1001     iinr             = nlist->iinr;
1002     jindex           = nlist->jindex;
1003     jjnr             = nlist->jjnr;
1004     shiftidx         = nlist->shift;
1005     gid              = nlist->gid;
1006     shiftvec         = fr->shift_vec[0];
1007     fshift           = fr->fshift[0];
1008     facel            = _mm_set1_ps(fr->epsfac);
1009     charge           = mdatoms->chargeA;
1010     nvdwtype         = fr->ntype;
1011     vdwparam         = fr->nbfp;
1012     vdwtype          = mdatoms->typeA;
1013
1014     /* Setup water-specific parameters */
1015     inr              = nlist->iinr[0];
1016     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1017     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1018     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1019     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1020
1021     jq1              = _mm_set1_ps(charge[inr+1]);
1022     jq2              = _mm_set1_ps(charge[inr+2]);
1023     jq3              = _mm_set1_ps(charge[inr+3]);
1024     vdwjidx0A        = 2*vdwtype[inr+0];
1025     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1026     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1027     qq11             = _mm_mul_ps(iq1,jq1);
1028     qq12             = _mm_mul_ps(iq1,jq2);
1029     qq13             = _mm_mul_ps(iq1,jq3);
1030     qq21             = _mm_mul_ps(iq2,jq1);
1031     qq22             = _mm_mul_ps(iq2,jq2);
1032     qq23             = _mm_mul_ps(iq2,jq3);
1033     qq31             = _mm_mul_ps(iq3,jq1);
1034     qq32             = _mm_mul_ps(iq3,jq2);
1035     qq33             = _mm_mul_ps(iq3,jq3);
1036
1037     /* Avoid stupid compiler warnings */
1038     jnrA = jnrB = jnrC = jnrD = 0;
1039     j_coord_offsetA = 0;
1040     j_coord_offsetB = 0;
1041     j_coord_offsetC = 0;
1042     j_coord_offsetD = 0;
1043
1044     outeriter        = 0;
1045     inneriter        = 0;
1046
1047     for(iidx=0;iidx<4*DIM;iidx++)
1048     {
1049         scratch[iidx] = 0.0;
1050     }
1051
1052     /* Start outer loop over neighborlists */
1053     for(iidx=0; iidx<nri; iidx++)
1054     {
1055         /* Load shift vector for this list */
1056         i_shift_offset   = DIM*shiftidx[iidx];
1057
1058         /* Load limits for loop over neighbors */
1059         j_index_start    = jindex[iidx];
1060         j_index_end      = jindex[iidx+1];
1061
1062         /* Get outer coordinate index */
1063         inr              = iinr[iidx];
1064         i_coord_offset   = DIM*inr;
1065
1066         /* Load i particle coords and add shift vector */
1067         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1068                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1069
1070         fix0             = _mm_setzero_ps();
1071         fiy0             = _mm_setzero_ps();
1072         fiz0             = _mm_setzero_ps();
1073         fix1             = _mm_setzero_ps();
1074         fiy1             = _mm_setzero_ps();
1075         fiz1             = _mm_setzero_ps();
1076         fix2             = _mm_setzero_ps();
1077         fiy2             = _mm_setzero_ps();
1078         fiz2             = _mm_setzero_ps();
1079         fix3             = _mm_setzero_ps();
1080         fiy3             = _mm_setzero_ps();
1081         fiz3             = _mm_setzero_ps();
1082
1083         /* Start inner kernel loop */
1084         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1085         {
1086
1087             /* Get j neighbor index, and coordinate index */
1088             jnrA             = jjnr[jidx];
1089             jnrB             = jjnr[jidx+1];
1090             jnrC             = jjnr[jidx+2];
1091             jnrD             = jjnr[jidx+3];
1092             j_coord_offsetA  = DIM*jnrA;
1093             j_coord_offsetB  = DIM*jnrB;
1094             j_coord_offsetC  = DIM*jnrC;
1095             j_coord_offsetD  = DIM*jnrD;
1096
1097             /* load j atom coordinates */
1098             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1099                                               x+j_coord_offsetC,x+j_coord_offsetD,
1100                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1101                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1102
1103             /* Calculate displacement vector */
1104             dx00             = _mm_sub_ps(ix0,jx0);
1105             dy00             = _mm_sub_ps(iy0,jy0);
1106             dz00             = _mm_sub_ps(iz0,jz0);
1107             dx11             = _mm_sub_ps(ix1,jx1);
1108             dy11             = _mm_sub_ps(iy1,jy1);
1109             dz11             = _mm_sub_ps(iz1,jz1);
1110             dx12             = _mm_sub_ps(ix1,jx2);
1111             dy12             = _mm_sub_ps(iy1,jy2);
1112             dz12             = _mm_sub_ps(iz1,jz2);
1113             dx13             = _mm_sub_ps(ix1,jx3);
1114             dy13             = _mm_sub_ps(iy1,jy3);
1115             dz13             = _mm_sub_ps(iz1,jz3);
1116             dx21             = _mm_sub_ps(ix2,jx1);
1117             dy21             = _mm_sub_ps(iy2,jy1);
1118             dz21             = _mm_sub_ps(iz2,jz1);
1119             dx22             = _mm_sub_ps(ix2,jx2);
1120             dy22             = _mm_sub_ps(iy2,jy2);
1121             dz22             = _mm_sub_ps(iz2,jz2);
1122             dx23             = _mm_sub_ps(ix2,jx3);
1123             dy23             = _mm_sub_ps(iy2,jy3);
1124             dz23             = _mm_sub_ps(iz2,jz3);
1125             dx31             = _mm_sub_ps(ix3,jx1);
1126             dy31             = _mm_sub_ps(iy3,jy1);
1127             dz31             = _mm_sub_ps(iz3,jz1);
1128             dx32             = _mm_sub_ps(ix3,jx2);
1129             dy32             = _mm_sub_ps(iy3,jy2);
1130             dz32             = _mm_sub_ps(iz3,jz2);
1131             dx33             = _mm_sub_ps(ix3,jx3);
1132             dy33             = _mm_sub_ps(iy3,jy3);
1133             dz33             = _mm_sub_ps(iz3,jz3);
1134
1135             /* Calculate squared distance and things based on it */
1136             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1137             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1138             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1139             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1140             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1141             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1142             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1143             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1144             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1145             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1146
1147             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1148             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1149             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1150             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1151             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1152             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1153             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1154             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1155             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1156
1157             rinvsq00         = gmx_mm_inv_ps(rsq00);
1158             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1159             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1160             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1161             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1162             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1163             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1164             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1165             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1166             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1167
1168             fjx0             = _mm_setzero_ps();
1169             fjy0             = _mm_setzero_ps();
1170             fjz0             = _mm_setzero_ps();
1171             fjx1             = _mm_setzero_ps();
1172             fjy1             = _mm_setzero_ps();
1173             fjz1             = _mm_setzero_ps();
1174             fjx2             = _mm_setzero_ps();
1175             fjy2             = _mm_setzero_ps();
1176             fjz2             = _mm_setzero_ps();
1177             fjx3             = _mm_setzero_ps();
1178             fjy3             = _mm_setzero_ps();
1179             fjz3             = _mm_setzero_ps();
1180
1181             /**************************
1182              * CALCULATE INTERACTIONS *
1183              **************************/
1184
1185             /* LENNARD-JONES DISPERSION/REPULSION */
1186
1187             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1188             fvdw             = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1189
1190             fscal            = fvdw;
1191
1192              /* Update vectorial force */
1193             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1194             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1195             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1196
1197             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1198             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1199             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1200
1201             /**************************
1202              * CALCULATE INTERACTIONS *
1203              **************************/
1204
1205             /* COULOMB ELECTROSTATICS */
1206             velec            = _mm_mul_ps(qq11,rinv11);
1207             felec            = _mm_mul_ps(velec,rinvsq11);
1208
1209             fscal            = felec;
1210
1211              /* Update vectorial force */
1212             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1213             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1214             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1215
1216             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1217             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1218             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1219
1220             /**************************
1221              * CALCULATE INTERACTIONS *
1222              **************************/
1223
1224             /* COULOMB ELECTROSTATICS */
1225             velec            = _mm_mul_ps(qq12,rinv12);
1226             felec            = _mm_mul_ps(velec,rinvsq12);
1227
1228             fscal            = felec;
1229
1230              /* Update vectorial force */
1231             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1232             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1233             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1234
1235             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1236             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1237             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1238
1239             /**************************
1240              * CALCULATE INTERACTIONS *
1241              **************************/
1242
1243             /* COULOMB ELECTROSTATICS */
1244             velec            = _mm_mul_ps(qq13,rinv13);
1245             felec            = _mm_mul_ps(velec,rinvsq13);
1246
1247             fscal            = felec;
1248
1249              /* Update vectorial force */
1250             fix1             = _mm_macc_ps(dx13,fscal,fix1);
1251             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
1252             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
1253
1254             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
1255             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
1256             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
1257
1258             /**************************
1259              * CALCULATE INTERACTIONS *
1260              **************************/
1261
1262             /* COULOMB ELECTROSTATICS */
1263             velec            = _mm_mul_ps(qq21,rinv21);
1264             felec            = _mm_mul_ps(velec,rinvsq21);
1265
1266             fscal            = felec;
1267
1268              /* Update vectorial force */
1269             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1270             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1271             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1272
1273             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1274             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1275             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1276
1277             /**************************
1278              * CALCULATE INTERACTIONS *
1279              **************************/
1280
1281             /* COULOMB ELECTROSTATICS */
1282             velec            = _mm_mul_ps(qq22,rinv22);
1283             felec            = _mm_mul_ps(velec,rinvsq22);
1284
1285             fscal            = felec;
1286
1287              /* Update vectorial force */
1288             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1289             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1290             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1291
1292             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1293             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1294             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1295
1296             /**************************
1297              * CALCULATE INTERACTIONS *
1298              **************************/
1299
1300             /* COULOMB ELECTROSTATICS */
1301             velec            = _mm_mul_ps(qq23,rinv23);
1302             felec            = _mm_mul_ps(velec,rinvsq23);
1303
1304             fscal            = felec;
1305
1306              /* Update vectorial force */
1307             fix2             = _mm_macc_ps(dx23,fscal,fix2);
1308             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
1309             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
1310
1311             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
1312             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
1313             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
1314
1315             /**************************
1316              * CALCULATE INTERACTIONS *
1317              **************************/
1318
1319             /* COULOMB ELECTROSTATICS */
1320             velec            = _mm_mul_ps(qq31,rinv31);
1321             felec            = _mm_mul_ps(velec,rinvsq31);
1322
1323             fscal            = felec;
1324
1325              /* Update vectorial force */
1326             fix3             = _mm_macc_ps(dx31,fscal,fix3);
1327             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
1328             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
1329
1330             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
1331             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
1332             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
1333
1334             /**************************
1335              * CALCULATE INTERACTIONS *
1336              **************************/
1337
1338             /* COULOMB ELECTROSTATICS */
1339             velec            = _mm_mul_ps(qq32,rinv32);
1340             felec            = _mm_mul_ps(velec,rinvsq32);
1341
1342             fscal            = felec;
1343
1344              /* Update vectorial force */
1345             fix3             = _mm_macc_ps(dx32,fscal,fix3);
1346             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
1347             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
1348
1349             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
1350             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
1351             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
1352
1353             /**************************
1354              * CALCULATE INTERACTIONS *
1355              **************************/
1356
1357             /* COULOMB ELECTROSTATICS */
1358             velec            = _mm_mul_ps(qq33,rinv33);
1359             felec            = _mm_mul_ps(velec,rinvsq33);
1360
1361             fscal            = felec;
1362
1363              /* Update vectorial force */
1364             fix3             = _mm_macc_ps(dx33,fscal,fix3);
1365             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
1366             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
1367
1368             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
1369             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
1370             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
1371
1372             fjptrA             = f+j_coord_offsetA;
1373             fjptrB             = f+j_coord_offsetB;
1374             fjptrC             = f+j_coord_offsetC;
1375             fjptrD             = f+j_coord_offsetD;
1376
1377             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1378                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1379                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1380
1381             /* Inner loop uses 303 flops */
1382         }
1383
1384         if(jidx<j_index_end)
1385         {
1386
1387             /* Get j neighbor index, and coordinate index */
1388             jnrlistA         = jjnr[jidx];
1389             jnrlistB         = jjnr[jidx+1];
1390             jnrlistC         = jjnr[jidx+2];
1391             jnrlistD         = jjnr[jidx+3];
1392             /* Sign of each element will be negative for non-real atoms.
1393              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1394              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1395              */
1396             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1397             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1398             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1399             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1400             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1401             j_coord_offsetA  = DIM*jnrA;
1402             j_coord_offsetB  = DIM*jnrB;
1403             j_coord_offsetC  = DIM*jnrC;
1404             j_coord_offsetD  = DIM*jnrD;
1405
1406             /* load j atom coordinates */
1407             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1408                                               x+j_coord_offsetC,x+j_coord_offsetD,
1409                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1410                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1411
1412             /* Calculate displacement vector */
1413             dx00             = _mm_sub_ps(ix0,jx0);
1414             dy00             = _mm_sub_ps(iy0,jy0);
1415             dz00             = _mm_sub_ps(iz0,jz0);
1416             dx11             = _mm_sub_ps(ix1,jx1);
1417             dy11             = _mm_sub_ps(iy1,jy1);
1418             dz11             = _mm_sub_ps(iz1,jz1);
1419             dx12             = _mm_sub_ps(ix1,jx2);
1420             dy12             = _mm_sub_ps(iy1,jy2);
1421             dz12             = _mm_sub_ps(iz1,jz2);
1422             dx13             = _mm_sub_ps(ix1,jx3);
1423             dy13             = _mm_sub_ps(iy1,jy3);
1424             dz13             = _mm_sub_ps(iz1,jz3);
1425             dx21             = _mm_sub_ps(ix2,jx1);
1426             dy21             = _mm_sub_ps(iy2,jy1);
1427             dz21             = _mm_sub_ps(iz2,jz1);
1428             dx22             = _mm_sub_ps(ix2,jx2);
1429             dy22             = _mm_sub_ps(iy2,jy2);
1430             dz22             = _mm_sub_ps(iz2,jz2);
1431             dx23             = _mm_sub_ps(ix2,jx3);
1432             dy23             = _mm_sub_ps(iy2,jy3);
1433             dz23             = _mm_sub_ps(iz2,jz3);
1434             dx31             = _mm_sub_ps(ix3,jx1);
1435             dy31             = _mm_sub_ps(iy3,jy1);
1436             dz31             = _mm_sub_ps(iz3,jz1);
1437             dx32             = _mm_sub_ps(ix3,jx2);
1438             dy32             = _mm_sub_ps(iy3,jy2);
1439             dz32             = _mm_sub_ps(iz3,jz2);
1440             dx33             = _mm_sub_ps(ix3,jx3);
1441             dy33             = _mm_sub_ps(iy3,jy3);
1442             dz33             = _mm_sub_ps(iz3,jz3);
1443
1444             /* Calculate squared distance and things based on it */
1445             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1446             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1447             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1448             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1449             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1450             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1451             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1452             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1453             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1454             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1455
1456             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1457             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1458             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1459             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1460             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1461             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1462             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1463             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1464             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1465
1466             rinvsq00         = gmx_mm_inv_ps(rsq00);
1467             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1468             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1469             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1470             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1471             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1472             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1473             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1474             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1475             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1476
1477             fjx0             = _mm_setzero_ps();
1478             fjy0             = _mm_setzero_ps();
1479             fjz0             = _mm_setzero_ps();
1480             fjx1             = _mm_setzero_ps();
1481             fjy1             = _mm_setzero_ps();
1482             fjz1             = _mm_setzero_ps();
1483             fjx2             = _mm_setzero_ps();
1484             fjy2             = _mm_setzero_ps();
1485             fjz2             = _mm_setzero_ps();
1486             fjx3             = _mm_setzero_ps();
1487             fjy3             = _mm_setzero_ps();
1488             fjz3             = _mm_setzero_ps();
1489
1490             /**************************
1491              * CALCULATE INTERACTIONS *
1492              **************************/
1493
1494             /* LENNARD-JONES DISPERSION/REPULSION */
1495
1496             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1497             fvdw             = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1498
1499             fscal            = fvdw;
1500
1501             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1502
1503              /* Update vectorial force */
1504             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1505             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1506             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1507
1508             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1509             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1510             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1511
1512             /**************************
1513              * CALCULATE INTERACTIONS *
1514              **************************/
1515
1516             /* COULOMB ELECTROSTATICS */
1517             velec            = _mm_mul_ps(qq11,rinv11);
1518             felec            = _mm_mul_ps(velec,rinvsq11);
1519
1520             fscal            = felec;
1521
1522             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1523
1524              /* Update vectorial force */
1525             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1526             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1527             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1528
1529             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1530             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1531             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1532
1533             /**************************
1534              * CALCULATE INTERACTIONS *
1535              **************************/
1536
1537             /* COULOMB ELECTROSTATICS */
1538             velec            = _mm_mul_ps(qq12,rinv12);
1539             felec            = _mm_mul_ps(velec,rinvsq12);
1540
1541             fscal            = felec;
1542
1543             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1544
1545              /* Update vectorial force */
1546             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1547             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1548             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1549
1550             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1551             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1552             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1553
1554             /**************************
1555              * CALCULATE INTERACTIONS *
1556              **************************/
1557
1558             /* COULOMB ELECTROSTATICS */
1559             velec            = _mm_mul_ps(qq13,rinv13);
1560             felec            = _mm_mul_ps(velec,rinvsq13);
1561
1562             fscal            = felec;
1563
1564             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1565
1566              /* Update vectorial force */
1567             fix1             = _mm_macc_ps(dx13,fscal,fix1);
1568             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
1569             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
1570
1571             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
1572             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
1573             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
1574
1575             /**************************
1576              * CALCULATE INTERACTIONS *
1577              **************************/
1578
1579             /* COULOMB ELECTROSTATICS */
1580             velec            = _mm_mul_ps(qq21,rinv21);
1581             felec            = _mm_mul_ps(velec,rinvsq21);
1582
1583             fscal            = felec;
1584
1585             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1586
1587              /* Update vectorial force */
1588             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1589             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1590             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1591
1592             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1593             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1594             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1595
1596             /**************************
1597              * CALCULATE INTERACTIONS *
1598              **************************/
1599
1600             /* COULOMB ELECTROSTATICS */
1601             velec            = _mm_mul_ps(qq22,rinv22);
1602             felec            = _mm_mul_ps(velec,rinvsq22);
1603
1604             fscal            = felec;
1605
1606             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1607
1608              /* Update vectorial force */
1609             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1610             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1611             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1612
1613             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1614             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1615             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1616
1617             /**************************
1618              * CALCULATE INTERACTIONS *
1619              **************************/
1620
1621             /* COULOMB ELECTROSTATICS */
1622             velec            = _mm_mul_ps(qq23,rinv23);
1623             felec            = _mm_mul_ps(velec,rinvsq23);
1624
1625             fscal            = felec;
1626
1627             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1628
1629              /* Update vectorial force */
1630             fix2             = _mm_macc_ps(dx23,fscal,fix2);
1631             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
1632             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
1633
1634             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
1635             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
1636             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
1637
1638             /**************************
1639              * CALCULATE INTERACTIONS *
1640              **************************/
1641
1642             /* COULOMB ELECTROSTATICS */
1643             velec            = _mm_mul_ps(qq31,rinv31);
1644             felec            = _mm_mul_ps(velec,rinvsq31);
1645
1646             fscal            = felec;
1647
1648             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1649
1650              /* Update vectorial force */
1651             fix3             = _mm_macc_ps(dx31,fscal,fix3);
1652             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
1653             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
1654
1655             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
1656             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
1657             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
1658
1659             /**************************
1660              * CALCULATE INTERACTIONS *
1661              **************************/
1662
1663             /* COULOMB ELECTROSTATICS */
1664             velec            = _mm_mul_ps(qq32,rinv32);
1665             felec            = _mm_mul_ps(velec,rinvsq32);
1666
1667             fscal            = felec;
1668
1669             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1670
1671              /* Update vectorial force */
1672             fix3             = _mm_macc_ps(dx32,fscal,fix3);
1673             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
1674             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
1675
1676             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
1677             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
1678             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
1679
1680             /**************************
1681              * CALCULATE INTERACTIONS *
1682              **************************/
1683
1684             /* COULOMB ELECTROSTATICS */
1685             velec            = _mm_mul_ps(qq33,rinv33);
1686             felec            = _mm_mul_ps(velec,rinvsq33);
1687
1688             fscal            = felec;
1689
1690             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1691
1692              /* Update vectorial force */
1693             fix3             = _mm_macc_ps(dx33,fscal,fix3);
1694             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
1695             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
1696
1697             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
1698             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
1699             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
1700
1701             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1702             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1703             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1704             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1705
1706             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1707                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1708                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1709
1710             /* Inner loop uses 303 flops */
1711         }
1712
1713         /* End of innermost loop */
1714
1715         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1716                                               f+i_coord_offset,fshift+i_shift_offset);
1717
1718         /* Increment number of inner iterations */
1719         inneriter                  += j_index_end - j_index_start;
1720
1721         /* Outer loop uses 24 flops */
1722     }
1723
1724     /* Increment number of outer iterations */
1725     outeriter        += nri;
1726
1727     /* Update outer/inner flops */
1728
1729     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*303);
1730 }