Merge release-5-0 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_single / nb_kernel_ElecCoul_VdwLJ_GeomW4W4_avx_128_fma_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/legacyheaders/nrnb.h"
48
49 #include "gromacs/simd/math_x86_avx_128_fma_single.h"
50 #include "kernelutil_x86_avx_128_fma_single.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_avx_128_fma_single
54  * Electrostatics interaction: Coulomb
55  * VdW interaction:            LennardJones
56  * Geometry:                   Water4-Water4
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_avx_128_fma_single
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70      * just 0 for non-waters.
71      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB,jnrC,jnrD;
77     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
80     real             rcutoff_scalar;
81     real             *shiftvec,*fshift,*x,*f;
82     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83     real             scratch[4*DIM];
84     __m128           fscal,rcutoff,rcutoff2,jidxall;
85     int              vdwioffset0;
86     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87     int              vdwioffset1;
88     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89     int              vdwioffset2;
90     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91     int              vdwioffset3;
92     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
93     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
94     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
95     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
96     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
97     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
98     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
99     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
100     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
101     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
102     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
103     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
104     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
105     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
106     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
107     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
108     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
109     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
110     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
111     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
112     real             *charge;
113     int              nvdwtype;
114     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
115     int              *vdwtype;
116     real             *vdwparam;
117     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
118     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
119     __m128           dummy_mask,cutoff_mask;
120     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
121     __m128           one     = _mm_set1_ps(1.0);
122     __m128           two     = _mm_set1_ps(2.0);
123     x                = xx[0];
124     f                = ff[0];
125
126     nri              = nlist->nri;
127     iinr             = nlist->iinr;
128     jindex           = nlist->jindex;
129     jjnr             = nlist->jjnr;
130     shiftidx         = nlist->shift;
131     gid              = nlist->gid;
132     shiftvec         = fr->shift_vec[0];
133     fshift           = fr->fshift[0];
134     facel            = _mm_set1_ps(fr->epsfac);
135     charge           = mdatoms->chargeA;
136     nvdwtype         = fr->ntype;
137     vdwparam         = fr->nbfp;
138     vdwtype          = mdatoms->typeA;
139
140     /* Setup water-specific parameters */
141     inr              = nlist->iinr[0];
142     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
143     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
144     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
145     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
146
147     jq1              = _mm_set1_ps(charge[inr+1]);
148     jq2              = _mm_set1_ps(charge[inr+2]);
149     jq3              = _mm_set1_ps(charge[inr+3]);
150     vdwjidx0A        = 2*vdwtype[inr+0];
151     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
152     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
153     qq11             = _mm_mul_ps(iq1,jq1);
154     qq12             = _mm_mul_ps(iq1,jq2);
155     qq13             = _mm_mul_ps(iq1,jq3);
156     qq21             = _mm_mul_ps(iq2,jq1);
157     qq22             = _mm_mul_ps(iq2,jq2);
158     qq23             = _mm_mul_ps(iq2,jq3);
159     qq31             = _mm_mul_ps(iq3,jq1);
160     qq32             = _mm_mul_ps(iq3,jq2);
161     qq33             = _mm_mul_ps(iq3,jq3);
162
163     /* Avoid stupid compiler warnings */
164     jnrA = jnrB = jnrC = jnrD = 0;
165     j_coord_offsetA = 0;
166     j_coord_offsetB = 0;
167     j_coord_offsetC = 0;
168     j_coord_offsetD = 0;
169
170     outeriter        = 0;
171     inneriter        = 0;
172
173     for(iidx=0;iidx<4*DIM;iidx++)
174     {
175         scratch[iidx] = 0.0;
176     }
177
178     /* Start outer loop over neighborlists */
179     for(iidx=0; iidx<nri; iidx++)
180     {
181         /* Load shift vector for this list */
182         i_shift_offset   = DIM*shiftidx[iidx];
183
184         /* Load limits for loop over neighbors */
185         j_index_start    = jindex[iidx];
186         j_index_end      = jindex[iidx+1];
187
188         /* Get outer coordinate index */
189         inr              = iinr[iidx];
190         i_coord_offset   = DIM*inr;
191
192         /* Load i particle coords and add shift vector */
193         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
194                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
195
196         fix0             = _mm_setzero_ps();
197         fiy0             = _mm_setzero_ps();
198         fiz0             = _mm_setzero_ps();
199         fix1             = _mm_setzero_ps();
200         fiy1             = _mm_setzero_ps();
201         fiz1             = _mm_setzero_ps();
202         fix2             = _mm_setzero_ps();
203         fiy2             = _mm_setzero_ps();
204         fiz2             = _mm_setzero_ps();
205         fix3             = _mm_setzero_ps();
206         fiy3             = _mm_setzero_ps();
207         fiz3             = _mm_setzero_ps();
208
209         /* Reset potential sums */
210         velecsum         = _mm_setzero_ps();
211         vvdwsum          = _mm_setzero_ps();
212
213         /* Start inner kernel loop */
214         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
215         {
216
217             /* Get j neighbor index, and coordinate index */
218             jnrA             = jjnr[jidx];
219             jnrB             = jjnr[jidx+1];
220             jnrC             = jjnr[jidx+2];
221             jnrD             = jjnr[jidx+3];
222             j_coord_offsetA  = DIM*jnrA;
223             j_coord_offsetB  = DIM*jnrB;
224             j_coord_offsetC  = DIM*jnrC;
225             j_coord_offsetD  = DIM*jnrD;
226
227             /* load j atom coordinates */
228             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
229                                               x+j_coord_offsetC,x+j_coord_offsetD,
230                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
231                                               &jy2,&jz2,&jx3,&jy3,&jz3);
232
233             /* Calculate displacement vector */
234             dx00             = _mm_sub_ps(ix0,jx0);
235             dy00             = _mm_sub_ps(iy0,jy0);
236             dz00             = _mm_sub_ps(iz0,jz0);
237             dx11             = _mm_sub_ps(ix1,jx1);
238             dy11             = _mm_sub_ps(iy1,jy1);
239             dz11             = _mm_sub_ps(iz1,jz1);
240             dx12             = _mm_sub_ps(ix1,jx2);
241             dy12             = _mm_sub_ps(iy1,jy2);
242             dz12             = _mm_sub_ps(iz1,jz2);
243             dx13             = _mm_sub_ps(ix1,jx3);
244             dy13             = _mm_sub_ps(iy1,jy3);
245             dz13             = _mm_sub_ps(iz1,jz3);
246             dx21             = _mm_sub_ps(ix2,jx1);
247             dy21             = _mm_sub_ps(iy2,jy1);
248             dz21             = _mm_sub_ps(iz2,jz1);
249             dx22             = _mm_sub_ps(ix2,jx2);
250             dy22             = _mm_sub_ps(iy2,jy2);
251             dz22             = _mm_sub_ps(iz2,jz2);
252             dx23             = _mm_sub_ps(ix2,jx3);
253             dy23             = _mm_sub_ps(iy2,jy3);
254             dz23             = _mm_sub_ps(iz2,jz3);
255             dx31             = _mm_sub_ps(ix3,jx1);
256             dy31             = _mm_sub_ps(iy3,jy1);
257             dz31             = _mm_sub_ps(iz3,jz1);
258             dx32             = _mm_sub_ps(ix3,jx2);
259             dy32             = _mm_sub_ps(iy3,jy2);
260             dz32             = _mm_sub_ps(iz3,jz2);
261             dx33             = _mm_sub_ps(ix3,jx3);
262             dy33             = _mm_sub_ps(iy3,jy3);
263             dz33             = _mm_sub_ps(iz3,jz3);
264
265             /* Calculate squared distance and things based on it */
266             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
267             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
268             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
269             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
270             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
271             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
272             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
273             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
274             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
275             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
276
277             rinv11           = gmx_mm_invsqrt_ps(rsq11);
278             rinv12           = gmx_mm_invsqrt_ps(rsq12);
279             rinv13           = gmx_mm_invsqrt_ps(rsq13);
280             rinv21           = gmx_mm_invsqrt_ps(rsq21);
281             rinv22           = gmx_mm_invsqrt_ps(rsq22);
282             rinv23           = gmx_mm_invsqrt_ps(rsq23);
283             rinv31           = gmx_mm_invsqrt_ps(rsq31);
284             rinv32           = gmx_mm_invsqrt_ps(rsq32);
285             rinv33           = gmx_mm_invsqrt_ps(rsq33);
286
287             rinvsq00         = gmx_mm_inv_ps(rsq00);
288             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
289             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
290             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
291             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
292             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
293             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
294             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
295             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
296             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
297
298             fjx0             = _mm_setzero_ps();
299             fjy0             = _mm_setzero_ps();
300             fjz0             = _mm_setzero_ps();
301             fjx1             = _mm_setzero_ps();
302             fjy1             = _mm_setzero_ps();
303             fjz1             = _mm_setzero_ps();
304             fjx2             = _mm_setzero_ps();
305             fjy2             = _mm_setzero_ps();
306             fjz2             = _mm_setzero_ps();
307             fjx3             = _mm_setzero_ps();
308             fjy3             = _mm_setzero_ps();
309             fjz3             = _mm_setzero_ps();
310
311             /**************************
312              * CALCULATE INTERACTIONS *
313              **************************/
314
315             /* LENNARD-JONES DISPERSION/REPULSION */
316
317             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
318             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
319             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
320             vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
321             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
322
323             /* Update potential sum for this i atom from the interaction with this j atom. */
324             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
325
326             fscal            = fvdw;
327
328              /* Update vectorial force */
329             fix0             = _mm_macc_ps(dx00,fscal,fix0);
330             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
331             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
332
333             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
334             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
335             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
336
337             /**************************
338              * CALCULATE INTERACTIONS *
339              **************************/
340
341             /* COULOMB ELECTROSTATICS */
342             velec            = _mm_mul_ps(qq11,rinv11);
343             felec            = _mm_mul_ps(velec,rinvsq11);
344
345             /* Update potential sum for this i atom from the interaction with this j atom. */
346             velecsum         = _mm_add_ps(velecsum,velec);
347
348             fscal            = felec;
349
350              /* Update vectorial force */
351             fix1             = _mm_macc_ps(dx11,fscal,fix1);
352             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
353             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
354
355             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
356             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
357             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
358
359             /**************************
360              * CALCULATE INTERACTIONS *
361              **************************/
362
363             /* COULOMB ELECTROSTATICS */
364             velec            = _mm_mul_ps(qq12,rinv12);
365             felec            = _mm_mul_ps(velec,rinvsq12);
366
367             /* Update potential sum for this i atom from the interaction with this j atom. */
368             velecsum         = _mm_add_ps(velecsum,velec);
369
370             fscal            = felec;
371
372              /* Update vectorial force */
373             fix1             = _mm_macc_ps(dx12,fscal,fix1);
374             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
375             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
376
377             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
378             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
379             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
380
381             /**************************
382              * CALCULATE INTERACTIONS *
383              **************************/
384
385             /* COULOMB ELECTROSTATICS */
386             velec            = _mm_mul_ps(qq13,rinv13);
387             felec            = _mm_mul_ps(velec,rinvsq13);
388
389             /* Update potential sum for this i atom from the interaction with this j atom. */
390             velecsum         = _mm_add_ps(velecsum,velec);
391
392             fscal            = felec;
393
394              /* Update vectorial force */
395             fix1             = _mm_macc_ps(dx13,fscal,fix1);
396             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
397             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
398
399             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
400             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
401             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
402
403             /**************************
404              * CALCULATE INTERACTIONS *
405              **************************/
406
407             /* COULOMB ELECTROSTATICS */
408             velec            = _mm_mul_ps(qq21,rinv21);
409             felec            = _mm_mul_ps(velec,rinvsq21);
410
411             /* Update potential sum for this i atom from the interaction with this j atom. */
412             velecsum         = _mm_add_ps(velecsum,velec);
413
414             fscal            = felec;
415
416              /* Update vectorial force */
417             fix2             = _mm_macc_ps(dx21,fscal,fix2);
418             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
419             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
420
421             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
422             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
423             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
424
425             /**************************
426              * CALCULATE INTERACTIONS *
427              **************************/
428
429             /* COULOMB ELECTROSTATICS */
430             velec            = _mm_mul_ps(qq22,rinv22);
431             felec            = _mm_mul_ps(velec,rinvsq22);
432
433             /* Update potential sum for this i atom from the interaction with this j atom. */
434             velecsum         = _mm_add_ps(velecsum,velec);
435
436             fscal            = felec;
437
438              /* Update vectorial force */
439             fix2             = _mm_macc_ps(dx22,fscal,fix2);
440             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
441             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
442
443             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
444             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
445             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
446
447             /**************************
448              * CALCULATE INTERACTIONS *
449              **************************/
450
451             /* COULOMB ELECTROSTATICS */
452             velec            = _mm_mul_ps(qq23,rinv23);
453             felec            = _mm_mul_ps(velec,rinvsq23);
454
455             /* Update potential sum for this i atom from the interaction with this j atom. */
456             velecsum         = _mm_add_ps(velecsum,velec);
457
458             fscal            = felec;
459
460              /* Update vectorial force */
461             fix2             = _mm_macc_ps(dx23,fscal,fix2);
462             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
463             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
464
465             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
466             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
467             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
468
469             /**************************
470              * CALCULATE INTERACTIONS *
471              **************************/
472
473             /* COULOMB ELECTROSTATICS */
474             velec            = _mm_mul_ps(qq31,rinv31);
475             felec            = _mm_mul_ps(velec,rinvsq31);
476
477             /* Update potential sum for this i atom from the interaction with this j atom. */
478             velecsum         = _mm_add_ps(velecsum,velec);
479
480             fscal            = felec;
481
482              /* Update vectorial force */
483             fix3             = _mm_macc_ps(dx31,fscal,fix3);
484             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
485             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
486
487             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
488             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
489             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
490
491             /**************************
492              * CALCULATE INTERACTIONS *
493              **************************/
494
495             /* COULOMB ELECTROSTATICS */
496             velec            = _mm_mul_ps(qq32,rinv32);
497             felec            = _mm_mul_ps(velec,rinvsq32);
498
499             /* Update potential sum for this i atom from the interaction with this j atom. */
500             velecsum         = _mm_add_ps(velecsum,velec);
501
502             fscal            = felec;
503
504              /* Update vectorial force */
505             fix3             = _mm_macc_ps(dx32,fscal,fix3);
506             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
507             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
508
509             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
510             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
511             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
512
513             /**************************
514              * CALCULATE INTERACTIONS *
515              **************************/
516
517             /* COULOMB ELECTROSTATICS */
518             velec            = _mm_mul_ps(qq33,rinv33);
519             felec            = _mm_mul_ps(velec,rinvsq33);
520
521             /* Update potential sum for this i atom from the interaction with this j atom. */
522             velecsum         = _mm_add_ps(velecsum,velec);
523
524             fscal            = felec;
525
526              /* Update vectorial force */
527             fix3             = _mm_macc_ps(dx33,fscal,fix3);
528             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
529             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
530
531             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
532             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
533             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
534
535             fjptrA             = f+j_coord_offsetA;
536             fjptrB             = f+j_coord_offsetB;
537             fjptrC             = f+j_coord_offsetC;
538             fjptrD             = f+j_coord_offsetD;
539
540             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
541                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
542                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
543
544             /* Inner loop uses 317 flops */
545         }
546
547         if(jidx<j_index_end)
548         {
549
550             /* Get j neighbor index, and coordinate index */
551             jnrlistA         = jjnr[jidx];
552             jnrlistB         = jjnr[jidx+1];
553             jnrlistC         = jjnr[jidx+2];
554             jnrlistD         = jjnr[jidx+3];
555             /* Sign of each element will be negative for non-real atoms.
556              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
557              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
558              */
559             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
560             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
561             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
562             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
563             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
564             j_coord_offsetA  = DIM*jnrA;
565             j_coord_offsetB  = DIM*jnrB;
566             j_coord_offsetC  = DIM*jnrC;
567             j_coord_offsetD  = DIM*jnrD;
568
569             /* load j atom coordinates */
570             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
571                                               x+j_coord_offsetC,x+j_coord_offsetD,
572                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
573                                               &jy2,&jz2,&jx3,&jy3,&jz3);
574
575             /* Calculate displacement vector */
576             dx00             = _mm_sub_ps(ix0,jx0);
577             dy00             = _mm_sub_ps(iy0,jy0);
578             dz00             = _mm_sub_ps(iz0,jz0);
579             dx11             = _mm_sub_ps(ix1,jx1);
580             dy11             = _mm_sub_ps(iy1,jy1);
581             dz11             = _mm_sub_ps(iz1,jz1);
582             dx12             = _mm_sub_ps(ix1,jx2);
583             dy12             = _mm_sub_ps(iy1,jy2);
584             dz12             = _mm_sub_ps(iz1,jz2);
585             dx13             = _mm_sub_ps(ix1,jx3);
586             dy13             = _mm_sub_ps(iy1,jy3);
587             dz13             = _mm_sub_ps(iz1,jz3);
588             dx21             = _mm_sub_ps(ix2,jx1);
589             dy21             = _mm_sub_ps(iy2,jy1);
590             dz21             = _mm_sub_ps(iz2,jz1);
591             dx22             = _mm_sub_ps(ix2,jx2);
592             dy22             = _mm_sub_ps(iy2,jy2);
593             dz22             = _mm_sub_ps(iz2,jz2);
594             dx23             = _mm_sub_ps(ix2,jx3);
595             dy23             = _mm_sub_ps(iy2,jy3);
596             dz23             = _mm_sub_ps(iz2,jz3);
597             dx31             = _mm_sub_ps(ix3,jx1);
598             dy31             = _mm_sub_ps(iy3,jy1);
599             dz31             = _mm_sub_ps(iz3,jz1);
600             dx32             = _mm_sub_ps(ix3,jx2);
601             dy32             = _mm_sub_ps(iy3,jy2);
602             dz32             = _mm_sub_ps(iz3,jz2);
603             dx33             = _mm_sub_ps(ix3,jx3);
604             dy33             = _mm_sub_ps(iy3,jy3);
605             dz33             = _mm_sub_ps(iz3,jz3);
606
607             /* Calculate squared distance and things based on it */
608             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
609             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
610             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
611             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
612             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
613             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
614             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
615             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
616             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
617             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
618
619             rinv11           = gmx_mm_invsqrt_ps(rsq11);
620             rinv12           = gmx_mm_invsqrt_ps(rsq12);
621             rinv13           = gmx_mm_invsqrt_ps(rsq13);
622             rinv21           = gmx_mm_invsqrt_ps(rsq21);
623             rinv22           = gmx_mm_invsqrt_ps(rsq22);
624             rinv23           = gmx_mm_invsqrt_ps(rsq23);
625             rinv31           = gmx_mm_invsqrt_ps(rsq31);
626             rinv32           = gmx_mm_invsqrt_ps(rsq32);
627             rinv33           = gmx_mm_invsqrt_ps(rsq33);
628
629             rinvsq00         = gmx_mm_inv_ps(rsq00);
630             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
631             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
632             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
633             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
634             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
635             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
636             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
637             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
638             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
639
640             fjx0             = _mm_setzero_ps();
641             fjy0             = _mm_setzero_ps();
642             fjz0             = _mm_setzero_ps();
643             fjx1             = _mm_setzero_ps();
644             fjy1             = _mm_setzero_ps();
645             fjz1             = _mm_setzero_ps();
646             fjx2             = _mm_setzero_ps();
647             fjy2             = _mm_setzero_ps();
648             fjz2             = _mm_setzero_ps();
649             fjx3             = _mm_setzero_ps();
650             fjy3             = _mm_setzero_ps();
651             fjz3             = _mm_setzero_ps();
652
653             /**************************
654              * CALCULATE INTERACTIONS *
655              **************************/
656
657             /* LENNARD-JONES DISPERSION/REPULSION */
658
659             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
660             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
661             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
662             vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
663             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
664
665             /* Update potential sum for this i atom from the interaction with this j atom. */
666             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
667             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
668
669             fscal            = fvdw;
670
671             fscal            = _mm_andnot_ps(dummy_mask,fscal);
672
673              /* Update vectorial force */
674             fix0             = _mm_macc_ps(dx00,fscal,fix0);
675             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
676             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
677
678             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
679             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
680             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
681
682             /**************************
683              * CALCULATE INTERACTIONS *
684              **************************/
685
686             /* COULOMB ELECTROSTATICS */
687             velec            = _mm_mul_ps(qq11,rinv11);
688             felec            = _mm_mul_ps(velec,rinvsq11);
689
690             /* Update potential sum for this i atom from the interaction with this j atom. */
691             velec            = _mm_andnot_ps(dummy_mask,velec);
692             velecsum         = _mm_add_ps(velecsum,velec);
693
694             fscal            = felec;
695
696             fscal            = _mm_andnot_ps(dummy_mask,fscal);
697
698              /* Update vectorial force */
699             fix1             = _mm_macc_ps(dx11,fscal,fix1);
700             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
701             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
702
703             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
704             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
705             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
706
707             /**************************
708              * CALCULATE INTERACTIONS *
709              **************************/
710
711             /* COULOMB ELECTROSTATICS */
712             velec            = _mm_mul_ps(qq12,rinv12);
713             felec            = _mm_mul_ps(velec,rinvsq12);
714
715             /* Update potential sum for this i atom from the interaction with this j atom. */
716             velec            = _mm_andnot_ps(dummy_mask,velec);
717             velecsum         = _mm_add_ps(velecsum,velec);
718
719             fscal            = felec;
720
721             fscal            = _mm_andnot_ps(dummy_mask,fscal);
722
723              /* Update vectorial force */
724             fix1             = _mm_macc_ps(dx12,fscal,fix1);
725             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
726             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
727
728             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
729             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
730             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
731
732             /**************************
733              * CALCULATE INTERACTIONS *
734              **************************/
735
736             /* COULOMB ELECTROSTATICS */
737             velec            = _mm_mul_ps(qq13,rinv13);
738             felec            = _mm_mul_ps(velec,rinvsq13);
739
740             /* Update potential sum for this i atom from the interaction with this j atom. */
741             velec            = _mm_andnot_ps(dummy_mask,velec);
742             velecsum         = _mm_add_ps(velecsum,velec);
743
744             fscal            = felec;
745
746             fscal            = _mm_andnot_ps(dummy_mask,fscal);
747
748              /* Update vectorial force */
749             fix1             = _mm_macc_ps(dx13,fscal,fix1);
750             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
751             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
752
753             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
754             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
755             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
756
757             /**************************
758              * CALCULATE INTERACTIONS *
759              **************************/
760
761             /* COULOMB ELECTROSTATICS */
762             velec            = _mm_mul_ps(qq21,rinv21);
763             felec            = _mm_mul_ps(velec,rinvsq21);
764
765             /* Update potential sum for this i atom from the interaction with this j atom. */
766             velec            = _mm_andnot_ps(dummy_mask,velec);
767             velecsum         = _mm_add_ps(velecsum,velec);
768
769             fscal            = felec;
770
771             fscal            = _mm_andnot_ps(dummy_mask,fscal);
772
773              /* Update vectorial force */
774             fix2             = _mm_macc_ps(dx21,fscal,fix2);
775             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
776             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
777
778             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
779             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
780             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
781
782             /**************************
783              * CALCULATE INTERACTIONS *
784              **************************/
785
786             /* COULOMB ELECTROSTATICS */
787             velec            = _mm_mul_ps(qq22,rinv22);
788             felec            = _mm_mul_ps(velec,rinvsq22);
789
790             /* Update potential sum for this i atom from the interaction with this j atom. */
791             velec            = _mm_andnot_ps(dummy_mask,velec);
792             velecsum         = _mm_add_ps(velecsum,velec);
793
794             fscal            = felec;
795
796             fscal            = _mm_andnot_ps(dummy_mask,fscal);
797
798              /* Update vectorial force */
799             fix2             = _mm_macc_ps(dx22,fscal,fix2);
800             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
801             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
802
803             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
804             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
805             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
806
807             /**************************
808              * CALCULATE INTERACTIONS *
809              **************************/
810
811             /* COULOMB ELECTROSTATICS */
812             velec            = _mm_mul_ps(qq23,rinv23);
813             felec            = _mm_mul_ps(velec,rinvsq23);
814
815             /* Update potential sum for this i atom from the interaction with this j atom. */
816             velec            = _mm_andnot_ps(dummy_mask,velec);
817             velecsum         = _mm_add_ps(velecsum,velec);
818
819             fscal            = felec;
820
821             fscal            = _mm_andnot_ps(dummy_mask,fscal);
822
823              /* Update vectorial force */
824             fix2             = _mm_macc_ps(dx23,fscal,fix2);
825             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
826             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
827
828             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
829             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
830             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
831
832             /**************************
833              * CALCULATE INTERACTIONS *
834              **************************/
835
836             /* COULOMB ELECTROSTATICS */
837             velec            = _mm_mul_ps(qq31,rinv31);
838             felec            = _mm_mul_ps(velec,rinvsq31);
839
840             /* Update potential sum for this i atom from the interaction with this j atom. */
841             velec            = _mm_andnot_ps(dummy_mask,velec);
842             velecsum         = _mm_add_ps(velecsum,velec);
843
844             fscal            = felec;
845
846             fscal            = _mm_andnot_ps(dummy_mask,fscal);
847
848              /* Update vectorial force */
849             fix3             = _mm_macc_ps(dx31,fscal,fix3);
850             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
851             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
852
853             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
854             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
855             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
856
857             /**************************
858              * CALCULATE INTERACTIONS *
859              **************************/
860
861             /* COULOMB ELECTROSTATICS */
862             velec            = _mm_mul_ps(qq32,rinv32);
863             felec            = _mm_mul_ps(velec,rinvsq32);
864
865             /* Update potential sum for this i atom from the interaction with this j atom. */
866             velec            = _mm_andnot_ps(dummy_mask,velec);
867             velecsum         = _mm_add_ps(velecsum,velec);
868
869             fscal            = felec;
870
871             fscal            = _mm_andnot_ps(dummy_mask,fscal);
872
873              /* Update vectorial force */
874             fix3             = _mm_macc_ps(dx32,fscal,fix3);
875             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
876             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
877
878             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
879             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
880             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
881
882             /**************************
883              * CALCULATE INTERACTIONS *
884              **************************/
885
886             /* COULOMB ELECTROSTATICS */
887             velec            = _mm_mul_ps(qq33,rinv33);
888             felec            = _mm_mul_ps(velec,rinvsq33);
889
890             /* Update potential sum for this i atom from the interaction with this j atom. */
891             velec            = _mm_andnot_ps(dummy_mask,velec);
892             velecsum         = _mm_add_ps(velecsum,velec);
893
894             fscal            = felec;
895
896             fscal            = _mm_andnot_ps(dummy_mask,fscal);
897
898              /* Update vectorial force */
899             fix3             = _mm_macc_ps(dx33,fscal,fix3);
900             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
901             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
902
903             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
904             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
905             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
906
907             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
908             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
909             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
910             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
911
912             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
913                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
914                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
915
916             /* Inner loop uses 317 flops */
917         }
918
919         /* End of innermost loop */
920
921         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
922                                               f+i_coord_offset,fshift+i_shift_offset);
923
924         ggid                        = gid[iidx];
925         /* Update potential energies */
926         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
927         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
928
929         /* Increment number of inner iterations */
930         inneriter                  += j_index_end - j_index_start;
931
932         /* Outer loop uses 26 flops */
933     }
934
935     /* Increment number of outer iterations */
936     outeriter        += nri;
937
938     /* Update outer/inner flops */
939
940     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*317);
941 }
942 /*
943  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_avx_128_fma_single
944  * Electrostatics interaction: Coulomb
945  * VdW interaction:            LennardJones
946  * Geometry:                   Water4-Water4
947  * Calculate force/pot:        Force
948  */
949 void
950 nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_avx_128_fma_single
951                     (t_nblist                    * gmx_restrict       nlist,
952                      rvec                        * gmx_restrict          xx,
953                      rvec                        * gmx_restrict          ff,
954                      t_forcerec                  * gmx_restrict          fr,
955                      t_mdatoms                   * gmx_restrict     mdatoms,
956                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
957                      t_nrnb                      * gmx_restrict        nrnb)
958 {
959     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
960      * just 0 for non-waters.
961      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
962      * jnr indices corresponding to data put in the four positions in the SIMD register.
963      */
964     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
965     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
966     int              jnrA,jnrB,jnrC,jnrD;
967     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
968     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
969     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
970     real             rcutoff_scalar;
971     real             *shiftvec,*fshift,*x,*f;
972     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
973     real             scratch[4*DIM];
974     __m128           fscal,rcutoff,rcutoff2,jidxall;
975     int              vdwioffset0;
976     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
977     int              vdwioffset1;
978     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
979     int              vdwioffset2;
980     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
981     int              vdwioffset3;
982     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
983     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
984     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
985     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
986     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
987     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
988     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
989     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
990     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
991     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
992     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
993     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
994     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
995     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
996     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
997     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
998     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
999     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1000     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1001     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1002     real             *charge;
1003     int              nvdwtype;
1004     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1005     int              *vdwtype;
1006     real             *vdwparam;
1007     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1008     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1009     __m128           dummy_mask,cutoff_mask;
1010     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1011     __m128           one     = _mm_set1_ps(1.0);
1012     __m128           two     = _mm_set1_ps(2.0);
1013     x                = xx[0];
1014     f                = ff[0];
1015
1016     nri              = nlist->nri;
1017     iinr             = nlist->iinr;
1018     jindex           = nlist->jindex;
1019     jjnr             = nlist->jjnr;
1020     shiftidx         = nlist->shift;
1021     gid              = nlist->gid;
1022     shiftvec         = fr->shift_vec[0];
1023     fshift           = fr->fshift[0];
1024     facel            = _mm_set1_ps(fr->epsfac);
1025     charge           = mdatoms->chargeA;
1026     nvdwtype         = fr->ntype;
1027     vdwparam         = fr->nbfp;
1028     vdwtype          = mdatoms->typeA;
1029
1030     /* Setup water-specific parameters */
1031     inr              = nlist->iinr[0];
1032     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1033     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1034     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1035     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1036
1037     jq1              = _mm_set1_ps(charge[inr+1]);
1038     jq2              = _mm_set1_ps(charge[inr+2]);
1039     jq3              = _mm_set1_ps(charge[inr+3]);
1040     vdwjidx0A        = 2*vdwtype[inr+0];
1041     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1042     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1043     qq11             = _mm_mul_ps(iq1,jq1);
1044     qq12             = _mm_mul_ps(iq1,jq2);
1045     qq13             = _mm_mul_ps(iq1,jq3);
1046     qq21             = _mm_mul_ps(iq2,jq1);
1047     qq22             = _mm_mul_ps(iq2,jq2);
1048     qq23             = _mm_mul_ps(iq2,jq3);
1049     qq31             = _mm_mul_ps(iq3,jq1);
1050     qq32             = _mm_mul_ps(iq3,jq2);
1051     qq33             = _mm_mul_ps(iq3,jq3);
1052
1053     /* Avoid stupid compiler warnings */
1054     jnrA = jnrB = jnrC = jnrD = 0;
1055     j_coord_offsetA = 0;
1056     j_coord_offsetB = 0;
1057     j_coord_offsetC = 0;
1058     j_coord_offsetD = 0;
1059
1060     outeriter        = 0;
1061     inneriter        = 0;
1062
1063     for(iidx=0;iidx<4*DIM;iidx++)
1064     {
1065         scratch[iidx] = 0.0;
1066     }
1067
1068     /* Start outer loop over neighborlists */
1069     for(iidx=0; iidx<nri; iidx++)
1070     {
1071         /* Load shift vector for this list */
1072         i_shift_offset   = DIM*shiftidx[iidx];
1073
1074         /* Load limits for loop over neighbors */
1075         j_index_start    = jindex[iidx];
1076         j_index_end      = jindex[iidx+1];
1077
1078         /* Get outer coordinate index */
1079         inr              = iinr[iidx];
1080         i_coord_offset   = DIM*inr;
1081
1082         /* Load i particle coords and add shift vector */
1083         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1084                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1085
1086         fix0             = _mm_setzero_ps();
1087         fiy0             = _mm_setzero_ps();
1088         fiz0             = _mm_setzero_ps();
1089         fix1             = _mm_setzero_ps();
1090         fiy1             = _mm_setzero_ps();
1091         fiz1             = _mm_setzero_ps();
1092         fix2             = _mm_setzero_ps();
1093         fiy2             = _mm_setzero_ps();
1094         fiz2             = _mm_setzero_ps();
1095         fix3             = _mm_setzero_ps();
1096         fiy3             = _mm_setzero_ps();
1097         fiz3             = _mm_setzero_ps();
1098
1099         /* Start inner kernel loop */
1100         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1101         {
1102
1103             /* Get j neighbor index, and coordinate index */
1104             jnrA             = jjnr[jidx];
1105             jnrB             = jjnr[jidx+1];
1106             jnrC             = jjnr[jidx+2];
1107             jnrD             = jjnr[jidx+3];
1108             j_coord_offsetA  = DIM*jnrA;
1109             j_coord_offsetB  = DIM*jnrB;
1110             j_coord_offsetC  = DIM*jnrC;
1111             j_coord_offsetD  = DIM*jnrD;
1112
1113             /* load j atom coordinates */
1114             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1115                                               x+j_coord_offsetC,x+j_coord_offsetD,
1116                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1117                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1118
1119             /* Calculate displacement vector */
1120             dx00             = _mm_sub_ps(ix0,jx0);
1121             dy00             = _mm_sub_ps(iy0,jy0);
1122             dz00             = _mm_sub_ps(iz0,jz0);
1123             dx11             = _mm_sub_ps(ix1,jx1);
1124             dy11             = _mm_sub_ps(iy1,jy1);
1125             dz11             = _mm_sub_ps(iz1,jz1);
1126             dx12             = _mm_sub_ps(ix1,jx2);
1127             dy12             = _mm_sub_ps(iy1,jy2);
1128             dz12             = _mm_sub_ps(iz1,jz2);
1129             dx13             = _mm_sub_ps(ix1,jx3);
1130             dy13             = _mm_sub_ps(iy1,jy3);
1131             dz13             = _mm_sub_ps(iz1,jz3);
1132             dx21             = _mm_sub_ps(ix2,jx1);
1133             dy21             = _mm_sub_ps(iy2,jy1);
1134             dz21             = _mm_sub_ps(iz2,jz1);
1135             dx22             = _mm_sub_ps(ix2,jx2);
1136             dy22             = _mm_sub_ps(iy2,jy2);
1137             dz22             = _mm_sub_ps(iz2,jz2);
1138             dx23             = _mm_sub_ps(ix2,jx3);
1139             dy23             = _mm_sub_ps(iy2,jy3);
1140             dz23             = _mm_sub_ps(iz2,jz3);
1141             dx31             = _mm_sub_ps(ix3,jx1);
1142             dy31             = _mm_sub_ps(iy3,jy1);
1143             dz31             = _mm_sub_ps(iz3,jz1);
1144             dx32             = _mm_sub_ps(ix3,jx2);
1145             dy32             = _mm_sub_ps(iy3,jy2);
1146             dz32             = _mm_sub_ps(iz3,jz2);
1147             dx33             = _mm_sub_ps(ix3,jx3);
1148             dy33             = _mm_sub_ps(iy3,jy3);
1149             dz33             = _mm_sub_ps(iz3,jz3);
1150
1151             /* Calculate squared distance and things based on it */
1152             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1153             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1154             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1155             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1156             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1157             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1158             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1159             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1160             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1161             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1162
1163             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1164             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1165             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1166             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1167             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1168             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1169             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1170             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1171             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1172
1173             rinvsq00         = gmx_mm_inv_ps(rsq00);
1174             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1175             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1176             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1177             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1178             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1179             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1180             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1181             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1182             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1183
1184             fjx0             = _mm_setzero_ps();
1185             fjy0             = _mm_setzero_ps();
1186             fjz0             = _mm_setzero_ps();
1187             fjx1             = _mm_setzero_ps();
1188             fjy1             = _mm_setzero_ps();
1189             fjz1             = _mm_setzero_ps();
1190             fjx2             = _mm_setzero_ps();
1191             fjy2             = _mm_setzero_ps();
1192             fjz2             = _mm_setzero_ps();
1193             fjx3             = _mm_setzero_ps();
1194             fjy3             = _mm_setzero_ps();
1195             fjz3             = _mm_setzero_ps();
1196
1197             /**************************
1198              * CALCULATE INTERACTIONS *
1199              **************************/
1200
1201             /* LENNARD-JONES DISPERSION/REPULSION */
1202
1203             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1204             fvdw             = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1205
1206             fscal            = fvdw;
1207
1208              /* Update vectorial force */
1209             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1210             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1211             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1212
1213             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1214             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1215             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1216
1217             /**************************
1218              * CALCULATE INTERACTIONS *
1219              **************************/
1220
1221             /* COULOMB ELECTROSTATICS */
1222             velec            = _mm_mul_ps(qq11,rinv11);
1223             felec            = _mm_mul_ps(velec,rinvsq11);
1224
1225             fscal            = felec;
1226
1227              /* Update vectorial force */
1228             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1229             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1230             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1231
1232             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1233             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1234             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1235
1236             /**************************
1237              * CALCULATE INTERACTIONS *
1238              **************************/
1239
1240             /* COULOMB ELECTROSTATICS */
1241             velec            = _mm_mul_ps(qq12,rinv12);
1242             felec            = _mm_mul_ps(velec,rinvsq12);
1243
1244             fscal            = felec;
1245
1246              /* Update vectorial force */
1247             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1248             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1249             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1250
1251             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1252             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1253             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1254
1255             /**************************
1256              * CALCULATE INTERACTIONS *
1257              **************************/
1258
1259             /* COULOMB ELECTROSTATICS */
1260             velec            = _mm_mul_ps(qq13,rinv13);
1261             felec            = _mm_mul_ps(velec,rinvsq13);
1262
1263             fscal            = felec;
1264
1265              /* Update vectorial force */
1266             fix1             = _mm_macc_ps(dx13,fscal,fix1);
1267             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
1268             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
1269
1270             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
1271             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
1272             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
1273
1274             /**************************
1275              * CALCULATE INTERACTIONS *
1276              **************************/
1277
1278             /* COULOMB ELECTROSTATICS */
1279             velec            = _mm_mul_ps(qq21,rinv21);
1280             felec            = _mm_mul_ps(velec,rinvsq21);
1281
1282             fscal            = felec;
1283
1284              /* Update vectorial force */
1285             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1286             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1287             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1288
1289             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1290             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1291             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1292
1293             /**************************
1294              * CALCULATE INTERACTIONS *
1295              **************************/
1296
1297             /* COULOMB ELECTROSTATICS */
1298             velec            = _mm_mul_ps(qq22,rinv22);
1299             felec            = _mm_mul_ps(velec,rinvsq22);
1300
1301             fscal            = felec;
1302
1303              /* Update vectorial force */
1304             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1305             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1306             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1307
1308             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1309             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1310             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1311
1312             /**************************
1313              * CALCULATE INTERACTIONS *
1314              **************************/
1315
1316             /* COULOMB ELECTROSTATICS */
1317             velec            = _mm_mul_ps(qq23,rinv23);
1318             felec            = _mm_mul_ps(velec,rinvsq23);
1319
1320             fscal            = felec;
1321
1322              /* Update vectorial force */
1323             fix2             = _mm_macc_ps(dx23,fscal,fix2);
1324             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
1325             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
1326
1327             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
1328             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
1329             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
1330
1331             /**************************
1332              * CALCULATE INTERACTIONS *
1333              **************************/
1334
1335             /* COULOMB ELECTROSTATICS */
1336             velec            = _mm_mul_ps(qq31,rinv31);
1337             felec            = _mm_mul_ps(velec,rinvsq31);
1338
1339             fscal            = felec;
1340
1341              /* Update vectorial force */
1342             fix3             = _mm_macc_ps(dx31,fscal,fix3);
1343             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
1344             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
1345
1346             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
1347             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
1348             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
1349
1350             /**************************
1351              * CALCULATE INTERACTIONS *
1352              **************************/
1353
1354             /* COULOMB ELECTROSTATICS */
1355             velec            = _mm_mul_ps(qq32,rinv32);
1356             felec            = _mm_mul_ps(velec,rinvsq32);
1357
1358             fscal            = felec;
1359
1360              /* Update vectorial force */
1361             fix3             = _mm_macc_ps(dx32,fscal,fix3);
1362             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
1363             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
1364
1365             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
1366             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
1367             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
1368
1369             /**************************
1370              * CALCULATE INTERACTIONS *
1371              **************************/
1372
1373             /* COULOMB ELECTROSTATICS */
1374             velec            = _mm_mul_ps(qq33,rinv33);
1375             felec            = _mm_mul_ps(velec,rinvsq33);
1376
1377             fscal            = felec;
1378
1379              /* Update vectorial force */
1380             fix3             = _mm_macc_ps(dx33,fscal,fix3);
1381             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
1382             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
1383
1384             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
1385             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
1386             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
1387
1388             fjptrA             = f+j_coord_offsetA;
1389             fjptrB             = f+j_coord_offsetB;
1390             fjptrC             = f+j_coord_offsetC;
1391             fjptrD             = f+j_coord_offsetD;
1392
1393             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1394                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1395                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1396
1397             /* Inner loop uses 303 flops */
1398         }
1399
1400         if(jidx<j_index_end)
1401         {
1402
1403             /* Get j neighbor index, and coordinate index */
1404             jnrlistA         = jjnr[jidx];
1405             jnrlistB         = jjnr[jidx+1];
1406             jnrlistC         = jjnr[jidx+2];
1407             jnrlistD         = jjnr[jidx+3];
1408             /* Sign of each element will be negative for non-real atoms.
1409              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1410              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1411              */
1412             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1413             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1414             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1415             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1416             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1417             j_coord_offsetA  = DIM*jnrA;
1418             j_coord_offsetB  = DIM*jnrB;
1419             j_coord_offsetC  = DIM*jnrC;
1420             j_coord_offsetD  = DIM*jnrD;
1421
1422             /* load j atom coordinates */
1423             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1424                                               x+j_coord_offsetC,x+j_coord_offsetD,
1425                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1426                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1427
1428             /* Calculate displacement vector */
1429             dx00             = _mm_sub_ps(ix0,jx0);
1430             dy00             = _mm_sub_ps(iy0,jy0);
1431             dz00             = _mm_sub_ps(iz0,jz0);
1432             dx11             = _mm_sub_ps(ix1,jx1);
1433             dy11             = _mm_sub_ps(iy1,jy1);
1434             dz11             = _mm_sub_ps(iz1,jz1);
1435             dx12             = _mm_sub_ps(ix1,jx2);
1436             dy12             = _mm_sub_ps(iy1,jy2);
1437             dz12             = _mm_sub_ps(iz1,jz2);
1438             dx13             = _mm_sub_ps(ix1,jx3);
1439             dy13             = _mm_sub_ps(iy1,jy3);
1440             dz13             = _mm_sub_ps(iz1,jz3);
1441             dx21             = _mm_sub_ps(ix2,jx1);
1442             dy21             = _mm_sub_ps(iy2,jy1);
1443             dz21             = _mm_sub_ps(iz2,jz1);
1444             dx22             = _mm_sub_ps(ix2,jx2);
1445             dy22             = _mm_sub_ps(iy2,jy2);
1446             dz22             = _mm_sub_ps(iz2,jz2);
1447             dx23             = _mm_sub_ps(ix2,jx3);
1448             dy23             = _mm_sub_ps(iy2,jy3);
1449             dz23             = _mm_sub_ps(iz2,jz3);
1450             dx31             = _mm_sub_ps(ix3,jx1);
1451             dy31             = _mm_sub_ps(iy3,jy1);
1452             dz31             = _mm_sub_ps(iz3,jz1);
1453             dx32             = _mm_sub_ps(ix3,jx2);
1454             dy32             = _mm_sub_ps(iy3,jy2);
1455             dz32             = _mm_sub_ps(iz3,jz2);
1456             dx33             = _mm_sub_ps(ix3,jx3);
1457             dy33             = _mm_sub_ps(iy3,jy3);
1458             dz33             = _mm_sub_ps(iz3,jz3);
1459
1460             /* Calculate squared distance and things based on it */
1461             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1462             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1463             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1464             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1465             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1466             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1467             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1468             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1469             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1470             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1471
1472             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1473             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1474             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1475             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1476             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1477             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1478             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1479             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1480             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1481
1482             rinvsq00         = gmx_mm_inv_ps(rsq00);
1483             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1484             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1485             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1486             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1487             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1488             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1489             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1490             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1491             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1492
1493             fjx0             = _mm_setzero_ps();
1494             fjy0             = _mm_setzero_ps();
1495             fjz0             = _mm_setzero_ps();
1496             fjx1             = _mm_setzero_ps();
1497             fjy1             = _mm_setzero_ps();
1498             fjz1             = _mm_setzero_ps();
1499             fjx2             = _mm_setzero_ps();
1500             fjy2             = _mm_setzero_ps();
1501             fjz2             = _mm_setzero_ps();
1502             fjx3             = _mm_setzero_ps();
1503             fjy3             = _mm_setzero_ps();
1504             fjz3             = _mm_setzero_ps();
1505
1506             /**************************
1507              * CALCULATE INTERACTIONS *
1508              **************************/
1509
1510             /* LENNARD-JONES DISPERSION/REPULSION */
1511
1512             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1513             fvdw             = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1514
1515             fscal            = fvdw;
1516
1517             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1518
1519              /* Update vectorial force */
1520             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1521             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1522             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1523
1524             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1525             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1526             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1527
1528             /**************************
1529              * CALCULATE INTERACTIONS *
1530              **************************/
1531
1532             /* COULOMB ELECTROSTATICS */
1533             velec            = _mm_mul_ps(qq11,rinv11);
1534             felec            = _mm_mul_ps(velec,rinvsq11);
1535
1536             fscal            = felec;
1537
1538             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1539
1540              /* Update vectorial force */
1541             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1542             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1543             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1544
1545             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1546             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1547             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1548
1549             /**************************
1550              * CALCULATE INTERACTIONS *
1551              **************************/
1552
1553             /* COULOMB ELECTROSTATICS */
1554             velec            = _mm_mul_ps(qq12,rinv12);
1555             felec            = _mm_mul_ps(velec,rinvsq12);
1556
1557             fscal            = felec;
1558
1559             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1560
1561              /* Update vectorial force */
1562             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1563             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1564             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1565
1566             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1567             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1568             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1569
1570             /**************************
1571              * CALCULATE INTERACTIONS *
1572              **************************/
1573
1574             /* COULOMB ELECTROSTATICS */
1575             velec            = _mm_mul_ps(qq13,rinv13);
1576             felec            = _mm_mul_ps(velec,rinvsq13);
1577
1578             fscal            = felec;
1579
1580             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1581
1582              /* Update vectorial force */
1583             fix1             = _mm_macc_ps(dx13,fscal,fix1);
1584             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
1585             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
1586
1587             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
1588             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
1589             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
1590
1591             /**************************
1592              * CALCULATE INTERACTIONS *
1593              **************************/
1594
1595             /* COULOMB ELECTROSTATICS */
1596             velec            = _mm_mul_ps(qq21,rinv21);
1597             felec            = _mm_mul_ps(velec,rinvsq21);
1598
1599             fscal            = felec;
1600
1601             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1602
1603              /* Update vectorial force */
1604             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1605             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1606             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1607
1608             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1609             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1610             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1611
1612             /**************************
1613              * CALCULATE INTERACTIONS *
1614              **************************/
1615
1616             /* COULOMB ELECTROSTATICS */
1617             velec            = _mm_mul_ps(qq22,rinv22);
1618             felec            = _mm_mul_ps(velec,rinvsq22);
1619
1620             fscal            = felec;
1621
1622             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1623
1624              /* Update vectorial force */
1625             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1626             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1627             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1628
1629             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1630             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1631             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1632
1633             /**************************
1634              * CALCULATE INTERACTIONS *
1635              **************************/
1636
1637             /* COULOMB ELECTROSTATICS */
1638             velec            = _mm_mul_ps(qq23,rinv23);
1639             felec            = _mm_mul_ps(velec,rinvsq23);
1640
1641             fscal            = felec;
1642
1643             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1644
1645              /* Update vectorial force */
1646             fix2             = _mm_macc_ps(dx23,fscal,fix2);
1647             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
1648             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
1649
1650             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
1651             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
1652             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
1653
1654             /**************************
1655              * CALCULATE INTERACTIONS *
1656              **************************/
1657
1658             /* COULOMB ELECTROSTATICS */
1659             velec            = _mm_mul_ps(qq31,rinv31);
1660             felec            = _mm_mul_ps(velec,rinvsq31);
1661
1662             fscal            = felec;
1663
1664             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1665
1666              /* Update vectorial force */
1667             fix3             = _mm_macc_ps(dx31,fscal,fix3);
1668             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
1669             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
1670
1671             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
1672             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
1673             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
1674
1675             /**************************
1676              * CALCULATE INTERACTIONS *
1677              **************************/
1678
1679             /* COULOMB ELECTROSTATICS */
1680             velec            = _mm_mul_ps(qq32,rinv32);
1681             felec            = _mm_mul_ps(velec,rinvsq32);
1682
1683             fscal            = felec;
1684
1685             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1686
1687              /* Update vectorial force */
1688             fix3             = _mm_macc_ps(dx32,fscal,fix3);
1689             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
1690             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
1691
1692             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
1693             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
1694             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
1695
1696             /**************************
1697              * CALCULATE INTERACTIONS *
1698              **************************/
1699
1700             /* COULOMB ELECTROSTATICS */
1701             velec            = _mm_mul_ps(qq33,rinv33);
1702             felec            = _mm_mul_ps(velec,rinvsq33);
1703
1704             fscal            = felec;
1705
1706             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1707
1708              /* Update vectorial force */
1709             fix3             = _mm_macc_ps(dx33,fscal,fix3);
1710             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
1711             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
1712
1713             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
1714             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
1715             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
1716
1717             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1718             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1719             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1720             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1721
1722             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1723                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1724                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1725
1726             /* Inner loop uses 303 flops */
1727         }
1728
1729         /* End of innermost loop */
1730
1731         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1732                                               f+i_coord_offset,fshift+i_shift_offset);
1733
1734         /* Increment number of inner iterations */
1735         inneriter                  += j_index_end - j_index_start;
1736
1737         /* Outer loop uses 24 flops */
1738     }
1739
1740     /* Increment number of outer iterations */
1741     outeriter        += nri;
1742
1743     /* Update outer/inner flops */
1744
1745     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*303);
1746 }