Remove all unnecessary HAVE_CONFIG_H
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_single / nb_kernel_ElecCoul_VdwLJ_GeomW3W3_avx_128_fma_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "nrnb.h"
46
47 #include "gromacs/simd/math_x86_avx_128_fma_single.h"
48 #include "kernelutil_x86_avx_128_fma_single.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_avx_128_fma_single
52  * Electrostatics interaction: Coulomb
53  * VdW interaction:            LennardJones
54  * Geometry:                   Water3-Water3
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_avx_128_fma_single
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81     real             scratch[4*DIM];
82     __m128           fscal,rcutoff,rcutoff2,jidxall;
83     int              vdwioffset0;
84     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85     int              vdwioffset1;
86     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87     int              vdwioffset2;
88     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
90     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
92     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
94     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
97     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
98     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
99     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
102     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
105     real             *charge;
106     int              nvdwtype;
107     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
108     int              *vdwtype;
109     real             *vdwparam;
110     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
111     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
112     __m128           dummy_mask,cutoff_mask;
113     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
114     __m128           one     = _mm_set1_ps(1.0);
115     __m128           two     = _mm_set1_ps(2.0);
116     x                = xx[0];
117     f                = ff[0];
118
119     nri              = nlist->nri;
120     iinr             = nlist->iinr;
121     jindex           = nlist->jindex;
122     jjnr             = nlist->jjnr;
123     shiftidx         = nlist->shift;
124     gid              = nlist->gid;
125     shiftvec         = fr->shift_vec[0];
126     fshift           = fr->fshift[0];
127     facel            = _mm_set1_ps(fr->epsfac);
128     charge           = mdatoms->chargeA;
129     nvdwtype         = fr->ntype;
130     vdwparam         = fr->nbfp;
131     vdwtype          = mdatoms->typeA;
132
133     /* Setup water-specific parameters */
134     inr              = nlist->iinr[0];
135     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
136     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
137     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
138     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
139
140     jq0              = _mm_set1_ps(charge[inr+0]);
141     jq1              = _mm_set1_ps(charge[inr+1]);
142     jq2              = _mm_set1_ps(charge[inr+2]);
143     vdwjidx0A        = 2*vdwtype[inr+0];
144     qq00             = _mm_mul_ps(iq0,jq0);
145     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
146     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
147     qq01             = _mm_mul_ps(iq0,jq1);
148     qq02             = _mm_mul_ps(iq0,jq2);
149     qq10             = _mm_mul_ps(iq1,jq0);
150     qq11             = _mm_mul_ps(iq1,jq1);
151     qq12             = _mm_mul_ps(iq1,jq2);
152     qq20             = _mm_mul_ps(iq2,jq0);
153     qq21             = _mm_mul_ps(iq2,jq1);
154     qq22             = _mm_mul_ps(iq2,jq2);
155
156     /* Avoid stupid compiler warnings */
157     jnrA = jnrB = jnrC = jnrD = 0;
158     j_coord_offsetA = 0;
159     j_coord_offsetB = 0;
160     j_coord_offsetC = 0;
161     j_coord_offsetD = 0;
162
163     outeriter        = 0;
164     inneriter        = 0;
165
166     for(iidx=0;iidx<4*DIM;iidx++)
167     {
168         scratch[iidx] = 0.0;
169     }
170
171     /* Start outer loop over neighborlists */
172     for(iidx=0; iidx<nri; iidx++)
173     {
174         /* Load shift vector for this list */
175         i_shift_offset   = DIM*shiftidx[iidx];
176
177         /* Load limits for loop over neighbors */
178         j_index_start    = jindex[iidx];
179         j_index_end      = jindex[iidx+1];
180
181         /* Get outer coordinate index */
182         inr              = iinr[iidx];
183         i_coord_offset   = DIM*inr;
184
185         /* Load i particle coords and add shift vector */
186         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
187                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
188
189         fix0             = _mm_setzero_ps();
190         fiy0             = _mm_setzero_ps();
191         fiz0             = _mm_setzero_ps();
192         fix1             = _mm_setzero_ps();
193         fiy1             = _mm_setzero_ps();
194         fiz1             = _mm_setzero_ps();
195         fix2             = _mm_setzero_ps();
196         fiy2             = _mm_setzero_ps();
197         fiz2             = _mm_setzero_ps();
198
199         /* Reset potential sums */
200         velecsum         = _mm_setzero_ps();
201         vvdwsum          = _mm_setzero_ps();
202
203         /* Start inner kernel loop */
204         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
205         {
206
207             /* Get j neighbor index, and coordinate index */
208             jnrA             = jjnr[jidx];
209             jnrB             = jjnr[jidx+1];
210             jnrC             = jjnr[jidx+2];
211             jnrD             = jjnr[jidx+3];
212             j_coord_offsetA  = DIM*jnrA;
213             j_coord_offsetB  = DIM*jnrB;
214             j_coord_offsetC  = DIM*jnrC;
215             j_coord_offsetD  = DIM*jnrD;
216
217             /* load j atom coordinates */
218             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
219                                               x+j_coord_offsetC,x+j_coord_offsetD,
220                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
221
222             /* Calculate displacement vector */
223             dx00             = _mm_sub_ps(ix0,jx0);
224             dy00             = _mm_sub_ps(iy0,jy0);
225             dz00             = _mm_sub_ps(iz0,jz0);
226             dx01             = _mm_sub_ps(ix0,jx1);
227             dy01             = _mm_sub_ps(iy0,jy1);
228             dz01             = _mm_sub_ps(iz0,jz1);
229             dx02             = _mm_sub_ps(ix0,jx2);
230             dy02             = _mm_sub_ps(iy0,jy2);
231             dz02             = _mm_sub_ps(iz0,jz2);
232             dx10             = _mm_sub_ps(ix1,jx0);
233             dy10             = _mm_sub_ps(iy1,jy0);
234             dz10             = _mm_sub_ps(iz1,jz0);
235             dx11             = _mm_sub_ps(ix1,jx1);
236             dy11             = _mm_sub_ps(iy1,jy1);
237             dz11             = _mm_sub_ps(iz1,jz1);
238             dx12             = _mm_sub_ps(ix1,jx2);
239             dy12             = _mm_sub_ps(iy1,jy2);
240             dz12             = _mm_sub_ps(iz1,jz2);
241             dx20             = _mm_sub_ps(ix2,jx0);
242             dy20             = _mm_sub_ps(iy2,jy0);
243             dz20             = _mm_sub_ps(iz2,jz0);
244             dx21             = _mm_sub_ps(ix2,jx1);
245             dy21             = _mm_sub_ps(iy2,jy1);
246             dz21             = _mm_sub_ps(iz2,jz1);
247             dx22             = _mm_sub_ps(ix2,jx2);
248             dy22             = _mm_sub_ps(iy2,jy2);
249             dz22             = _mm_sub_ps(iz2,jz2);
250
251             /* Calculate squared distance and things based on it */
252             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
253             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
254             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
255             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
256             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
257             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
258             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
259             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
260             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
261
262             rinv00           = gmx_mm_invsqrt_ps(rsq00);
263             rinv01           = gmx_mm_invsqrt_ps(rsq01);
264             rinv02           = gmx_mm_invsqrt_ps(rsq02);
265             rinv10           = gmx_mm_invsqrt_ps(rsq10);
266             rinv11           = gmx_mm_invsqrt_ps(rsq11);
267             rinv12           = gmx_mm_invsqrt_ps(rsq12);
268             rinv20           = gmx_mm_invsqrt_ps(rsq20);
269             rinv21           = gmx_mm_invsqrt_ps(rsq21);
270             rinv22           = gmx_mm_invsqrt_ps(rsq22);
271
272             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
273             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
274             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
275             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
276             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
277             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
278             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
279             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
280             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
281
282             fjx0             = _mm_setzero_ps();
283             fjy0             = _mm_setzero_ps();
284             fjz0             = _mm_setzero_ps();
285             fjx1             = _mm_setzero_ps();
286             fjy1             = _mm_setzero_ps();
287             fjz1             = _mm_setzero_ps();
288             fjx2             = _mm_setzero_ps();
289             fjy2             = _mm_setzero_ps();
290             fjz2             = _mm_setzero_ps();
291
292             /**************************
293              * CALCULATE INTERACTIONS *
294              **************************/
295
296             /* COULOMB ELECTROSTATICS */
297             velec            = _mm_mul_ps(qq00,rinv00);
298             felec            = _mm_mul_ps(velec,rinvsq00);
299
300             /* LENNARD-JONES DISPERSION/REPULSION */
301
302             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
303             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
304             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
305             vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
306             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
307
308             /* Update potential sum for this i atom from the interaction with this j atom. */
309             velecsum         = _mm_add_ps(velecsum,velec);
310             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
311
312             fscal            = _mm_add_ps(felec,fvdw);
313
314              /* Update vectorial force */
315             fix0             = _mm_macc_ps(dx00,fscal,fix0);
316             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
317             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
318
319             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
320             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
321             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
322
323             /**************************
324              * CALCULATE INTERACTIONS *
325              **************************/
326
327             /* COULOMB ELECTROSTATICS */
328             velec            = _mm_mul_ps(qq01,rinv01);
329             felec            = _mm_mul_ps(velec,rinvsq01);
330
331             /* Update potential sum for this i atom from the interaction with this j atom. */
332             velecsum         = _mm_add_ps(velecsum,velec);
333
334             fscal            = felec;
335
336              /* Update vectorial force */
337             fix0             = _mm_macc_ps(dx01,fscal,fix0);
338             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
339             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
340
341             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
342             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
343             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
344
345             /**************************
346              * CALCULATE INTERACTIONS *
347              **************************/
348
349             /* COULOMB ELECTROSTATICS */
350             velec            = _mm_mul_ps(qq02,rinv02);
351             felec            = _mm_mul_ps(velec,rinvsq02);
352
353             /* Update potential sum for this i atom from the interaction with this j atom. */
354             velecsum         = _mm_add_ps(velecsum,velec);
355
356             fscal            = felec;
357
358              /* Update vectorial force */
359             fix0             = _mm_macc_ps(dx02,fscal,fix0);
360             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
361             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
362
363             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
364             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
365             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
366
367             /**************************
368              * CALCULATE INTERACTIONS *
369              **************************/
370
371             /* COULOMB ELECTROSTATICS */
372             velec            = _mm_mul_ps(qq10,rinv10);
373             felec            = _mm_mul_ps(velec,rinvsq10);
374
375             /* Update potential sum for this i atom from the interaction with this j atom. */
376             velecsum         = _mm_add_ps(velecsum,velec);
377
378             fscal            = felec;
379
380              /* Update vectorial force */
381             fix1             = _mm_macc_ps(dx10,fscal,fix1);
382             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
383             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
384
385             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
386             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
387             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
388
389             /**************************
390              * CALCULATE INTERACTIONS *
391              **************************/
392
393             /* COULOMB ELECTROSTATICS */
394             velec            = _mm_mul_ps(qq11,rinv11);
395             felec            = _mm_mul_ps(velec,rinvsq11);
396
397             /* Update potential sum for this i atom from the interaction with this j atom. */
398             velecsum         = _mm_add_ps(velecsum,velec);
399
400             fscal            = felec;
401
402              /* Update vectorial force */
403             fix1             = _mm_macc_ps(dx11,fscal,fix1);
404             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
405             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
406
407             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
408             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
409             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
410
411             /**************************
412              * CALCULATE INTERACTIONS *
413              **************************/
414
415             /* COULOMB ELECTROSTATICS */
416             velec            = _mm_mul_ps(qq12,rinv12);
417             felec            = _mm_mul_ps(velec,rinvsq12);
418
419             /* Update potential sum for this i atom from the interaction with this j atom. */
420             velecsum         = _mm_add_ps(velecsum,velec);
421
422             fscal            = felec;
423
424              /* Update vectorial force */
425             fix1             = _mm_macc_ps(dx12,fscal,fix1);
426             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
427             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
428
429             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
430             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
431             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
432
433             /**************************
434              * CALCULATE INTERACTIONS *
435              **************************/
436
437             /* COULOMB ELECTROSTATICS */
438             velec            = _mm_mul_ps(qq20,rinv20);
439             felec            = _mm_mul_ps(velec,rinvsq20);
440
441             /* Update potential sum for this i atom from the interaction with this j atom. */
442             velecsum         = _mm_add_ps(velecsum,velec);
443
444             fscal            = felec;
445
446              /* Update vectorial force */
447             fix2             = _mm_macc_ps(dx20,fscal,fix2);
448             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
449             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
450
451             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
452             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
453             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
454
455             /**************************
456              * CALCULATE INTERACTIONS *
457              **************************/
458
459             /* COULOMB ELECTROSTATICS */
460             velec            = _mm_mul_ps(qq21,rinv21);
461             felec            = _mm_mul_ps(velec,rinvsq21);
462
463             /* Update potential sum for this i atom from the interaction with this j atom. */
464             velecsum         = _mm_add_ps(velecsum,velec);
465
466             fscal            = felec;
467
468              /* Update vectorial force */
469             fix2             = _mm_macc_ps(dx21,fscal,fix2);
470             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
471             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
472
473             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
474             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
475             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
476
477             /**************************
478              * CALCULATE INTERACTIONS *
479              **************************/
480
481             /* COULOMB ELECTROSTATICS */
482             velec            = _mm_mul_ps(qq22,rinv22);
483             felec            = _mm_mul_ps(velec,rinvsq22);
484
485             /* Update potential sum for this i atom from the interaction with this j atom. */
486             velecsum         = _mm_add_ps(velecsum,velec);
487
488             fscal            = felec;
489
490              /* Update vectorial force */
491             fix2             = _mm_macc_ps(dx22,fscal,fix2);
492             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
493             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
494
495             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
496             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
497             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
498
499             fjptrA             = f+j_coord_offsetA;
500             fjptrB             = f+j_coord_offsetB;
501             fjptrC             = f+j_coord_offsetC;
502             fjptrD             = f+j_coord_offsetD;
503
504             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
505                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
506
507             /* Inner loop uses 291 flops */
508         }
509
510         if(jidx<j_index_end)
511         {
512
513             /* Get j neighbor index, and coordinate index */
514             jnrlistA         = jjnr[jidx];
515             jnrlistB         = jjnr[jidx+1];
516             jnrlistC         = jjnr[jidx+2];
517             jnrlistD         = jjnr[jidx+3];
518             /* Sign of each element will be negative for non-real atoms.
519              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
520              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
521              */
522             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
523             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
524             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
525             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
526             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
527             j_coord_offsetA  = DIM*jnrA;
528             j_coord_offsetB  = DIM*jnrB;
529             j_coord_offsetC  = DIM*jnrC;
530             j_coord_offsetD  = DIM*jnrD;
531
532             /* load j atom coordinates */
533             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
534                                               x+j_coord_offsetC,x+j_coord_offsetD,
535                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
536
537             /* Calculate displacement vector */
538             dx00             = _mm_sub_ps(ix0,jx0);
539             dy00             = _mm_sub_ps(iy0,jy0);
540             dz00             = _mm_sub_ps(iz0,jz0);
541             dx01             = _mm_sub_ps(ix0,jx1);
542             dy01             = _mm_sub_ps(iy0,jy1);
543             dz01             = _mm_sub_ps(iz0,jz1);
544             dx02             = _mm_sub_ps(ix0,jx2);
545             dy02             = _mm_sub_ps(iy0,jy2);
546             dz02             = _mm_sub_ps(iz0,jz2);
547             dx10             = _mm_sub_ps(ix1,jx0);
548             dy10             = _mm_sub_ps(iy1,jy0);
549             dz10             = _mm_sub_ps(iz1,jz0);
550             dx11             = _mm_sub_ps(ix1,jx1);
551             dy11             = _mm_sub_ps(iy1,jy1);
552             dz11             = _mm_sub_ps(iz1,jz1);
553             dx12             = _mm_sub_ps(ix1,jx2);
554             dy12             = _mm_sub_ps(iy1,jy2);
555             dz12             = _mm_sub_ps(iz1,jz2);
556             dx20             = _mm_sub_ps(ix2,jx0);
557             dy20             = _mm_sub_ps(iy2,jy0);
558             dz20             = _mm_sub_ps(iz2,jz0);
559             dx21             = _mm_sub_ps(ix2,jx1);
560             dy21             = _mm_sub_ps(iy2,jy1);
561             dz21             = _mm_sub_ps(iz2,jz1);
562             dx22             = _mm_sub_ps(ix2,jx2);
563             dy22             = _mm_sub_ps(iy2,jy2);
564             dz22             = _mm_sub_ps(iz2,jz2);
565
566             /* Calculate squared distance and things based on it */
567             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
568             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
569             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
570             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
571             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
572             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
573             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
574             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
575             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
576
577             rinv00           = gmx_mm_invsqrt_ps(rsq00);
578             rinv01           = gmx_mm_invsqrt_ps(rsq01);
579             rinv02           = gmx_mm_invsqrt_ps(rsq02);
580             rinv10           = gmx_mm_invsqrt_ps(rsq10);
581             rinv11           = gmx_mm_invsqrt_ps(rsq11);
582             rinv12           = gmx_mm_invsqrt_ps(rsq12);
583             rinv20           = gmx_mm_invsqrt_ps(rsq20);
584             rinv21           = gmx_mm_invsqrt_ps(rsq21);
585             rinv22           = gmx_mm_invsqrt_ps(rsq22);
586
587             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
588             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
589             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
590             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
591             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
592             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
593             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
594             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
595             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
596
597             fjx0             = _mm_setzero_ps();
598             fjy0             = _mm_setzero_ps();
599             fjz0             = _mm_setzero_ps();
600             fjx1             = _mm_setzero_ps();
601             fjy1             = _mm_setzero_ps();
602             fjz1             = _mm_setzero_ps();
603             fjx2             = _mm_setzero_ps();
604             fjy2             = _mm_setzero_ps();
605             fjz2             = _mm_setzero_ps();
606
607             /**************************
608              * CALCULATE INTERACTIONS *
609              **************************/
610
611             /* COULOMB ELECTROSTATICS */
612             velec            = _mm_mul_ps(qq00,rinv00);
613             felec            = _mm_mul_ps(velec,rinvsq00);
614
615             /* LENNARD-JONES DISPERSION/REPULSION */
616
617             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
618             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
619             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
620             vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
621             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
622
623             /* Update potential sum for this i atom from the interaction with this j atom. */
624             velec            = _mm_andnot_ps(dummy_mask,velec);
625             velecsum         = _mm_add_ps(velecsum,velec);
626             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
627             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
628
629             fscal            = _mm_add_ps(felec,fvdw);
630
631             fscal            = _mm_andnot_ps(dummy_mask,fscal);
632
633              /* Update vectorial force */
634             fix0             = _mm_macc_ps(dx00,fscal,fix0);
635             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
636             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
637
638             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
639             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
640             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
641
642             /**************************
643              * CALCULATE INTERACTIONS *
644              **************************/
645
646             /* COULOMB ELECTROSTATICS */
647             velec            = _mm_mul_ps(qq01,rinv01);
648             felec            = _mm_mul_ps(velec,rinvsq01);
649
650             /* Update potential sum for this i atom from the interaction with this j atom. */
651             velec            = _mm_andnot_ps(dummy_mask,velec);
652             velecsum         = _mm_add_ps(velecsum,velec);
653
654             fscal            = felec;
655
656             fscal            = _mm_andnot_ps(dummy_mask,fscal);
657
658              /* Update vectorial force */
659             fix0             = _mm_macc_ps(dx01,fscal,fix0);
660             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
661             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
662
663             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
664             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
665             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
666
667             /**************************
668              * CALCULATE INTERACTIONS *
669              **************************/
670
671             /* COULOMB ELECTROSTATICS */
672             velec            = _mm_mul_ps(qq02,rinv02);
673             felec            = _mm_mul_ps(velec,rinvsq02);
674
675             /* Update potential sum for this i atom from the interaction with this j atom. */
676             velec            = _mm_andnot_ps(dummy_mask,velec);
677             velecsum         = _mm_add_ps(velecsum,velec);
678
679             fscal            = felec;
680
681             fscal            = _mm_andnot_ps(dummy_mask,fscal);
682
683              /* Update vectorial force */
684             fix0             = _mm_macc_ps(dx02,fscal,fix0);
685             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
686             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
687
688             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
689             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
690             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
691
692             /**************************
693              * CALCULATE INTERACTIONS *
694              **************************/
695
696             /* COULOMB ELECTROSTATICS */
697             velec            = _mm_mul_ps(qq10,rinv10);
698             felec            = _mm_mul_ps(velec,rinvsq10);
699
700             /* Update potential sum for this i atom from the interaction with this j atom. */
701             velec            = _mm_andnot_ps(dummy_mask,velec);
702             velecsum         = _mm_add_ps(velecsum,velec);
703
704             fscal            = felec;
705
706             fscal            = _mm_andnot_ps(dummy_mask,fscal);
707
708              /* Update vectorial force */
709             fix1             = _mm_macc_ps(dx10,fscal,fix1);
710             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
711             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
712
713             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
714             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
715             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
716
717             /**************************
718              * CALCULATE INTERACTIONS *
719              **************************/
720
721             /* COULOMB ELECTROSTATICS */
722             velec            = _mm_mul_ps(qq11,rinv11);
723             felec            = _mm_mul_ps(velec,rinvsq11);
724
725             /* Update potential sum for this i atom from the interaction with this j atom. */
726             velec            = _mm_andnot_ps(dummy_mask,velec);
727             velecsum         = _mm_add_ps(velecsum,velec);
728
729             fscal            = felec;
730
731             fscal            = _mm_andnot_ps(dummy_mask,fscal);
732
733              /* Update vectorial force */
734             fix1             = _mm_macc_ps(dx11,fscal,fix1);
735             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
736             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
737
738             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
739             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
740             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
741
742             /**************************
743              * CALCULATE INTERACTIONS *
744              **************************/
745
746             /* COULOMB ELECTROSTATICS */
747             velec            = _mm_mul_ps(qq12,rinv12);
748             felec            = _mm_mul_ps(velec,rinvsq12);
749
750             /* Update potential sum for this i atom from the interaction with this j atom. */
751             velec            = _mm_andnot_ps(dummy_mask,velec);
752             velecsum         = _mm_add_ps(velecsum,velec);
753
754             fscal            = felec;
755
756             fscal            = _mm_andnot_ps(dummy_mask,fscal);
757
758              /* Update vectorial force */
759             fix1             = _mm_macc_ps(dx12,fscal,fix1);
760             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
761             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
762
763             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
764             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
765             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
766
767             /**************************
768              * CALCULATE INTERACTIONS *
769              **************************/
770
771             /* COULOMB ELECTROSTATICS */
772             velec            = _mm_mul_ps(qq20,rinv20);
773             felec            = _mm_mul_ps(velec,rinvsq20);
774
775             /* Update potential sum for this i atom from the interaction with this j atom. */
776             velec            = _mm_andnot_ps(dummy_mask,velec);
777             velecsum         = _mm_add_ps(velecsum,velec);
778
779             fscal            = felec;
780
781             fscal            = _mm_andnot_ps(dummy_mask,fscal);
782
783              /* Update vectorial force */
784             fix2             = _mm_macc_ps(dx20,fscal,fix2);
785             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
786             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
787
788             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
789             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
790             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
791
792             /**************************
793              * CALCULATE INTERACTIONS *
794              **************************/
795
796             /* COULOMB ELECTROSTATICS */
797             velec            = _mm_mul_ps(qq21,rinv21);
798             felec            = _mm_mul_ps(velec,rinvsq21);
799
800             /* Update potential sum for this i atom from the interaction with this j atom. */
801             velec            = _mm_andnot_ps(dummy_mask,velec);
802             velecsum         = _mm_add_ps(velecsum,velec);
803
804             fscal            = felec;
805
806             fscal            = _mm_andnot_ps(dummy_mask,fscal);
807
808              /* Update vectorial force */
809             fix2             = _mm_macc_ps(dx21,fscal,fix2);
810             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
811             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
812
813             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
814             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
815             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
816
817             /**************************
818              * CALCULATE INTERACTIONS *
819              **************************/
820
821             /* COULOMB ELECTROSTATICS */
822             velec            = _mm_mul_ps(qq22,rinv22);
823             felec            = _mm_mul_ps(velec,rinvsq22);
824
825             /* Update potential sum for this i atom from the interaction with this j atom. */
826             velec            = _mm_andnot_ps(dummy_mask,velec);
827             velecsum         = _mm_add_ps(velecsum,velec);
828
829             fscal            = felec;
830
831             fscal            = _mm_andnot_ps(dummy_mask,fscal);
832
833              /* Update vectorial force */
834             fix2             = _mm_macc_ps(dx22,fscal,fix2);
835             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
836             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
837
838             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
839             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
840             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
841
842             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
843             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
844             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
845             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
846
847             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
848                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
849
850             /* Inner loop uses 291 flops */
851         }
852
853         /* End of innermost loop */
854
855         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
856                                               f+i_coord_offset,fshift+i_shift_offset);
857
858         ggid                        = gid[iidx];
859         /* Update potential energies */
860         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
861         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
862
863         /* Increment number of inner iterations */
864         inneriter                  += j_index_end - j_index_start;
865
866         /* Outer loop uses 20 flops */
867     }
868
869     /* Increment number of outer iterations */
870     outeriter        += nri;
871
872     /* Update outer/inner flops */
873
874     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*291);
875 }
876 /*
877  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_avx_128_fma_single
878  * Electrostatics interaction: Coulomb
879  * VdW interaction:            LennardJones
880  * Geometry:                   Water3-Water3
881  * Calculate force/pot:        Force
882  */
883 void
884 nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_avx_128_fma_single
885                     (t_nblist                    * gmx_restrict       nlist,
886                      rvec                        * gmx_restrict          xx,
887                      rvec                        * gmx_restrict          ff,
888                      t_forcerec                  * gmx_restrict          fr,
889                      t_mdatoms                   * gmx_restrict     mdatoms,
890                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
891                      t_nrnb                      * gmx_restrict        nrnb)
892 {
893     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
894      * just 0 for non-waters.
895      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
896      * jnr indices corresponding to data put in the four positions in the SIMD register.
897      */
898     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
899     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
900     int              jnrA,jnrB,jnrC,jnrD;
901     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
902     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
903     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
904     real             rcutoff_scalar;
905     real             *shiftvec,*fshift,*x,*f;
906     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
907     real             scratch[4*DIM];
908     __m128           fscal,rcutoff,rcutoff2,jidxall;
909     int              vdwioffset0;
910     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
911     int              vdwioffset1;
912     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
913     int              vdwioffset2;
914     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
915     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
916     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
917     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
918     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
919     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
920     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
921     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
922     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
923     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
924     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
925     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
926     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
927     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
928     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
929     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
930     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
931     real             *charge;
932     int              nvdwtype;
933     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
934     int              *vdwtype;
935     real             *vdwparam;
936     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
937     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
938     __m128           dummy_mask,cutoff_mask;
939     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
940     __m128           one     = _mm_set1_ps(1.0);
941     __m128           two     = _mm_set1_ps(2.0);
942     x                = xx[0];
943     f                = ff[0];
944
945     nri              = nlist->nri;
946     iinr             = nlist->iinr;
947     jindex           = nlist->jindex;
948     jjnr             = nlist->jjnr;
949     shiftidx         = nlist->shift;
950     gid              = nlist->gid;
951     shiftvec         = fr->shift_vec[0];
952     fshift           = fr->fshift[0];
953     facel            = _mm_set1_ps(fr->epsfac);
954     charge           = mdatoms->chargeA;
955     nvdwtype         = fr->ntype;
956     vdwparam         = fr->nbfp;
957     vdwtype          = mdatoms->typeA;
958
959     /* Setup water-specific parameters */
960     inr              = nlist->iinr[0];
961     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
962     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
963     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
964     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
965
966     jq0              = _mm_set1_ps(charge[inr+0]);
967     jq1              = _mm_set1_ps(charge[inr+1]);
968     jq2              = _mm_set1_ps(charge[inr+2]);
969     vdwjidx0A        = 2*vdwtype[inr+0];
970     qq00             = _mm_mul_ps(iq0,jq0);
971     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
972     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
973     qq01             = _mm_mul_ps(iq0,jq1);
974     qq02             = _mm_mul_ps(iq0,jq2);
975     qq10             = _mm_mul_ps(iq1,jq0);
976     qq11             = _mm_mul_ps(iq1,jq1);
977     qq12             = _mm_mul_ps(iq1,jq2);
978     qq20             = _mm_mul_ps(iq2,jq0);
979     qq21             = _mm_mul_ps(iq2,jq1);
980     qq22             = _mm_mul_ps(iq2,jq2);
981
982     /* Avoid stupid compiler warnings */
983     jnrA = jnrB = jnrC = jnrD = 0;
984     j_coord_offsetA = 0;
985     j_coord_offsetB = 0;
986     j_coord_offsetC = 0;
987     j_coord_offsetD = 0;
988
989     outeriter        = 0;
990     inneriter        = 0;
991
992     for(iidx=0;iidx<4*DIM;iidx++)
993     {
994         scratch[iidx] = 0.0;
995     }
996
997     /* Start outer loop over neighborlists */
998     for(iidx=0; iidx<nri; iidx++)
999     {
1000         /* Load shift vector for this list */
1001         i_shift_offset   = DIM*shiftidx[iidx];
1002
1003         /* Load limits for loop over neighbors */
1004         j_index_start    = jindex[iidx];
1005         j_index_end      = jindex[iidx+1];
1006
1007         /* Get outer coordinate index */
1008         inr              = iinr[iidx];
1009         i_coord_offset   = DIM*inr;
1010
1011         /* Load i particle coords and add shift vector */
1012         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1013                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1014
1015         fix0             = _mm_setzero_ps();
1016         fiy0             = _mm_setzero_ps();
1017         fiz0             = _mm_setzero_ps();
1018         fix1             = _mm_setzero_ps();
1019         fiy1             = _mm_setzero_ps();
1020         fiz1             = _mm_setzero_ps();
1021         fix2             = _mm_setzero_ps();
1022         fiy2             = _mm_setzero_ps();
1023         fiz2             = _mm_setzero_ps();
1024
1025         /* Start inner kernel loop */
1026         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1027         {
1028
1029             /* Get j neighbor index, and coordinate index */
1030             jnrA             = jjnr[jidx];
1031             jnrB             = jjnr[jidx+1];
1032             jnrC             = jjnr[jidx+2];
1033             jnrD             = jjnr[jidx+3];
1034             j_coord_offsetA  = DIM*jnrA;
1035             j_coord_offsetB  = DIM*jnrB;
1036             j_coord_offsetC  = DIM*jnrC;
1037             j_coord_offsetD  = DIM*jnrD;
1038
1039             /* load j atom coordinates */
1040             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1041                                               x+j_coord_offsetC,x+j_coord_offsetD,
1042                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1043
1044             /* Calculate displacement vector */
1045             dx00             = _mm_sub_ps(ix0,jx0);
1046             dy00             = _mm_sub_ps(iy0,jy0);
1047             dz00             = _mm_sub_ps(iz0,jz0);
1048             dx01             = _mm_sub_ps(ix0,jx1);
1049             dy01             = _mm_sub_ps(iy0,jy1);
1050             dz01             = _mm_sub_ps(iz0,jz1);
1051             dx02             = _mm_sub_ps(ix0,jx2);
1052             dy02             = _mm_sub_ps(iy0,jy2);
1053             dz02             = _mm_sub_ps(iz0,jz2);
1054             dx10             = _mm_sub_ps(ix1,jx0);
1055             dy10             = _mm_sub_ps(iy1,jy0);
1056             dz10             = _mm_sub_ps(iz1,jz0);
1057             dx11             = _mm_sub_ps(ix1,jx1);
1058             dy11             = _mm_sub_ps(iy1,jy1);
1059             dz11             = _mm_sub_ps(iz1,jz1);
1060             dx12             = _mm_sub_ps(ix1,jx2);
1061             dy12             = _mm_sub_ps(iy1,jy2);
1062             dz12             = _mm_sub_ps(iz1,jz2);
1063             dx20             = _mm_sub_ps(ix2,jx0);
1064             dy20             = _mm_sub_ps(iy2,jy0);
1065             dz20             = _mm_sub_ps(iz2,jz0);
1066             dx21             = _mm_sub_ps(ix2,jx1);
1067             dy21             = _mm_sub_ps(iy2,jy1);
1068             dz21             = _mm_sub_ps(iz2,jz1);
1069             dx22             = _mm_sub_ps(ix2,jx2);
1070             dy22             = _mm_sub_ps(iy2,jy2);
1071             dz22             = _mm_sub_ps(iz2,jz2);
1072
1073             /* Calculate squared distance and things based on it */
1074             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1075             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1076             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1077             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1078             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1079             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1080             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1081             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1082             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1083
1084             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1085             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1086             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1087             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1088             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1089             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1090             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1091             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1092             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1093
1094             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1095             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1096             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1097             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1098             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1099             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1100             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1101             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1102             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1103
1104             fjx0             = _mm_setzero_ps();
1105             fjy0             = _mm_setzero_ps();
1106             fjz0             = _mm_setzero_ps();
1107             fjx1             = _mm_setzero_ps();
1108             fjy1             = _mm_setzero_ps();
1109             fjz1             = _mm_setzero_ps();
1110             fjx2             = _mm_setzero_ps();
1111             fjy2             = _mm_setzero_ps();
1112             fjz2             = _mm_setzero_ps();
1113
1114             /**************************
1115              * CALCULATE INTERACTIONS *
1116              **************************/
1117
1118             /* COULOMB ELECTROSTATICS */
1119             velec            = _mm_mul_ps(qq00,rinv00);
1120             felec            = _mm_mul_ps(velec,rinvsq00);
1121
1122             /* LENNARD-JONES DISPERSION/REPULSION */
1123
1124             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1125             fvdw             = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1126
1127             fscal            = _mm_add_ps(felec,fvdw);
1128
1129              /* Update vectorial force */
1130             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1131             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1132             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1133
1134             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1135             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1136             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1137
1138             /**************************
1139              * CALCULATE INTERACTIONS *
1140              **************************/
1141
1142             /* COULOMB ELECTROSTATICS */
1143             velec            = _mm_mul_ps(qq01,rinv01);
1144             felec            = _mm_mul_ps(velec,rinvsq01);
1145
1146             fscal            = felec;
1147
1148              /* Update vectorial force */
1149             fix0             = _mm_macc_ps(dx01,fscal,fix0);
1150             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
1151             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
1152
1153             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
1154             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
1155             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
1156
1157             /**************************
1158              * CALCULATE INTERACTIONS *
1159              **************************/
1160
1161             /* COULOMB ELECTROSTATICS */
1162             velec            = _mm_mul_ps(qq02,rinv02);
1163             felec            = _mm_mul_ps(velec,rinvsq02);
1164
1165             fscal            = felec;
1166
1167              /* Update vectorial force */
1168             fix0             = _mm_macc_ps(dx02,fscal,fix0);
1169             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
1170             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
1171
1172             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
1173             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
1174             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
1175
1176             /**************************
1177              * CALCULATE INTERACTIONS *
1178              **************************/
1179
1180             /* COULOMB ELECTROSTATICS */
1181             velec            = _mm_mul_ps(qq10,rinv10);
1182             felec            = _mm_mul_ps(velec,rinvsq10);
1183
1184             fscal            = felec;
1185
1186              /* Update vectorial force */
1187             fix1             = _mm_macc_ps(dx10,fscal,fix1);
1188             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
1189             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
1190
1191             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
1192             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
1193             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
1194
1195             /**************************
1196              * CALCULATE INTERACTIONS *
1197              **************************/
1198
1199             /* COULOMB ELECTROSTATICS */
1200             velec            = _mm_mul_ps(qq11,rinv11);
1201             felec            = _mm_mul_ps(velec,rinvsq11);
1202
1203             fscal            = felec;
1204
1205              /* Update vectorial force */
1206             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1207             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1208             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1209
1210             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1211             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1212             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1213
1214             /**************************
1215              * CALCULATE INTERACTIONS *
1216              **************************/
1217
1218             /* COULOMB ELECTROSTATICS */
1219             velec            = _mm_mul_ps(qq12,rinv12);
1220             felec            = _mm_mul_ps(velec,rinvsq12);
1221
1222             fscal            = felec;
1223
1224              /* Update vectorial force */
1225             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1226             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1227             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1228
1229             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1230             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1231             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1232
1233             /**************************
1234              * CALCULATE INTERACTIONS *
1235              **************************/
1236
1237             /* COULOMB ELECTROSTATICS */
1238             velec            = _mm_mul_ps(qq20,rinv20);
1239             felec            = _mm_mul_ps(velec,rinvsq20);
1240
1241             fscal            = felec;
1242
1243              /* Update vectorial force */
1244             fix2             = _mm_macc_ps(dx20,fscal,fix2);
1245             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
1246             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
1247
1248             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
1249             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
1250             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
1251
1252             /**************************
1253              * CALCULATE INTERACTIONS *
1254              **************************/
1255
1256             /* COULOMB ELECTROSTATICS */
1257             velec            = _mm_mul_ps(qq21,rinv21);
1258             felec            = _mm_mul_ps(velec,rinvsq21);
1259
1260             fscal            = felec;
1261
1262              /* Update vectorial force */
1263             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1264             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1265             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1266
1267             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1268             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1269             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1270
1271             /**************************
1272              * CALCULATE INTERACTIONS *
1273              **************************/
1274
1275             /* COULOMB ELECTROSTATICS */
1276             velec            = _mm_mul_ps(qq22,rinv22);
1277             felec            = _mm_mul_ps(velec,rinvsq22);
1278
1279             fscal            = felec;
1280
1281              /* Update vectorial force */
1282             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1283             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1284             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1285
1286             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1287             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1288             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1289
1290             fjptrA             = f+j_coord_offsetA;
1291             fjptrB             = f+j_coord_offsetB;
1292             fjptrC             = f+j_coord_offsetC;
1293             fjptrD             = f+j_coord_offsetD;
1294
1295             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1296                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1297
1298             /* Inner loop uses 277 flops */
1299         }
1300
1301         if(jidx<j_index_end)
1302         {
1303
1304             /* Get j neighbor index, and coordinate index */
1305             jnrlistA         = jjnr[jidx];
1306             jnrlistB         = jjnr[jidx+1];
1307             jnrlistC         = jjnr[jidx+2];
1308             jnrlistD         = jjnr[jidx+3];
1309             /* Sign of each element will be negative for non-real atoms.
1310              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1311              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1312              */
1313             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1314             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1315             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1316             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1317             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1318             j_coord_offsetA  = DIM*jnrA;
1319             j_coord_offsetB  = DIM*jnrB;
1320             j_coord_offsetC  = DIM*jnrC;
1321             j_coord_offsetD  = DIM*jnrD;
1322
1323             /* load j atom coordinates */
1324             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1325                                               x+j_coord_offsetC,x+j_coord_offsetD,
1326                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1327
1328             /* Calculate displacement vector */
1329             dx00             = _mm_sub_ps(ix0,jx0);
1330             dy00             = _mm_sub_ps(iy0,jy0);
1331             dz00             = _mm_sub_ps(iz0,jz0);
1332             dx01             = _mm_sub_ps(ix0,jx1);
1333             dy01             = _mm_sub_ps(iy0,jy1);
1334             dz01             = _mm_sub_ps(iz0,jz1);
1335             dx02             = _mm_sub_ps(ix0,jx2);
1336             dy02             = _mm_sub_ps(iy0,jy2);
1337             dz02             = _mm_sub_ps(iz0,jz2);
1338             dx10             = _mm_sub_ps(ix1,jx0);
1339             dy10             = _mm_sub_ps(iy1,jy0);
1340             dz10             = _mm_sub_ps(iz1,jz0);
1341             dx11             = _mm_sub_ps(ix1,jx1);
1342             dy11             = _mm_sub_ps(iy1,jy1);
1343             dz11             = _mm_sub_ps(iz1,jz1);
1344             dx12             = _mm_sub_ps(ix1,jx2);
1345             dy12             = _mm_sub_ps(iy1,jy2);
1346             dz12             = _mm_sub_ps(iz1,jz2);
1347             dx20             = _mm_sub_ps(ix2,jx0);
1348             dy20             = _mm_sub_ps(iy2,jy0);
1349             dz20             = _mm_sub_ps(iz2,jz0);
1350             dx21             = _mm_sub_ps(ix2,jx1);
1351             dy21             = _mm_sub_ps(iy2,jy1);
1352             dz21             = _mm_sub_ps(iz2,jz1);
1353             dx22             = _mm_sub_ps(ix2,jx2);
1354             dy22             = _mm_sub_ps(iy2,jy2);
1355             dz22             = _mm_sub_ps(iz2,jz2);
1356
1357             /* Calculate squared distance and things based on it */
1358             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1359             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1360             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1361             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1362             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1363             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1364             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1365             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1366             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1367
1368             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1369             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1370             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1371             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1372             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1373             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1374             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1375             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1376             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1377
1378             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1379             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1380             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1381             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1382             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1383             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1384             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1385             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1386             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1387
1388             fjx0             = _mm_setzero_ps();
1389             fjy0             = _mm_setzero_ps();
1390             fjz0             = _mm_setzero_ps();
1391             fjx1             = _mm_setzero_ps();
1392             fjy1             = _mm_setzero_ps();
1393             fjz1             = _mm_setzero_ps();
1394             fjx2             = _mm_setzero_ps();
1395             fjy2             = _mm_setzero_ps();
1396             fjz2             = _mm_setzero_ps();
1397
1398             /**************************
1399              * CALCULATE INTERACTIONS *
1400              **************************/
1401
1402             /* COULOMB ELECTROSTATICS */
1403             velec            = _mm_mul_ps(qq00,rinv00);
1404             felec            = _mm_mul_ps(velec,rinvsq00);
1405
1406             /* LENNARD-JONES DISPERSION/REPULSION */
1407
1408             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1409             fvdw             = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1410
1411             fscal            = _mm_add_ps(felec,fvdw);
1412
1413             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1414
1415              /* Update vectorial force */
1416             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1417             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1418             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1419
1420             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1421             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1422             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1423
1424             /**************************
1425              * CALCULATE INTERACTIONS *
1426              **************************/
1427
1428             /* COULOMB ELECTROSTATICS */
1429             velec            = _mm_mul_ps(qq01,rinv01);
1430             felec            = _mm_mul_ps(velec,rinvsq01);
1431
1432             fscal            = felec;
1433
1434             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1435
1436              /* Update vectorial force */
1437             fix0             = _mm_macc_ps(dx01,fscal,fix0);
1438             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
1439             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
1440
1441             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
1442             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
1443             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
1444
1445             /**************************
1446              * CALCULATE INTERACTIONS *
1447              **************************/
1448
1449             /* COULOMB ELECTROSTATICS */
1450             velec            = _mm_mul_ps(qq02,rinv02);
1451             felec            = _mm_mul_ps(velec,rinvsq02);
1452
1453             fscal            = felec;
1454
1455             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1456
1457              /* Update vectorial force */
1458             fix0             = _mm_macc_ps(dx02,fscal,fix0);
1459             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
1460             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
1461
1462             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
1463             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
1464             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
1465
1466             /**************************
1467              * CALCULATE INTERACTIONS *
1468              **************************/
1469
1470             /* COULOMB ELECTROSTATICS */
1471             velec            = _mm_mul_ps(qq10,rinv10);
1472             felec            = _mm_mul_ps(velec,rinvsq10);
1473
1474             fscal            = felec;
1475
1476             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1477
1478              /* Update vectorial force */
1479             fix1             = _mm_macc_ps(dx10,fscal,fix1);
1480             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
1481             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
1482
1483             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
1484             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
1485             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
1486
1487             /**************************
1488              * CALCULATE INTERACTIONS *
1489              **************************/
1490
1491             /* COULOMB ELECTROSTATICS */
1492             velec            = _mm_mul_ps(qq11,rinv11);
1493             felec            = _mm_mul_ps(velec,rinvsq11);
1494
1495             fscal            = felec;
1496
1497             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1498
1499              /* Update vectorial force */
1500             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1501             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1502             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1503
1504             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1505             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1506             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1507
1508             /**************************
1509              * CALCULATE INTERACTIONS *
1510              **************************/
1511
1512             /* COULOMB ELECTROSTATICS */
1513             velec            = _mm_mul_ps(qq12,rinv12);
1514             felec            = _mm_mul_ps(velec,rinvsq12);
1515
1516             fscal            = felec;
1517
1518             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1519
1520              /* Update vectorial force */
1521             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1522             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1523             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1524
1525             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1526             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1527             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1528
1529             /**************************
1530              * CALCULATE INTERACTIONS *
1531              **************************/
1532
1533             /* COULOMB ELECTROSTATICS */
1534             velec            = _mm_mul_ps(qq20,rinv20);
1535             felec            = _mm_mul_ps(velec,rinvsq20);
1536
1537             fscal            = felec;
1538
1539             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1540
1541              /* Update vectorial force */
1542             fix2             = _mm_macc_ps(dx20,fscal,fix2);
1543             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
1544             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
1545
1546             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
1547             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
1548             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
1549
1550             /**************************
1551              * CALCULATE INTERACTIONS *
1552              **************************/
1553
1554             /* COULOMB ELECTROSTATICS */
1555             velec            = _mm_mul_ps(qq21,rinv21);
1556             felec            = _mm_mul_ps(velec,rinvsq21);
1557
1558             fscal            = felec;
1559
1560             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1561
1562              /* Update vectorial force */
1563             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1564             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1565             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1566
1567             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1568             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1569             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1570
1571             /**************************
1572              * CALCULATE INTERACTIONS *
1573              **************************/
1574
1575             /* COULOMB ELECTROSTATICS */
1576             velec            = _mm_mul_ps(qq22,rinv22);
1577             felec            = _mm_mul_ps(velec,rinvsq22);
1578
1579             fscal            = felec;
1580
1581             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1582
1583              /* Update vectorial force */
1584             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1585             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1586             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1587
1588             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1589             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1590             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1591
1592             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1593             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1594             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1595             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1596
1597             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1598                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1599
1600             /* Inner loop uses 277 flops */
1601         }
1602
1603         /* End of innermost loop */
1604
1605         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1606                                               f+i_coord_offset,fshift+i_shift_offset);
1607
1608         /* Increment number of inner iterations */
1609         inneriter                  += j_index_end - j_index_start;
1610
1611         /* Outer loop uses 18 flops */
1612     }
1613
1614     /* Increment number of outer iterations */
1615     outeriter        += nri;
1616
1617     /* Update outer/inner flops */
1618
1619     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*277);
1620 }