Remove all unnecessary HAVE_CONFIG_H
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_single / nb_kernel_ElecEw_VdwCSTab_GeomW3P1_avx_128_fma_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "nrnb.h"
46
47 #include "gromacs/simd/math_x86_avx_128_fma_single.h"
48 #include "kernelutil_x86_avx_128_fma_single.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
52  * Electrostatics interaction: Ewald
53  * VdW interaction:            CubicSplineTable
54  * Geometry:                   Water3-Particle
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81     real             scratch[4*DIM];
82     __m128           fscal,rcutoff,rcutoff2,jidxall;
83     int              vdwioffset0;
84     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85     int              vdwioffset1;
86     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87     int              vdwioffset2;
88     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
90     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
92     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
93     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
94     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
95     real             *charge;
96     int              nvdwtype;
97     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
98     int              *vdwtype;
99     real             *vdwparam;
100     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
101     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
102     __m128i          vfitab;
103     __m128i          ifour       = _mm_set1_epi32(4);
104     __m128           rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
105     real             *vftab;
106     __m128i          ewitab;
107     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
108     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
109     real             *ewtab;
110     __m128           dummy_mask,cutoff_mask;
111     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
112     __m128           one     = _mm_set1_ps(1.0);
113     __m128           two     = _mm_set1_ps(2.0);
114     x                = xx[0];
115     f                = ff[0];
116
117     nri              = nlist->nri;
118     iinr             = nlist->iinr;
119     jindex           = nlist->jindex;
120     jjnr             = nlist->jjnr;
121     shiftidx         = nlist->shift;
122     gid              = nlist->gid;
123     shiftvec         = fr->shift_vec[0];
124     fshift           = fr->fshift[0];
125     facel            = _mm_set1_ps(fr->epsfac);
126     charge           = mdatoms->chargeA;
127     nvdwtype         = fr->ntype;
128     vdwparam         = fr->nbfp;
129     vdwtype          = mdatoms->typeA;
130
131     vftab            = kernel_data->table_vdw->data;
132     vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
133
134     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
135     beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
136     beta2            = _mm_mul_ps(beta,beta);
137     beta3            = _mm_mul_ps(beta,beta2);
138     ewtab            = fr->ic->tabq_coul_FDV0;
139     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
140     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
141
142     /* Setup water-specific parameters */
143     inr              = nlist->iinr[0];
144     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
145     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
146     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
147     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
148
149     /* Avoid stupid compiler warnings */
150     jnrA = jnrB = jnrC = jnrD = 0;
151     j_coord_offsetA = 0;
152     j_coord_offsetB = 0;
153     j_coord_offsetC = 0;
154     j_coord_offsetD = 0;
155
156     outeriter        = 0;
157     inneriter        = 0;
158
159     for(iidx=0;iidx<4*DIM;iidx++)
160     {
161         scratch[iidx] = 0.0;
162     }
163
164     /* Start outer loop over neighborlists */
165     for(iidx=0; iidx<nri; iidx++)
166     {
167         /* Load shift vector for this list */
168         i_shift_offset   = DIM*shiftidx[iidx];
169
170         /* Load limits for loop over neighbors */
171         j_index_start    = jindex[iidx];
172         j_index_end      = jindex[iidx+1];
173
174         /* Get outer coordinate index */
175         inr              = iinr[iidx];
176         i_coord_offset   = DIM*inr;
177
178         /* Load i particle coords and add shift vector */
179         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
180                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
181
182         fix0             = _mm_setzero_ps();
183         fiy0             = _mm_setzero_ps();
184         fiz0             = _mm_setzero_ps();
185         fix1             = _mm_setzero_ps();
186         fiy1             = _mm_setzero_ps();
187         fiz1             = _mm_setzero_ps();
188         fix2             = _mm_setzero_ps();
189         fiy2             = _mm_setzero_ps();
190         fiz2             = _mm_setzero_ps();
191
192         /* Reset potential sums */
193         velecsum         = _mm_setzero_ps();
194         vvdwsum          = _mm_setzero_ps();
195
196         /* Start inner kernel loop */
197         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
198         {
199
200             /* Get j neighbor index, and coordinate index */
201             jnrA             = jjnr[jidx];
202             jnrB             = jjnr[jidx+1];
203             jnrC             = jjnr[jidx+2];
204             jnrD             = jjnr[jidx+3];
205             j_coord_offsetA  = DIM*jnrA;
206             j_coord_offsetB  = DIM*jnrB;
207             j_coord_offsetC  = DIM*jnrC;
208             j_coord_offsetD  = DIM*jnrD;
209
210             /* load j atom coordinates */
211             gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
212                                               x+j_coord_offsetC,x+j_coord_offsetD,
213                                               &jx0,&jy0,&jz0);
214
215             /* Calculate displacement vector */
216             dx00             = _mm_sub_ps(ix0,jx0);
217             dy00             = _mm_sub_ps(iy0,jy0);
218             dz00             = _mm_sub_ps(iz0,jz0);
219             dx10             = _mm_sub_ps(ix1,jx0);
220             dy10             = _mm_sub_ps(iy1,jy0);
221             dz10             = _mm_sub_ps(iz1,jz0);
222             dx20             = _mm_sub_ps(ix2,jx0);
223             dy20             = _mm_sub_ps(iy2,jy0);
224             dz20             = _mm_sub_ps(iz2,jz0);
225
226             /* Calculate squared distance and things based on it */
227             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
228             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
229             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
230
231             rinv00           = gmx_mm_invsqrt_ps(rsq00);
232             rinv10           = gmx_mm_invsqrt_ps(rsq10);
233             rinv20           = gmx_mm_invsqrt_ps(rsq20);
234
235             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
236             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
237             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
238
239             /* Load parameters for j particles */
240             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
241                                                               charge+jnrC+0,charge+jnrD+0);
242             vdwjidx0A        = 2*vdwtype[jnrA+0];
243             vdwjidx0B        = 2*vdwtype[jnrB+0];
244             vdwjidx0C        = 2*vdwtype[jnrC+0];
245             vdwjidx0D        = 2*vdwtype[jnrD+0];
246
247             fjx0             = _mm_setzero_ps();
248             fjy0             = _mm_setzero_ps();
249             fjz0             = _mm_setzero_ps();
250
251             /**************************
252              * CALCULATE INTERACTIONS *
253              **************************/
254
255             r00              = _mm_mul_ps(rsq00,rinv00);
256
257             /* Compute parameters for interactions between i and j atoms */
258             qq00             = _mm_mul_ps(iq0,jq0);
259             gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
260                                          vdwparam+vdwioffset0+vdwjidx0B,
261                                          vdwparam+vdwioffset0+vdwjidx0C,
262                                          vdwparam+vdwioffset0+vdwjidx0D,
263                                          &c6_00,&c12_00);
264
265             /* Calculate table index by multiplying r with table scale and truncate to integer */
266             rt               = _mm_mul_ps(r00,vftabscale);
267             vfitab           = _mm_cvttps_epi32(rt);
268 #ifdef __XOP__
269             vfeps            = _mm_frcz_ps(rt);
270 #else
271             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
272 #endif
273             twovfeps         = _mm_add_ps(vfeps,vfeps);
274             vfitab           = _mm_slli_epi32(vfitab,3);
275
276             /* EWALD ELECTROSTATICS */
277
278             /* Analytical PME correction */
279             zeta2            = _mm_mul_ps(beta2,rsq00);
280             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
281             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
282             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
283             felec            = _mm_mul_ps(qq00,felec);
284             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
285             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv00);
286             velec            = _mm_mul_ps(qq00,velec);
287
288             /* CUBIC SPLINE TABLE DISPERSION */
289             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
290             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
291             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
292             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
293             _MM_TRANSPOSE4_PS(Y,F,G,H);
294             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
295             VV               = _mm_macc_ps(vfeps,Fp,Y);
296             vvdw6            = _mm_mul_ps(c6_00,VV);
297             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
298             fvdw6            = _mm_mul_ps(c6_00,FF);
299
300             /* CUBIC SPLINE TABLE REPULSION */
301             vfitab           = _mm_add_epi32(vfitab,ifour);
302             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
303             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
304             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
305             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
306             _MM_TRANSPOSE4_PS(Y,F,G,H);
307             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
308             VV               = _mm_macc_ps(vfeps,Fp,Y);
309             vvdw12           = _mm_mul_ps(c12_00,VV);
310             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
311             fvdw12           = _mm_mul_ps(c12_00,FF);
312             vvdw             = _mm_add_ps(vvdw12,vvdw6);
313             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
314
315             /* Update potential sum for this i atom from the interaction with this j atom. */
316             velecsum         = _mm_add_ps(velecsum,velec);
317             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
318
319             fscal            = _mm_add_ps(felec,fvdw);
320
321              /* Update vectorial force */
322             fix0             = _mm_macc_ps(dx00,fscal,fix0);
323             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
324             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
325
326             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
327             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
328             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
329
330             /**************************
331              * CALCULATE INTERACTIONS *
332              **************************/
333
334             r10              = _mm_mul_ps(rsq10,rinv10);
335
336             /* Compute parameters for interactions between i and j atoms */
337             qq10             = _mm_mul_ps(iq1,jq0);
338
339             /* EWALD ELECTROSTATICS */
340
341             /* Analytical PME correction */
342             zeta2            = _mm_mul_ps(beta2,rsq10);
343             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
344             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
345             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
346             felec            = _mm_mul_ps(qq10,felec);
347             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
348             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
349             velec            = _mm_mul_ps(qq10,velec);
350
351             /* Update potential sum for this i atom from the interaction with this j atom. */
352             velecsum         = _mm_add_ps(velecsum,velec);
353
354             fscal            = felec;
355
356              /* Update vectorial force */
357             fix1             = _mm_macc_ps(dx10,fscal,fix1);
358             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
359             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
360
361             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
362             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
363             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
364
365             /**************************
366              * CALCULATE INTERACTIONS *
367              **************************/
368
369             r20              = _mm_mul_ps(rsq20,rinv20);
370
371             /* Compute parameters for interactions between i and j atoms */
372             qq20             = _mm_mul_ps(iq2,jq0);
373
374             /* EWALD ELECTROSTATICS */
375
376             /* Analytical PME correction */
377             zeta2            = _mm_mul_ps(beta2,rsq20);
378             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
379             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
380             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
381             felec            = _mm_mul_ps(qq20,felec);
382             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
383             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
384             velec            = _mm_mul_ps(qq20,velec);
385
386             /* Update potential sum for this i atom from the interaction with this j atom. */
387             velecsum         = _mm_add_ps(velecsum,velec);
388
389             fscal            = felec;
390
391              /* Update vectorial force */
392             fix2             = _mm_macc_ps(dx20,fscal,fix2);
393             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
394             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
395
396             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
397             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
398             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
399
400             fjptrA             = f+j_coord_offsetA;
401             fjptrB             = f+j_coord_offsetB;
402             fjptrC             = f+j_coord_offsetC;
403             fjptrD             = f+j_coord_offsetD;
404
405             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
406
407             /* Inner loop uses 121 flops */
408         }
409
410         if(jidx<j_index_end)
411         {
412
413             /* Get j neighbor index, and coordinate index */
414             jnrlistA         = jjnr[jidx];
415             jnrlistB         = jjnr[jidx+1];
416             jnrlistC         = jjnr[jidx+2];
417             jnrlistD         = jjnr[jidx+3];
418             /* Sign of each element will be negative for non-real atoms.
419              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
420              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
421              */
422             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
423             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
424             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
425             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
426             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
427             j_coord_offsetA  = DIM*jnrA;
428             j_coord_offsetB  = DIM*jnrB;
429             j_coord_offsetC  = DIM*jnrC;
430             j_coord_offsetD  = DIM*jnrD;
431
432             /* load j atom coordinates */
433             gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
434                                               x+j_coord_offsetC,x+j_coord_offsetD,
435                                               &jx0,&jy0,&jz0);
436
437             /* Calculate displacement vector */
438             dx00             = _mm_sub_ps(ix0,jx0);
439             dy00             = _mm_sub_ps(iy0,jy0);
440             dz00             = _mm_sub_ps(iz0,jz0);
441             dx10             = _mm_sub_ps(ix1,jx0);
442             dy10             = _mm_sub_ps(iy1,jy0);
443             dz10             = _mm_sub_ps(iz1,jz0);
444             dx20             = _mm_sub_ps(ix2,jx0);
445             dy20             = _mm_sub_ps(iy2,jy0);
446             dz20             = _mm_sub_ps(iz2,jz0);
447
448             /* Calculate squared distance and things based on it */
449             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
450             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
451             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
452
453             rinv00           = gmx_mm_invsqrt_ps(rsq00);
454             rinv10           = gmx_mm_invsqrt_ps(rsq10);
455             rinv20           = gmx_mm_invsqrt_ps(rsq20);
456
457             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
458             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
459             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
460
461             /* Load parameters for j particles */
462             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
463                                                               charge+jnrC+0,charge+jnrD+0);
464             vdwjidx0A        = 2*vdwtype[jnrA+0];
465             vdwjidx0B        = 2*vdwtype[jnrB+0];
466             vdwjidx0C        = 2*vdwtype[jnrC+0];
467             vdwjidx0D        = 2*vdwtype[jnrD+0];
468
469             fjx0             = _mm_setzero_ps();
470             fjy0             = _mm_setzero_ps();
471             fjz0             = _mm_setzero_ps();
472
473             /**************************
474              * CALCULATE INTERACTIONS *
475              **************************/
476
477             r00              = _mm_mul_ps(rsq00,rinv00);
478             r00              = _mm_andnot_ps(dummy_mask,r00);
479
480             /* Compute parameters for interactions between i and j atoms */
481             qq00             = _mm_mul_ps(iq0,jq0);
482             gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
483                                          vdwparam+vdwioffset0+vdwjidx0B,
484                                          vdwparam+vdwioffset0+vdwjidx0C,
485                                          vdwparam+vdwioffset0+vdwjidx0D,
486                                          &c6_00,&c12_00);
487
488             /* Calculate table index by multiplying r with table scale and truncate to integer */
489             rt               = _mm_mul_ps(r00,vftabscale);
490             vfitab           = _mm_cvttps_epi32(rt);
491 #ifdef __XOP__
492             vfeps            = _mm_frcz_ps(rt);
493 #else
494             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
495 #endif
496             twovfeps         = _mm_add_ps(vfeps,vfeps);
497             vfitab           = _mm_slli_epi32(vfitab,3);
498
499             /* EWALD ELECTROSTATICS */
500
501             /* Analytical PME correction */
502             zeta2            = _mm_mul_ps(beta2,rsq00);
503             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
504             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
505             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
506             felec            = _mm_mul_ps(qq00,felec);
507             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
508             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv00);
509             velec            = _mm_mul_ps(qq00,velec);
510
511             /* CUBIC SPLINE TABLE DISPERSION */
512             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
513             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
514             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
515             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
516             _MM_TRANSPOSE4_PS(Y,F,G,H);
517             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
518             VV               = _mm_macc_ps(vfeps,Fp,Y);
519             vvdw6            = _mm_mul_ps(c6_00,VV);
520             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
521             fvdw6            = _mm_mul_ps(c6_00,FF);
522
523             /* CUBIC SPLINE TABLE REPULSION */
524             vfitab           = _mm_add_epi32(vfitab,ifour);
525             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
526             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
527             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
528             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
529             _MM_TRANSPOSE4_PS(Y,F,G,H);
530             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
531             VV               = _mm_macc_ps(vfeps,Fp,Y);
532             vvdw12           = _mm_mul_ps(c12_00,VV);
533             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
534             fvdw12           = _mm_mul_ps(c12_00,FF);
535             vvdw             = _mm_add_ps(vvdw12,vvdw6);
536             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
537
538             /* Update potential sum for this i atom from the interaction with this j atom. */
539             velec            = _mm_andnot_ps(dummy_mask,velec);
540             velecsum         = _mm_add_ps(velecsum,velec);
541             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
542             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
543
544             fscal            = _mm_add_ps(felec,fvdw);
545
546             fscal            = _mm_andnot_ps(dummy_mask,fscal);
547
548              /* Update vectorial force */
549             fix0             = _mm_macc_ps(dx00,fscal,fix0);
550             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
551             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
552
553             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
554             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
555             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
556
557             /**************************
558              * CALCULATE INTERACTIONS *
559              **************************/
560
561             r10              = _mm_mul_ps(rsq10,rinv10);
562             r10              = _mm_andnot_ps(dummy_mask,r10);
563
564             /* Compute parameters for interactions between i and j atoms */
565             qq10             = _mm_mul_ps(iq1,jq0);
566
567             /* EWALD ELECTROSTATICS */
568
569             /* Analytical PME correction */
570             zeta2            = _mm_mul_ps(beta2,rsq10);
571             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
572             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
573             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
574             felec            = _mm_mul_ps(qq10,felec);
575             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
576             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
577             velec            = _mm_mul_ps(qq10,velec);
578
579             /* Update potential sum for this i atom from the interaction with this j atom. */
580             velec            = _mm_andnot_ps(dummy_mask,velec);
581             velecsum         = _mm_add_ps(velecsum,velec);
582
583             fscal            = felec;
584
585             fscal            = _mm_andnot_ps(dummy_mask,fscal);
586
587              /* Update vectorial force */
588             fix1             = _mm_macc_ps(dx10,fscal,fix1);
589             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
590             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
591
592             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
593             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
594             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
595
596             /**************************
597              * CALCULATE INTERACTIONS *
598              **************************/
599
600             r20              = _mm_mul_ps(rsq20,rinv20);
601             r20              = _mm_andnot_ps(dummy_mask,r20);
602
603             /* Compute parameters for interactions between i and j atoms */
604             qq20             = _mm_mul_ps(iq2,jq0);
605
606             /* EWALD ELECTROSTATICS */
607
608             /* Analytical PME correction */
609             zeta2            = _mm_mul_ps(beta2,rsq20);
610             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
611             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
612             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
613             felec            = _mm_mul_ps(qq20,felec);
614             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
615             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
616             velec            = _mm_mul_ps(qq20,velec);
617
618             /* Update potential sum for this i atom from the interaction with this j atom. */
619             velec            = _mm_andnot_ps(dummy_mask,velec);
620             velecsum         = _mm_add_ps(velecsum,velec);
621
622             fscal            = felec;
623
624             fscal            = _mm_andnot_ps(dummy_mask,fscal);
625
626              /* Update vectorial force */
627             fix2             = _mm_macc_ps(dx20,fscal,fix2);
628             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
629             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
630
631             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
632             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
633             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
634
635             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
636             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
637             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
638             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
639
640             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
641
642             /* Inner loop uses 124 flops */
643         }
644
645         /* End of innermost loop */
646
647         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
648                                               f+i_coord_offset,fshift+i_shift_offset);
649
650         ggid                        = gid[iidx];
651         /* Update potential energies */
652         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
653         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
654
655         /* Increment number of inner iterations */
656         inneriter                  += j_index_end - j_index_start;
657
658         /* Outer loop uses 20 flops */
659     }
660
661     /* Increment number of outer iterations */
662     outeriter        += nri;
663
664     /* Update outer/inner flops */
665
666     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*124);
667 }
668 /*
669  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_avx_128_fma_single
670  * Electrostatics interaction: Ewald
671  * VdW interaction:            CubicSplineTable
672  * Geometry:                   Water3-Particle
673  * Calculate force/pot:        Force
674  */
675 void
676 nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_avx_128_fma_single
677                     (t_nblist                    * gmx_restrict       nlist,
678                      rvec                        * gmx_restrict          xx,
679                      rvec                        * gmx_restrict          ff,
680                      t_forcerec                  * gmx_restrict          fr,
681                      t_mdatoms                   * gmx_restrict     mdatoms,
682                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
683                      t_nrnb                      * gmx_restrict        nrnb)
684 {
685     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
686      * just 0 for non-waters.
687      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
688      * jnr indices corresponding to data put in the four positions in the SIMD register.
689      */
690     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
691     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
692     int              jnrA,jnrB,jnrC,jnrD;
693     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
694     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
695     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
696     real             rcutoff_scalar;
697     real             *shiftvec,*fshift,*x,*f;
698     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
699     real             scratch[4*DIM];
700     __m128           fscal,rcutoff,rcutoff2,jidxall;
701     int              vdwioffset0;
702     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
703     int              vdwioffset1;
704     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
705     int              vdwioffset2;
706     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
707     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
708     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
709     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
710     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
711     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
712     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
713     real             *charge;
714     int              nvdwtype;
715     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
716     int              *vdwtype;
717     real             *vdwparam;
718     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
719     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
720     __m128i          vfitab;
721     __m128i          ifour       = _mm_set1_epi32(4);
722     __m128           rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
723     real             *vftab;
724     __m128i          ewitab;
725     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
726     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
727     real             *ewtab;
728     __m128           dummy_mask,cutoff_mask;
729     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
730     __m128           one     = _mm_set1_ps(1.0);
731     __m128           two     = _mm_set1_ps(2.0);
732     x                = xx[0];
733     f                = ff[0];
734
735     nri              = nlist->nri;
736     iinr             = nlist->iinr;
737     jindex           = nlist->jindex;
738     jjnr             = nlist->jjnr;
739     shiftidx         = nlist->shift;
740     gid              = nlist->gid;
741     shiftvec         = fr->shift_vec[0];
742     fshift           = fr->fshift[0];
743     facel            = _mm_set1_ps(fr->epsfac);
744     charge           = mdatoms->chargeA;
745     nvdwtype         = fr->ntype;
746     vdwparam         = fr->nbfp;
747     vdwtype          = mdatoms->typeA;
748
749     vftab            = kernel_data->table_vdw->data;
750     vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
751
752     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
753     beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
754     beta2            = _mm_mul_ps(beta,beta);
755     beta3            = _mm_mul_ps(beta,beta2);
756     ewtab            = fr->ic->tabq_coul_F;
757     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
758     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
759
760     /* Setup water-specific parameters */
761     inr              = nlist->iinr[0];
762     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
763     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
764     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
765     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
766
767     /* Avoid stupid compiler warnings */
768     jnrA = jnrB = jnrC = jnrD = 0;
769     j_coord_offsetA = 0;
770     j_coord_offsetB = 0;
771     j_coord_offsetC = 0;
772     j_coord_offsetD = 0;
773
774     outeriter        = 0;
775     inneriter        = 0;
776
777     for(iidx=0;iidx<4*DIM;iidx++)
778     {
779         scratch[iidx] = 0.0;
780     }
781
782     /* Start outer loop over neighborlists */
783     for(iidx=0; iidx<nri; iidx++)
784     {
785         /* Load shift vector for this list */
786         i_shift_offset   = DIM*shiftidx[iidx];
787
788         /* Load limits for loop over neighbors */
789         j_index_start    = jindex[iidx];
790         j_index_end      = jindex[iidx+1];
791
792         /* Get outer coordinate index */
793         inr              = iinr[iidx];
794         i_coord_offset   = DIM*inr;
795
796         /* Load i particle coords and add shift vector */
797         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
798                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
799
800         fix0             = _mm_setzero_ps();
801         fiy0             = _mm_setzero_ps();
802         fiz0             = _mm_setzero_ps();
803         fix1             = _mm_setzero_ps();
804         fiy1             = _mm_setzero_ps();
805         fiz1             = _mm_setzero_ps();
806         fix2             = _mm_setzero_ps();
807         fiy2             = _mm_setzero_ps();
808         fiz2             = _mm_setzero_ps();
809
810         /* Start inner kernel loop */
811         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
812         {
813
814             /* Get j neighbor index, and coordinate index */
815             jnrA             = jjnr[jidx];
816             jnrB             = jjnr[jidx+1];
817             jnrC             = jjnr[jidx+2];
818             jnrD             = jjnr[jidx+3];
819             j_coord_offsetA  = DIM*jnrA;
820             j_coord_offsetB  = DIM*jnrB;
821             j_coord_offsetC  = DIM*jnrC;
822             j_coord_offsetD  = DIM*jnrD;
823
824             /* load j atom coordinates */
825             gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
826                                               x+j_coord_offsetC,x+j_coord_offsetD,
827                                               &jx0,&jy0,&jz0);
828
829             /* Calculate displacement vector */
830             dx00             = _mm_sub_ps(ix0,jx0);
831             dy00             = _mm_sub_ps(iy0,jy0);
832             dz00             = _mm_sub_ps(iz0,jz0);
833             dx10             = _mm_sub_ps(ix1,jx0);
834             dy10             = _mm_sub_ps(iy1,jy0);
835             dz10             = _mm_sub_ps(iz1,jz0);
836             dx20             = _mm_sub_ps(ix2,jx0);
837             dy20             = _mm_sub_ps(iy2,jy0);
838             dz20             = _mm_sub_ps(iz2,jz0);
839
840             /* Calculate squared distance and things based on it */
841             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
842             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
843             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
844
845             rinv00           = gmx_mm_invsqrt_ps(rsq00);
846             rinv10           = gmx_mm_invsqrt_ps(rsq10);
847             rinv20           = gmx_mm_invsqrt_ps(rsq20);
848
849             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
850             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
851             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
852
853             /* Load parameters for j particles */
854             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
855                                                               charge+jnrC+0,charge+jnrD+0);
856             vdwjidx0A        = 2*vdwtype[jnrA+0];
857             vdwjidx0B        = 2*vdwtype[jnrB+0];
858             vdwjidx0C        = 2*vdwtype[jnrC+0];
859             vdwjidx0D        = 2*vdwtype[jnrD+0];
860
861             fjx0             = _mm_setzero_ps();
862             fjy0             = _mm_setzero_ps();
863             fjz0             = _mm_setzero_ps();
864
865             /**************************
866              * CALCULATE INTERACTIONS *
867              **************************/
868
869             r00              = _mm_mul_ps(rsq00,rinv00);
870
871             /* Compute parameters for interactions between i and j atoms */
872             qq00             = _mm_mul_ps(iq0,jq0);
873             gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
874                                          vdwparam+vdwioffset0+vdwjidx0B,
875                                          vdwparam+vdwioffset0+vdwjidx0C,
876                                          vdwparam+vdwioffset0+vdwjidx0D,
877                                          &c6_00,&c12_00);
878
879             /* Calculate table index by multiplying r with table scale and truncate to integer */
880             rt               = _mm_mul_ps(r00,vftabscale);
881             vfitab           = _mm_cvttps_epi32(rt);
882 #ifdef __XOP__
883             vfeps            = _mm_frcz_ps(rt);
884 #else
885             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
886 #endif
887             twovfeps         = _mm_add_ps(vfeps,vfeps);
888             vfitab           = _mm_slli_epi32(vfitab,3);
889
890             /* EWALD ELECTROSTATICS */
891
892             /* Analytical PME correction */
893             zeta2            = _mm_mul_ps(beta2,rsq00);
894             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
895             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
896             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
897             felec            = _mm_mul_ps(qq00,felec);
898
899             /* CUBIC SPLINE TABLE DISPERSION */
900             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
901             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
902             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
903             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
904             _MM_TRANSPOSE4_PS(Y,F,G,H);
905             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
906             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
907             fvdw6            = _mm_mul_ps(c6_00,FF);
908
909             /* CUBIC SPLINE TABLE REPULSION */
910             vfitab           = _mm_add_epi32(vfitab,ifour);
911             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
912             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
913             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
914             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
915             _MM_TRANSPOSE4_PS(Y,F,G,H);
916             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
917             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
918             fvdw12           = _mm_mul_ps(c12_00,FF);
919             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
920
921             fscal            = _mm_add_ps(felec,fvdw);
922
923              /* Update vectorial force */
924             fix0             = _mm_macc_ps(dx00,fscal,fix0);
925             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
926             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
927
928             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
929             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
930             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
931
932             /**************************
933              * CALCULATE INTERACTIONS *
934              **************************/
935
936             r10              = _mm_mul_ps(rsq10,rinv10);
937
938             /* Compute parameters for interactions between i and j atoms */
939             qq10             = _mm_mul_ps(iq1,jq0);
940
941             /* EWALD ELECTROSTATICS */
942
943             /* Analytical PME correction */
944             zeta2            = _mm_mul_ps(beta2,rsq10);
945             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
946             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
947             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
948             felec            = _mm_mul_ps(qq10,felec);
949
950             fscal            = felec;
951
952              /* Update vectorial force */
953             fix1             = _mm_macc_ps(dx10,fscal,fix1);
954             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
955             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
956
957             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
958             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
959             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
960
961             /**************************
962              * CALCULATE INTERACTIONS *
963              **************************/
964
965             r20              = _mm_mul_ps(rsq20,rinv20);
966
967             /* Compute parameters for interactions between i and j atoms */
968             qq20             = _mm_mul_ps(iq2,jq0);
969
970             /* EWALD ELECTROSTATICS */
971
972             /* Analytical PME correction */
973             zeta2            = _mm_mul_ps(beta2,rsq20);
974             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
975             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
976             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
977             felec            = _mm_mul_ps(qq20,felec);
978
979             fscal            = felec;
980
981              /* Update vectorial force */
982             fix2             = _mm_macc_ps(dx20,fscal,fix2);
983             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
984             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
985
986             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
987             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
988             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
989
990             fjptrA             = f+j_coord_offsetA;
991             fjptrB             = f+j_coord_offsetB;
992             fjptrC             = f+j_coord_offsetC;
993             fjptrD             = f+j_coord_offsetD;
994
995             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
996
997             /* Inner loop uses 110 flops */
998         }
999
1000         if(jidx<j_index_end)
1001         {
1002
1003             /* Get j neighbor index, and coordinate index */
1004             jnrlistA         = jjnr[jidx];
1005             jnrlistB         = jjnr[jidx+1];
1006             jnrlistC         = jjnr[jidx+2];
1007             jnrlistD         = jjnr[jidx+3];
1008             /* Sign of each element will be negative for non-real atoms.
1009              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1010              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1011              */
1012             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1013             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1014             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1015             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1016             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1017             j_coord_offsetA  = DIM*jnrA;
1018             j_coord_offsetB  = DIM*jnrB;
1019             j_coord_offsetC  = DIM*jnrC;
1020             j_coord_offsetD  = DIM*jnrD;
1021
1022             /* load j atom coordinates */
1023             gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1024                                               x+j_coord_offsetC,x+j_coord_offsetD,
1025                                               &jx0,&jy0,&jz0);
1026
1027             /* Calculate displacement vector */
1028             dx00             = _mm_sub_ps(ix0,jx0);
1029             dy00             = _mm_sub_ps(iy0,jy0);
1030             dz00             = _mm_sub_ps(iz0,jz0);
1031             dx10             = _mm_sub_ps(ix1,jx0);
1032             dy10             = _mm_sub_ps(iy1,jy0);
1033             dz10             = _mm_sub_ps(iz1,jz0);
1034             dx20             = _mm_sub_ps(ix2,jx0);
1035             dy20             = _mm_sub_ps(iy2,jy0);
1036             dz20             = _mm_sub_ps(iz2,jz0);
1037
1038             /* Calculate squared distance and things based on it */
1039             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1040             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1041             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1042
1043             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1044             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1045             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1046
1047             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1048             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1049             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1050
1051             /* Load parameters for j particles */
1052             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1053                                                               charge+jnrC+0,charge+jnrD+0);
1054             vdwjidx0A        = 2*vdwtype[jnrA+0];
1055             vdwjidx0B        = 2*vdwtype[jnrB+0];
1056             vdwjidx0C        = 2*vdwtype[jnrC+0];
1057             vdwjidx0D        = 2*vdwtype[jnrD+0];
1058
1059             fjx0             = _mm_setzero_ps();
1060             fjy0             = _mm_setzero_ps();
1061             fjz0             = _mm_setzero_ps();
1062
1063             /**************************
1064              * CALCULATE INTERACTIONS *
1065              **************************/
1066
1067             r00              = _mm_mul_ps(rsq00,rinv00);
1068             r00              = _mm_andnot_ps(dummy_mask,r00);
1069
1070             /* Compute parameters for interactions between i and j atoms */
1071             qq00             = _mm_mul_ps(iq0,jq0);
1072             gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
1073                                          vdwparam+vdwioffset0+vdwjidx0B,
1074                                          vdwparam+vdwioffset0+vdwjidx0C,
1075                                          vdwparam+vdwioffset0+vdwjidx0D,
1076                                          &c6_00,&c12_00);
1077
1078             /* Calculate table index by multiplying r with table scale and truncate to integer */
1079             rt               = _mm_mul_ps(r00,vftabscale);
1080             vfitab           = _mm_cvttps_epi32(rt);
1081 #ifdef __XOP__
1082             vfeps            = _mm_frcz_ps(rt);
1083 #else
1084             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1085 #endif
1086             twovfeps         = _mm_add_ps(vfeps,vfeps);
1087             vfitab           = _mm_slli_epi32(vfitab,3);
1088
1089             /* EWALD ELECTROSTATICS */
1090
1091             /* Analytical PME correction */
1092             zeta2            = _mm_mul_ps(beta2,rsq00);
1093             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
1094             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1095             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1096             felec            = _mm_mul_ps(qq00,felec);
1097
1098             /* CUBIC SPLINE TABLE DISPERSION */
1099             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1100             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1101             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1102             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1103             _MM_TRANSPOSE4_PS(Y,F,G,H);
1104             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1105             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1106             fvdw6            = _mm_mul_ps(c6_00,FF);
1107
1108             /* CUBIC SPLINE TABLE REPULSION */
1109             vfitab           = _mm_add_epi32(vfitab,ifour);
1110             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1111             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1112             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1113             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1114             _MM_TRANSPOSE4_PS(Y,F,G,H);
1115             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1116             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1117             fvdw12           = _mm_mul_ps(c12_00,FF);
1118             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1119
1120             fscal            = _mm_add_ps(felec,fvdw);
1121
1122             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1123
1124              /* Update vectorial force */
1125             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1126             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1127             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1128
1129             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1130             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1131             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1132
1133             /**************************
1134              * CALCULATE INTERACTIONS *
1135              **************************/
1136
1137             r10              = _mm_mul_ps(rsq10,rinv10);
1138             r10              = _mm_andnot_ps(dummy_mask,r10);
1139
1140             /* Compute parameters for interactions between i and j atoms */
1141             qq10             = _mm_mul_ps(iq1,jq0);
1142
1143             /* EWALD ELECTROSTATICS */
1144
1145             /* Analytical PME correction */
1146             zeta2            = _mm_mul_ps(beta2,rsq10);
1147             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
1148             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1149             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1150             felec            = _mm_mul_ps(qq10,felec);
1151
1152             fscal            = felec;
1153
1154             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1155
1156              /* Update vectorial force */
1157             fix1             = _mm_macc_ps(dx10,fscal,fix1);
1158             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
1159             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
1160
1161             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
1162             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
1163             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
1164
1165             /**************************
1166              * CALCULATE INTERACTIONS *
1167              **************************/
1168
1169             r20              = _mm_mul_ps(rsq20,rinv20);
1170             r20              = _mm_andnot_ps(dummy_mask,r20);
1171
1172             /* Compute parameters for interactions between i and j atoms */
1173             qq20             = _mm_mul_ps(iq2,jq0);
1174
1175             /* EWALD ELECTROSTATICS */
1176
1177             /* Analytical PME correction */
1178             zeta2            = _mm_mul_ps(beta2,rsq20);
1179             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
1180             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1181             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1182             felec            = _mm_mul_ps(qq20,felec);
1183
1184             fscal            = felec;
1185
1186             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1187
1188              /* Update vectorial force */
1189             fix2             = _mm_macc_ps(dx20,fscal,fix2);
1190             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
1191             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
1192
1193             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
1194             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
1195             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
1196
1197             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1198             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1199             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1200             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1201
1202             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1203
1204             /* Inner loop uses 113 flops */
1205         }
1206
1207         /* End of innermost loop */
1208
1209         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1210                                               f+i_coord_offset,fshift+i_shift_offset);
1211
1212         /* Increment number of inner iterations */
1213         inneriter                  += j_index_end - j_index_start;
1214
1215         /* Outer loop uses 18 flops */
1216     }
1217
1218     /* Increment number of outer iterations */
1219     outeriter        += nri;
1220
1221     /* Update outer/inner flops */
1222
1223     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*113);
1224 }