made errors during GPU detection non-fatal
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_avx_128_fma_single / nb_kernel_ElecEwSw_VdwNone_GeomW4P1_avx_128_fma_single.c
1 /*
2  * Note: this file was generated by the Gromacs avx_128_fma_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_avx_128_fma_single.h"
34 #include "kernelutil_x86_avx_128_fma_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_avx_128_fma_single
38  * Electrostatics interaction: Ewald
39  * VdW interaction:            None
40  * Geometry:                   Water4-Particle
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_avx_128_fma_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
64     real             rcutoff_scalar;
65     real             *shiftvec,*fshift,*x,*f;
66     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
67     real             scratch[4*DIM];
68     __m128           fscal,rcutoff,rcutoff2,jidxall;
69     int              vdwioffset1;
70     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71     int              vdwioffset2;
72     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
73     int              vdwioffset3;
74     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
75     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
76     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
77     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
78     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
79     __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
80     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
81     real             *charge;
82     __m128i          ewitab;
83     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
84     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
85     real             *ewtab;
86     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
87     real             rswitch_scalar,d_scalar;
88     __m128           dummy_mask,cutoff_mask;
89     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
90     __m128           one     = _mm_set1_ps(1.0);
91     __m128           two     = _mm_set1_ps(2.0);
92     x                = xx[0];
93     f                = ff[0];
94
95     nri              = nlist->nri;
96     iinr             = nlist->iinr;
97     jindex           = nlist->jindex;
98     jjnr             = nlist->jjnr;
99     shiftidx         = nlist->shift;
100     gid              = nlist->gid;
101     shiftvec         = fr->shift_vec[0];
102     fshift           = fr->fshift[0];
103     facel            = _mm_set1_ps(fr->epsfac);
104     charge           = mdatoms->chargeA;
105
106     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
107     beta             = _mm_set1_ps(fr->ic->ewaldcoeff);
108     beta2            = _mm_mul_ps(beta,beta);
109     beta3            = _mm_mul_ps(beta,beta2);
110     ewtab            = fr->ic->tabq_coul_FDV0;
111     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
112     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
113
114     /* Setup water-specific parameters */
115     inr              = nlist->iinr[0];
116     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
117     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
118     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
119
120     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
121     rcutoff_scalar   = fr->rcoulomb;
122     rcutoff          = _mm_set1_ps(rcutoff_scalar);
123     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
124
125     rswitch_scalar   = fr->rcoulomb_switch;
126     rswitch          = _mm_set1_ps(rswitch_scalar);
127     /* Setup switch parameters */
128     d_scalar         = rcutoff_scalar-rswitch_scalar;
129     d                = _mm_set1_ps(d_scalar);
130     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
131     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
132     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
133     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
134     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
135     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
136
137     /* Avoid stupid compiler warnings */
138     jnrA = jnrB = jnrC = jnrD = 0;
139     j_coord_offsetA = 0;
140     j_coord_offsetB = 0;
141     j_coord_offsetC = 0;
142     j_coord_offsetD = 0;
143
144     outeriter        = 0;
145     inneriter        = 0;
146
147     for(iidx=0;iidx<4*DIM;iidx++)
148     {
149         scratch[iidx] = 0.0;
150     }
151
152     /* Start outer loop over neighborlists */
153     for(iidx=0; iidx<nri; iidx++)
154     {
155         /* Load shift vector for this list */
156         i_shift_offset   = DIM*shiftidx[iidx];
157
158         /* Load limits for loop over neighbors */
159         j_index_start    = jindex[iidx];
160         j_index_end      = jindex[iidx+1];
161
162         /* Get outer coordinate index */
163         inr              = iinr[iidx];
164         i_coord_offset   = DIM*inr;
165
166         /* Load i particle coords and add shift vector */
167         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
168                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
169
170         fix1             = _mm_setzero_ps();
171         fiy1             = _mm_setzero_ps();
172         fiz1             = _mm_setzero_ps();
173         fix2             = _mm_setzero_ps();
174         fiy2             = _mm_setzero_ps();
175         fiz2             = _mm_setzero_ps();
176         fix3             = _mm_setzero_ps();
177         fiy3             = _mm_setzero_ps();
178         fiz3             = _mm_setzero_ps();
179
180         /* Reset potential sums */
181         velecsum         = _mm_setzero_ps();
182
183         /* Start inner kernel loop */
184         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
185         {
186
187             /* Get j neighbor index, and coordinate index */
188             jnrA             = jjnr[jidx];
189             jnrB             = jjnr[jidx+1];
190             jnrC             = jjnr[jidx+2];
191             jnrD             = jjnr[jidx+3];
192             j_coord_offsetA  = DIM*jnrA;
193             j_coord_offsetB  = DIM*jnrB;
194             j_coord_offsetC  = DIM*jnrC;
195             j_coord_offsetD  = DIM*jnrD;
196
197             /* load j atom coordinates */
198             gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
199                                               x+j_coord_offsetC,x+j_coord_offsetD,
200                                               &jx0,&jy0,&jz0);
201
202             /* Calculate displacement vector */
203             dx10             = _mm_sub_ps(ix1,jx0);
204             dy10             = _mm_sub_ps(iy1,jy0);
205             dz10             = _mm_sub_ps(iz1,jz0);
206             dx20             = _mm_sub_ps(ix2,jx0);
207             dy20             = _mm_sub_ps(iy2,jy0);
208             dz20             = _mm_sub_ps(iz2,jz0);
209             dx30             = _mm_sub_ps(ix3,jx0);
210             dy30             = _mm_sub_ps(iy3,jy0);
211             dz30             = _mm_sub_ps(iz3,jz0);
212
213             /* Calculate squared distance and things based on it */
214             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
215             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
216             rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
217
218             rinv10           = gmx_mm_invsqrt_ps(rsq10);
219             rinv20           = gmx_mm_invsqrt_ps(rsq20);
220             rinv30           = gmx_mm_invsqrt_ps(rsq30);
221
222             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
223             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
224             rinvsq30         = _mm_mul_ps(rinv30,rinv30);
225
226             /* Load parameters for j particles */
227             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
228                                                               charge+jnrC+0,charge+jnrD+0);
229
230             fjx0             = _mm_setzero_ps();
231             fjy0             = _mm_setzero_ps();
232             fjz0             = _mm_setzero_ps();
233
234             /**************************
235              * CALCULATE INTERACTIONS *
236              **************************/
237
238             if (gmx_mm_any_lt(rsq10,rcutoff2))
239             {
240
241             r10              = _mm_mul_ps(rsq10,rinv10);
242
243             /* Compute parameters for interactions between i and j atoms */
244             qq10             = _mm_mul_ps(iq1,jq0);
245
246             /* EWALD ELECTROSTATICS */
247
248             /* Analytical PME correction */
249             zeta2            = _mm_mul_ps(beta2,rsq10);
250             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
251             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
252             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
253             felec            = _mm_mul_ps(qq10,felec);
254             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
255             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
256             velec            = _mm_mul_ps(qq10,velec);
257
258             d                = _mm_sub_ps(r10,rswitch);
259             d                = _mm_max_ps(d,_mm_setzero_ps());
260             d2               = _mm_mul_ps(d,d);
261             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
262
263             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
264
265             /* Evaluate switch function */
266             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
267             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
268             velec            = _mm_mul_ps(velec,sw);
269             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
270
271             /* Update potential sum for this i atom from the interaction with this j atom. */
272             velec            = _mm_and_ps(velec,cutoff_mask);
273             velecsum         = _mm_add_ps(velecsum,velec);
274
275             fscal            = felec;
276
277             fscal            = _mm_and_ps(fscal,cutoff_mask);
278
279              /* Update vectorial force */
280             fix1             = _mm_macc_ps(dx10,fscal,fix1);
281             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
282             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
283
284             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
285             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
286             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
287
288             }
289
290             /**************************
291              * CALCULATE INTERACTIONS *
292              **************************/
293
294             if (gmx_mm_any_lt(rsq20,rcutoff2))
295             {
296
297             r20              = _mm_mul_ps(rsq20,rinv20);
298
299             /* Compute parameters for interactions between i and j atoms */
300             qq20             = _mm_mul_ps(iq2,jq0);
301
302             /* EWALD ELECTROSTATICS */
303
304             /* Analytical PME correction */
305             zeta2            = _mm_mul_ps(beta2,rsq20);
306             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
307             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
308             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
309             felec            = _mm_mul_ps(qq20,felec);
310             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
311             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
312             velec            = _mm_mul_ps(qq20,velec);
313
314             d                = _mm_sub_ps(r20,rswitch);
315             d                = _mm_max_ps(d,_mm_setzero_ps());
316             d2               = _mm_mul_ps(d,d);
317             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
318
319             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
320
321             /* Evaluate switch function */
322             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
323             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
324             velec            = _mm_mul_ps(velec,sw);
325             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
326
327             /* Update potential sum for this i atom from the interaction with this j atom. */
328             velec            = _mm_and_ps(velec,cutoff_mask);
329             velecsum         = _mm_add_ps(velecsum,velec);
330
331             fscal            = felec;
332
333             fscal            = _mm_and_ps(fscal,cutoff_mask);
334
335              /* Update vectorial force */
336             fix2             = _mm_macc_ps(dx20,fscal,fix2);
337             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
338             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
339
340             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
341             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
342             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
343
344             }
345
346             /**************************
347              * CALCULATE INTERACTIONS *
348              **************************/
349
350             if (gmx_mm_any_lt(rsq30,rcutoff2))
351             {
352
353             r30              = _mm_mul_ps(rsq30,rinv30);
354
355             /* Compute parameters for interactions between i and j atoms */
356             qq30             = _mm_mul_ps(iq3,jq0);
357
358             /* EWALD ELECTROSTATICS */
359
360             /* Analytical PME correction */
361             zeta2            = _mm_mul_ps(beta2,rsq30);
362             rinv3            = _mm_mul_ps(rinvsq30,rinv30);
363             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
364             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
365             felec            = _mm_mul_ps(qq30,felec);
366             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
367             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv30);
368             velec            = _mm_mul_ps(qq30,velec);
369
370             d                = _mm_sub_ps(r30,rswitch);
371             d                = _mm_max_ps(d,_mm_setzero_ps());
372             d2               = _mm_mul_ps(d,d);
373             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
374
375             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
376
377             /* Evaluate switch function */
378             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
379             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
380             velec            = _mm_mul_ps(velec,sw);
381             cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
382
383             /* Update potential sum for this i atom from the interaction with this j atom. */
384             velec            = _mm_and_ps(velec,cutoff_mask);
385             velecsum         = _mm_add_ps(velecsum,velec);
386
387             fscal            = felec;
388
389             fscal            = _mm_and_ps(fscal,cutoff_mask);
390
391              /* Update vectorial force */
392             fix3             = _mm_macc_ps(dx30,fscal,fix3);
393             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
394             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
395
396             fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
397             fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
398             fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
399
400             }
401
402             fjptrA             = f+j_coord_offsetA;
403             fjptrB             = f+j_coord_offsetB;
404             fjptrC             = f+j_coord_offsetC;
405             fjptrD             = f+j_coord_offsetD;
406
407             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
408
409             /* Inner loop uses 159 flops */
410         }
411
412         if(jidx<j_index_end)
413         {
414
415             /* Get j neighbor index, and coordinate index */
416             jnrlistA         = jjnr[jidx];
417             jnrlistB         = jjnr[jidx+1];
418             jnrlistC         = jjnr[jidx+2];
419             jnrlistD         = jjnr[jidx+3];
420             /* Sign of each element will be negative for non-real atoms.
421              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
422              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
423              */
424             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
425             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
426             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
427             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
428             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
429             j_coord_offsetA  = DIM*jnrA;
430             j_coord_offsetB  = DIM*jnrB;
431             j_coord_offsetC  = DIM*jnrC;
432             j_coord_offsetD  = DIM*jnrD;
433
434             /* load j atom coordinates */
435             gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
436                                               x+j_coord_offsetC,x+j_coord_offsetD,
437                                               &jx0,&jy0,&jz0);
438
439             /* Calculate displacement vector */
440             dx10             = _mm_sub_ps(ix1,jx0);
441             dy10             = _mm_sub_ps(iy1,jy0);
442             dz10             = _mm_sub_ps(iz1,jz0);
443             dx20             = _mm_sub_ps(ix2,jx0);
444             dy20             = _mm_sub_ps(iy2,jy0);
445             dz20             = _mm_sub_ps(iz2,jz0);
446             dx30             = _mm_sub_ps(ix3,jx0);
447             dy30             = _mm_sub_ps(iy3,jy0);
448             dz30             = _mm_sub_ps(iz3,jz0);
449
450             /* Calculate squared distance and things based on it */
451             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
452             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
453             rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
454
455             rinv10           = gmx_mm_invsqrt_ps(rsq10);
456             rinv20           = gmx_mm_invsqrt_ps(rsq20);
457             rinv30           = gmx_mm_invsqrt_ps(rsq30);
458
459             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
460             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
461             rinvsq30         = _mm_mul_ps(rinv30,rinv30);
462
463             /* Load parameters for j particles */
464             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
465                                                               charge+jnrC+0,charge+jnrD+0);
466
467             fjx0             = _mm_setzero_ps();
468             fjy0             = _mm_setzero_ps();
469             fjz0             = _mm_setzero_ps();
470
471             /**************************
472              * CALCULATE INTERACTIONS *
473              **************************/
474
475             if (gmx_mm_any_lt(rsq10,rcutoff2))
476             {
477
478             r10              = _mm_mul_ps(rsq10,rinv10);
479             r10              = _mm_andnot_ps(dummy_mask,r10);
480
481             /* Compute parameters for interactions between i and j atoms */
482             qq10             = _mm_mul_ps(iq1,jq0);
483
484             /* EWALD ELECTROSTATICS */
485
486             /* Analytical PME correction */
487             zeta2            = _mm_mul_ps(beta2,rsq10);
488             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
489             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
490             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
491             felec            = _mm_mul_ps(qq10,felec);
492             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
493             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
494             velec            = _mm_mul_ps(qq10,velec);
495
496             d                = _mm_sub_ps(r10,rswitch);
497             d                = _mm_max_ps(d,_mm_setzero_ps());
498             d2               = _mm_mul_ps(d,d);
499             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
500
501             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
502
503             /* Evaluate switch function */
504             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
505             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
506             velec            = _mm_mul_ps(velec,sw);
507             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
508
509             /* Update potential sum for this i atom from the interaction with this j atom. */
510             velec            = _mm_and_ps(velec,cutoff_mask);
511             velec            = _mm_andnot_ps(dummy_mask,velec);
512             velecsum         = _mm_add_ps(velecsum,velec);
513
514             fscal            = felec;
515
516             fscal            = _mm_and_ps(fscal,cutoff_mask);
517
518             fscal            = _mm_andnot_ps(dummy_mask,fscal);
519
520              /* Update vectorial force */
521             fix1             = _mm_macc_ps(dx10,fscal,fix1);
522             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
523             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
524
525             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
526             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
527             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
528
529             }
530
531             /**************************
532              * CALCULATE INTERACTIONS *
533              **************************/
534
535             if (gmx_mm_any_lt(rsq20,rcutoff2))
536             {
537
538             r20              = _mm_mul_ps(rsq20,rinv20);
539             r20              = _mm_andnot_ps(dummy_mask,r20);
540
541             /* Compute parameters for interactions between i and j atoms */
542             qq20             = _mm_mul_ps(iq2,jq0);
543
544             /* EWALD ELECTROSTATICS */
545
546             /* Analytical PME correction */
547             zeta2            = _mm_mul_ps(beta2,rsq20);
548             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
549             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
550             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
551             felec            = _mm_mul_ps(qq20,felec);
552             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
553             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
554             velec            = _mm_mul_ps(qq20,velec);
555
556             d                = _mm_sub_ps(r20,rswitch);
557             d                = _mm_max_ps(d,_mm_setzero_ps());
558             d2               = _mm_mul_ps(d,d);
559             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
560
561             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
562
563             /* Evaluate switch function */
564             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
565             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
566             velec            = _mm_mul_ps(velec,sw);
567             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
568
569             /* Update potential sum for this i atom from the interaction with this j atom. */
570             velec            = _mm_and_ps(velec,cutoff_mask);
571             velec            = _mm_andnot_ps(dummy_mask,velec);
572             velecsum         = _mm_add_ps(velecsum,velec);
573
574             fscal            = felec;
575
576             fscal            = _mm_and_ps(fscal,cutoff_mask);
577
578             fscal            = _mm_andnot_ps(dummy_mask,fscal);
579
580              /* Update vectorial force */
581             fix2             = _mm_macc_ps(dx20,fscal,fix2);
582             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
583             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
584
585             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
586             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
587             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
588
589             }
590
591             /**************************
592              * CALCULATE INTERACTIONS *
593              **************************/
594
595             if (gmx_mm_any_lt(rsq30,rcutoff2))
596             {
597
598             r30              = _mm_mul_ps(rsq30,rinv30);
599             r30              = _mm_andnot_ps(dummy_mask,r30);
600
601             /* Compute parameters for interactions between i and j atoms */
602             qq30             = _mm_mul_ps(iq3,jq0);
603
604             /* EWALD ELECTROSTATICS */
605
606             /* Analytical PME correction */
607             zeta2            = _mm_mul_ps(beta2,rsq30);
608             rinv3            = _mm_mul_ps(rinvsq30,rinv30);
609             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
610             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
611             felec            = _mm_mul_ps(qq30,felec);
612             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
613             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv30);
614             velec            = _mm_mul_ps(qq30,velec);
615
616             d                = _mm_sub_ps(r30,rswitch);
617             d                = _mm_max_ps(d,_mm_setzero_ps());
618             d2               = _mm_mul_ps(d,d);
619             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
620
621             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
622
623             /* Evaluate switch function */
624             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
625             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
626             velec            = _mm_mul_ps(velec,sw);
627             cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
628
629             /* Update potential sum for this i atom from the interaction with this j atom. */
630             velec            = _mm_and_ps(velec,cutoff_mask);
631             velec            = _mm_andnot_ps(dummy_mask,velec);
632             velecsum         = _mm_add_ps(velecsum,velec);
633
634             fscal            = felec;
635
636             fscal            = _mm_and_ps(fscal,cutoff_mask);
637
638             fscal            = _mm_andnot_ps(dummy_mask,fscal);
639
640              /* Update vectorial force */
641             fix3             = _mm_macc_ps(dx30,fscal,fix3);
642             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
643             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
644
645             fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
646             fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
647             fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
648
649             }
650
651             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
652             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
653             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
654             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
655
656             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
657
658             /* Inner loop uses 162 flops */
659         }
660
661         /* End of innermost loop */
662
663         gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
664                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
665
666         ggid                        = gid[iidx];
667         /* Update potential energies */
668         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
669
670         /* Increment number of inner iterations */
671         inneriter                  += j_index_end - j_index_start;
672
673         /* Outer loop uses 19 flops */
674     }
675
676     /* Increment number of outer iterations */
677     outeriter        += nri;
678
679     /* Update outer/inner flops */
680
681     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*162);
682 }
683 /*
684  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_avx_128_fma_single
685  * Electrostatics interaction: Ewald
686  * VdW interaction:            None
687  * Geometry:                   Water4-Particle
688  * Calculate force/pot:        Force
689  */
690 void
691 nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_avx_128_fma_single
692                     (t_nblist * gmx_restrict                nlist,
693                      rvec * gmx_restrict                    xx,
694                      rvec * gmx_restrict                    ff,
695                      t_forcerec * gmx_restrict              fr,
696                      t_mdatoms * gmx_restrict               mdatoms,
697                      nb_kernel_data_t * gmx_restrict        kernel_data,
698                      t_nrnb * gmx_restrict                  nrnb)
699 {
700     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
701      * just 0 for non-waters.
702      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
703      * jnr indices corresponding to data put in the four positions in the SIMD register.
704      */
705     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
706     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
707     int              jnrA,jnrB,jnrC,jnrD;
708     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
709     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
710     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
711     real             rcutoff_scalar;
712     real             *shiftvec,*fshift,*x,*f;
713     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
714     real             scratch[4*DIM];
715     __m128           fscal,rcutoff,rcutoff2,jidxall;
716     int              vdwioffset1;
717     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
718     int              vdwioffset2;
719     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
720     int              vdwioffset3;
721     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
722     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
723     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
724     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
725     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
726     __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
727     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
728     real             *charge;
729     __m128i          ewitab;
730     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
731     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
732     real             *ewtab;
733     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
734     real             rswitch_scalar,d_scalar;
735     __m128           dummy_mask,cutoff_mask;
736     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
737     __m128           one     = _mm_set1_ps(1.0);
738     __m128           two     = _mm_set1_ps(2.0);
739     x                = xx[0];
740     f                = ff[0];
741
742     nri              = nlist->nri;
743     iinr             = nlist->iinr;
744     jindex           = nlist->jindex;
745     jjnr             = nlist->jjnr;
746     shiftidx         = nlist->shift;
747     gid              = nlist->gid;
748     shiftvec         = fr->shift_vec[0];
749     fshift           = fr->fshift[0];
750     facel            = _mm_set1_ps(fr->epsfac);
751     charge           = mdatoms->chargeA;
752
753     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
754     beta             = _mm_set1_ps(fr->ic->ewaldcoeff);
755     beta2            = _mm_mul_ps(beta,beta);
756     beta3            = _mm_mul_ps(beta,beta2);
757     ewtab            = fr->ic->tabq_coul_FDV0;
758     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
759     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
760
761     /* Setup water-specific parameters */
762     inr              = nlist->iinr[0];
763     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
764     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
765     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
766
767     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
768     rcutoff_scalar   = fr->rcoulomb;
769     rcutoff          = _mm_set1_ps(rcutoff_scalar);
770     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
771
772     rswitch_scalar   = fr->rcoulomb_switch;
773     rswitch          = _mm_set1_ps(rswitch_scalar);
774     /* Setup switch parameters */
775     d_scalar         = rcutoff_scalar-rswitch_scalar;
776     d                = _mm_set1_ps(d_scalar);
777     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
778     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
779     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
780     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
781     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
782     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
783
784     /* Avoid stupid compiler warnings */
785     jnrA = jnrB = jnrC = jnrD = 0;
786     j_coord_offsetA = 0;
787     j_coord_offsetB = 0;
788     j_coord_offsetC = 0;
789     j_coord_offsetD = 0;
790
791     outeriter        = 0;
792     inneriter        = 0;
793
794     for(iidx=0;iidx<4*DIM;iidx++)
795     {
796         scratch[iidx] = 0.0;
797     }
798
799     /* Start outer loop over neighborlists */
800     for(iidx=0; iidx<nri; iidx++)
801     {
802         /* Load shift vector for this list */
803         i_shift_offset   = DIM*shiftidx[iidx];
804
805         /* Load limits for loop over neighbors */
806         j_index_start    = jindex[iidx];
807         j_index_end      = jindex[iidx+1];
808
809         /* Get outer coordinate index */
810         inr              = iinr[iidx];
811         i_coord_offset   = DIM*inr;
812
813         /* Load i particle coords and add shift vector */
814         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
815                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
816
817         fix1             = _mm_setzero_ps();
818         fiy1             = _mm_setzero_ps();
819         fiz1             = _mm_setzero_ps();
820         fix2             = _mm_setzero_ps();
821         fiy2             = _mm_setzero_ps();
822         fiz2             = _mm_setzero_ps();
823         fix3             = _mm_setzero_ps();
824         fiy3             = _mm_setzero_ps();
825         fiz3             = _mm_setzero_ps();
826
827         /* Start inner kernel loop */
828         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
829         {
830
831             /* Get j neighbor index, and coordinate index */
832             jnrA             = jjnr[jidx];
833             jnrB             = jjnr[jidx+1];
834             jnrC             = jjnr[jidx+2];
835             jnrD             = jjnr[jidx+3];
836             j_coord_offsetA  = DIM*jnrA;
837             j_coord_offsetB  = DIM*jnrB;
838             j_coord_offsetC  = DIM*jnrC;
839             j_coord_offsetD  = DIM*jnrD;
840
841             /* load j atom coordinates */
842             gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
843                                               x+j_coord_offsetC,x+j_coord_offsetD,
844                                               &jx0,&jy0,&jz0);
845
846             /* Calculate displacement vector */
847             dx10             = _mm_sub_ps(ix1,jx0);
848             dy10             = _mm_sub_ps(iy1,jy0);
849             dz10             = _mm_sub_ps(iz1,jz0);
850             dx20             = _mm_sub_ps(ix2,jx0);
851             dy20             = _mm_sub_ps(iy2,jy0);
852             dz20             = _mm_sub_ps(iz2,jz0);
853             dx30             = _mm_sub_ps(ix3,jx0);
854             dy30             = _mm_sub_ps(iy3,jy0);
855             dz30             = _mm_sub_ps(iz3,jz0);
856
857             /* Calculate squared distance and things based on it */
858             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
859             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
860             rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
861
862             rinv10           = gmx_mm_invsqrt_ps(rsq10);
863             rinv20           = gmx_mm_invsqrt_ps(rsq20);
864             rinv30           = gmx_mm_invsqrt_ps(rsq30);
865
866             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
867             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
868             rinvsq30         = _mm_mul_ps(rinv30,rinv30);
869
870             /* Load parameters for j particles */
871             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
872                                                               charge+jnrC+0,charge+jnrD+0);
873
874             fjx0             = _mm_setzero_ps();
875             fjy0             = _mm_setzero_ps();
876             fjz0             = _mm_setzero_ps();
877
878             /**************************
879              * CALCULATE INTERACTIONS *
880              **************************/
881
882             if (gmx_mm_any_lt(rsq10,rcutoff2))
883             {
884
885             r10              = _mm_mul_ps(rsq10,rinv10);
886
887             /* Compute parameters for interactions between i and j atoms */
888             qq10             = _mm_mul_ps(iq1,jq0);
889
890             /* EWALD ELECTROSTATICS */
891
892             /* Analytical PME correction */
893             zeta2            = _mm_mul_ps(beta2,rsq10);
894             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
895             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
896             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
897             felec            = _mm_mul_ps(qq10,felec);
898             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
899             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
900             velec            = _mm_mul_ps(qq10,velec);
901
902             d                = _mm_sub_ps(r10,rswitch);
903             d                = _mm_max_ps(d,_mm_setzero_ps());
904             d2               = _mm_mul_ps(d,d);
905             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
906
907             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
908
909             /* Evaluate switch function */
910             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
911             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
912             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
913
914             fscal            = felec;
915
916             fscal            = _mm_and_ps(fscal,cutoff_mask);
917
918              /* Update vectorial force */
919             fix1             = _mm_macc_ps(dx10,fscal,fix1);
920             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
921             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
922
923             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
924             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
925             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
926
927             }
928
929             /**************************
930              * CALCULATE INTERACTIONS *
931              **************************/
932
933             if (gmx_mm_any_lt(rsq20,rcutoff2))
934             {
935
936             r20              = _mm_mul_ps(rsq20,rinv20);
937
938             /* Compute parameters for interactions between i and j atoms */
939             qq20             = _mm_mul_ps(iq2,jq0);
940
941             /* EWALD ELECTROSTATICS */
942
943             /* Analytical PME correction */
944             zeta2            = _mm_mul_ps(beta2,rsq20);
945             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
946             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
947             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
948             felec            = _mm_mul_ps(qq20,felec);
949             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
950             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
951             velec            = _mm_mul_ps(qq20,velec);
952
953             d                = _mm_sub_ps(r20,rswitch);
954             d                = _mm_max_ps(d,_mm_setzero_ps());
955             d2               = _mm_mul_ps(d,d);
956             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
957
958             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
959
960             /* Evaluate switch function */
961             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
962             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
963             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
964
965             fscal            = felec;
966
967             fscal            = _mm_and_ps(fscal,cutoff_mask);
968
969              /* Update vectorial force */
970             fix2             = _mm_macc_ps(dx20,fscal,fix2);
971             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
972             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
973
974             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
975             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
976             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
977
978             }
979
980             /**************************
981              * CALCULATE INTERACTIONS *
982              **************************/
983
984             if (gmx_mm_any_lt(rsq30,rcutoff2))
985             {
986
987             r30              = _mm_mul_ps(rsq30,rinv30);
988
989             /* Compute parameters for interactions between i and j atoms */
990             qq30             = _mm_mul_ps(iq3,jq0);
991
992             /* EWALD ELECTROSTATICS */
993
994             /* Analytical PME correction */
995             zeta2            = _mm_mul_ps(beta2,rsq30);
996             rinv3            = _mm_mul_ps(rinvsq30,rinv30);
997             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
998             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
999             felec            = _mm_mul_ps(qq30,felec);
1000             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1001             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv30);
1002             velec            = _mm_mul_ps(qq30,velec);
1003
1004             d                = _mm_sub_ps(r30,rswitch);
1005             d                = _mm_max_ps(d,_mm_setzero_ps());
1006             d2               = _mm_mul_ps(d,d);
1007             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1008
1009             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1010
1011             /* Evaluate switch function */
1012             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1013             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
1014             cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
1015
1016             fscal            = felec;
1017
1018             fscal            = _mm_and_ps(fscal,cutoff_mask);
1019
1020              /* Update vectorial force */
1021             fix3             = _mm_macc_ps(dx30,fscal,fix3);
1022             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
1023             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
1024
1025             fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
1026             fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
1027             fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
1028
1029             }
1030
1031             fjptrA             = f+j_coord_offsetA;
1032             fjptrB             = f+j_coord_offsetB;
1033             fjptrC             = f+j_coord_offsetC;
1034             fjptrD             = f+j_coord_offsetD;
1035
1036             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1037
1038             /* Inner loop uses 150 flops */
1039         }
1040
1041         if(jidx<j_index_end)
1042         {
1043
1044             /* Get j neighbor index, and coordinate index */
1045             jnrlistA         = jjnr[jidx];
1046             jnrlistB         = jjnr[jidx+1];
1047             jnrlistC         = jjnr[jidx+2];
1048             jnrlistD         = jjnr[jidx+3];
1049             /* Sign of each element will be negative for non-real atoms.
1050              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1051              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1052              */
1053             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1054             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1055             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1056             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1057             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1058             j_coord_offsetA  = DIM*jnrA;
1059             j_coord_offsetB  = DIM*jnrB;
1060             j_coord_offsetC  = DIM*jnrC;
1061             j_coord_offsetD  = DIM*jnrD;
1062
1063             /* load j atom coordinates */
1064             gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1065                                               x+j_coord_offsetC,x+j_coord_offsetD,
1066                                               &jx0,&jy0,&jz0);
1067
1068             /* Calculate displacement vector */
1069             dx10             = _mm_sub_ps(ix1,jx0);
1070             dy10             = _mm_sub_ps(iy1,jy0);
1071             dz10             = _mm_sub_ps(iz1,jz0);
1072             dx20             = _mm_sub_ps(ix2,jx0);
1073             dy20             = _mm_sub_ps(iy2,jy0);
1074             dz20             = _mm_sub_ps(iz2,jz0);
1075             dx30             = _mm_sub_ps(ix3,jx0);
1076             dy30             = _mm_sub_ps(iy3,jy0);
1077             dz30             = _mm_sub_ps(iz3,jz0);
1078
1079             /* Calculate squared distance and things based on it */
1080             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1081             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1082             rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
1083
1084             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1085             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1086             rinv30           = gmx_mm_invsqrt_ps(rsq30);
1087
1088             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1089             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1090             rinvsq30         = _mm_mul_ps(rinv30,rinv30);
1091
1092             /* Load parameters for j particles */
1093             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1094                                                               charge+jnrC+0,charge+jnrD+0);
1095
1096             fjx0             = _mm_setzero_ps();
1097             fjy0             = _mm_setzero_ps();
1098             fjz0             = _mm_setzero_ps();
1099
1100             /**************************
1101              * CALCULATE INTERACTIONS *
1102              **************************/
1103
1104             if (gmx_mm_any_lt(rsq10,rcutoff2))
1105             {
1106
1107             r10              = _mm_mul_ps(rsq10,rinv10);
1108             r10              = _mm_andnot_ps(dummy_mask,r10);
1109
1110             /* Compute parameters for interactions between i and j atoms */
1111             qq10             = _mm_mul_ps(iq1,jq0);
1112
1113             /* EWALD ELECTROSTATICS */
1114
1115             /* Analytical PME correction */
1116             zeta2            = _mm_mul_ps(beta2,rsq10);
1117             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
1118             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1119             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1120             felec            = _mm_mul_ps(qq10,felec);
1121             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1122             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
1123             velec            = _mm_mul_ps(qq10,velec);
1124
1125             d                = _mm_sub_ps(r10,rswitch);
1126             d                = _mm_max_ps(d,_mm_setzero_ps());
1127             d2               = _mm_mul_ps(d,d);
1128             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1129
1130             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1131
1132             /* Evaluate switch function */
1133             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1134             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1135             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1136
1137             fscal            = felec;
1138
1139             fscal            = _mm_and_ps(fscal,cutoff_mask);
1140
1141             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1142
1143              /* Update vectorial force */
1144             fix1             = _mm_macc_ps(dx10,fscal,fix1);
1145             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
1146             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
1147
1148             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
1149             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
1150             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
1151
1152             }
1153
1154             /**************************
1155              * CALCULATE INTERACTIONS *
1156              **************************/
1157
1158             if (gmx_mm_any_lt(rsq20,rcutoff2))
1159             {
1160
1161             r20              = _mm_mul_ps(rsq20,rinv20);
1162             r20              = _mm_andnot_ps(dummy_mask,r20);
1163
1164             /* Compute parameters for interactions between i and j atoms */
1165             qq20             = _mm_mul_ps(iq2,jq0);
1166
1167             /* EWALD ELECTROSTATICS */
1168
1169             /* Analytical PME correction */
1170             zeta2            = _mm_mul_ps(beta2,rsq20);
1171             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
1172             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1173             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1174             felec            = _mm_mul_ps(qq20,felec);
1175             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1176             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
1177             velec            = _mm_mul_ps(qq20,velec);
1178
1179             d                = _mm_sub_ps(r20,rswitch);
1180             d                = _mm_max_ps(d,_mm_setzero_ps());
1181             d2               = _mm_mul_ps(d,d);
1182             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1183
1184             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1185
1186             /* Evaluate switch function */
1187             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1188             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
1189             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1190
1191             fscal            = felec;
1192
1193             fscal            = _mm_and_ps(fscal,cutoff_mask);
1194
1195             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1196
1197              /* Update vectorial force */
1198             fix2             = _mm_macc_ps(dx20,fscal,fix2);
1199             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
1200             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
1201
1202             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
1203             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
1204             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
1205
1206             }
1207
1208             /**************************
1209              * CALCULATE INTERACTIONS *
1210              **************************/
1211
1212             if (gmx_mm_any_lt(rsq30,rcutoff2))
1213             {
1214
1215             r30              = _mm_mul_ps(rsq30,rinv30);
1216             r30              = _mm_andnot_ps(dummy_mask,r30);
1217
1218             /* Compute parameters for interactions between i and j atoms */
1219             qq30             = _mm_mul_ps(iq3,jq0);
1220
1221             /* EWALD ELECTROSTATICS */
1222
1223             /* Analytical PME correction */
1224             zeta2            = _mm_mul_ps(beta2,rsq30);
1225             rinv3            = _mm_mul_ps(rinvsq30,rinv30);
1226             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1227             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1228             felec            = _mm_mul_ps(qq30,felec);
1229             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1230             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv30);
1231             velec            = _mm_mul_ps(qq30,velec);
1232
1233             d                = _mm_sub_ps(r30,rswitch);
1234             d                = _mm_max_ps(d,_mm_setzero_ps());
1235             d2               = _mm_mul_ps(d,d);
1236             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1237
1238             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1239
1240             /* Evaluate switch function */
1241             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1242             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
1243             cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
1244
1245             fscal            = felec;
1246
1247             fscal            = _mm_and_ps(fscal,cutoff_mask);
1248
1249             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1250
1251              /* Update vectorial force */
1252             fix3             = _mm_macc_ps(dx30,fscal,fix3);
1253             fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
1254             fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
1255
1256             fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
1257             fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
1258             fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
1259
1260             }
1261
1262             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1263             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1264             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1265             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1266
1267             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1268
1269             /* Inner loop uses 153 flops */
1270         }
1271
1272         /* End of innermost loop */
1273
1274         gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1275                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
1276
1277         /* Increment number of inner iterations */
1278         inneriter                  += j_index_end - j_index_start;
1279
1280         /* Outer loop uses 18 flops */
1281     }
1282
1283     /* Increment number of outer iterations */
1284     outeriter        += nri;
1285
1286     /* Update outer/inner flops */
1287
1288     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*153);
1289 }