Introduce gmxpre.h for truly global definitions
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_single / nb_kernel_ElecEwSw_VdwNone_GeomW3P1_avx_128_fma_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/legacyheaders/nrnb.h"
48
49 #include "gromacs/simd/math_x86_avx_128_fma_single.h"
50 #include "kernelutil_x86_avx_128_fma_single.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_avx_128_fma_single
54  * Electrostatics interaction: Ewald
55  * VdW interaction:            None
56  * Geometry:                   Water3-Particle
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_avx_128_fma_single
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70      * just 0 for non-waters.
71      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB,jnrC,jnrD;
77     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
80     real             rcutoff_scalar;
81     real             *shiftvec,*fshift,*x,*f;
82     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83     real             scratch[4*DIM];
84     __m128           fscal,rcutoff,rcutoff2,jidxall;
85     int              vdwioffset0;
86     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87     int              vdwioffset1;
88     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89     int              vdwioffset2;
90     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
94     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
95     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
96     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
97     real             *charge;
98     __m128i          ewitab;
99     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
100     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
101     real             *ewtab;
102     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
103     real             rswitch_scalar,d_scalar;
104     __m128           dummy_mask,cutoff_mask;
105     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
106     __m128           one     = _mm_set1_ps(1.0);
107     __m128           two     = _mm_set1_ps(2.0);
108     x                = xx[0];
109     f                = ff[0];
110
111     nri              = nlist->nri;
112     iinr             = nlist->iinr;
113     jindex           = nlist->jindex;
114     jjnr             = nlist->jjnr;
115     shiftidx         = nlist->shift;
116     gid              = nlist->gid;
117     shiftvec         = fr->shift_vec[0];
118     fshift           = fr->fshift[0];
119     facel            = _mm_set1_ps(fr->epsfac);
120     charge           = mdatoms->chargeA;
121
122     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
123     beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
124     beta2            = _mm_mul_ps(beta,beta);
125     beta3            = _mm_mul_ps(beta,beta2);
126     ewtab            = fr->ic->tabq_coul_FDV0;
127     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
128     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
129
130     /* Setup water-specific parameters */
131     inr              = nlist->iinr[0];
132     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
133     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
134     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
135
136     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
137     rcutoff_scalar   = fr->rcoulomb;
138     rcutoff          = _mm_set1_ps(rcutoff_scalar);
139     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
140
141     rswitch_scalar   = fr->rcoulomb_switch;
142     rswitch          = _mm_set1_ps(rswitch_scalar);
143     /* Setup switch parameters */
144     d_scalar         = rcutoff_scalar-rswitch_scalar;
145     d                = _mm_set1_ps(d_scalar);
146     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
147     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
148     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
149     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
150     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
151     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
152
153     /* Avoid stupid compiler warnings */
154     jnrA = jnrB = jnrC = jnrD = 0;
155     j_coord_offsetA = 0;
156     j_coord_offsetB = 0;
157     j_coord_offsetC = 0;
158     j_coord_offsetD = 0;
159
160     outeriter        = 0;
161     inneriter        = 0;
162
163     for(iidx=0;iidx<4*DIM;iidx++)
164     {
165         scratch[iidx] = 0.0;
166     }
167
168     /* Start outer loop over neighborlists */
169     for(iidx=0; iidx<nri; iidx++)
170     {
171         /* Load shift vector for this list */
172         i_shift_offset   = DIM*shiftidx[iidx];
173
174         /* Load limits for loop over neighbors */
175         j_index_start    = jindex[iidx];
176         j_index_end      = jindex[iidx+1];
177
178         /* Get outer coordinate index */
179         inr              = iinr[iidx];
180         i_coord_offset   = DIM*inr;
181
182         /* Load i particle coords and add shift vector */
183         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
184                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
185
186         fix0             = _mm_setzero_ps();
187         fiy0             = _mm_setzero_ps();
188         fiz0             = _mm_setzero_ps();
189         fix1             = _mm_setzero_ps();
190         fiy1             = _mm_setzero_ps();
191         fiz1             = _mm_setzero_ps();
192         fix2             = _mm_setzero_ps();
193         fiy2             = _mm_setzero_ps();
194         fiz2             = _mm_setzero_ps();
195
196         /* Reset potential sums */
197         velecsum         = _mm_setzero_ps();
198
199         /* Start inner kernel loop */
200         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
201         {
202
203             /* Get j neighbor index, and coordinate index */
204             jnrA             = jjnr[jidx];
205             jnrB             = jjnr[jidx+1];
206             jnrC             = jjnr[jidx+2];
207             jnrD             = jjnr[jidx+3];
208             j_coord_offsetA  = DIM*jnrA;
209             j_coord_offsetB  = DIM*jnrB;
210             j_coord_offsetC  = DIM*jnrC;
211             j_coord_offsetD  = DIM*jnrD;
212
213             /* load j atom coordinates */
214             gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
215                                               x+j_coord_offsetC,x+j_coord_offsetD,
216                                               &jx0,&jy0,&jz0);
217
218             /* Calculate displacement vector */
219             dx00             = _mm_sub_ps(ix0,jx0);
220             dy00             = _mm_sub_ps(iy0,jy0);
221             dz00             = _mm_sub_ps(iz0,jz0);
222             dx10             = _mm_sub_ps(ix1,jx0);
223             dy10             = _mm_sub_ps(iy1,jy0);
224             dz10             = _mm_sub_ps(iz1,jz0);
225             dx20             = _mm_sub_ps(ix2,jx0);
226             dy20             = _mm_sub_ps(iy2,jy0);
227             dz20             = _mm_sub_ps(iz2,jz0);
228
229             /* Calculate squared distance and things based on it */
230             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
231             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
232             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
233
234             rinv00           = gmx_mm_invsqrt_ps(rsq00);
235             rinv10           = gmx_mm_invsqrt_ps(rsq10);
236             rinv20           = gmx_mm_invsqrt_ps(rsq20);
237
238             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
239             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
240             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
241
242             /* Load parameters for j particles */
243             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
244                                                               charge+jnrC+0,charge+jnrD+0);
245
246             fjx0             = _mm_setzero_ps();
247             fjy0             = _mm_setzero_ps();
248             fjz0             = _mm_setzero_ps();
249
250             /**************************
251              * CALCULATE INTERACTIONS *
252              **************************/
253
254             if (gmx_mm_any_lt(rsq00,rcutoff2))
255             {
256
257             r00              = _mm_mul_ps(rsq00,rinv00);
258
259             /* Compute parameters for interactions between i and j atoms */
260             qq00             = _mm_mul_ps(iq0,jq0);
261
262             /* EWALD ELECTROSTATICS */
263
264             /* Analytical PME correction */
265             zeta2            = _mm_mul_ps(beta2,rsq00);
266             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
267             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
268             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
269             felec            = _mm_mul_ps(qq00,felec);
270             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
271             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv00);
272             velec            = _mm_mul_ps(qq00,velec);
273
274             d                = _mm_sub_ps(r00,rswitch);
275             d                = _mm_max_ps(d,_mm_setzero_ps());
276             d2               = _mm_mul_ps(d,d);
277             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
278
279             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
280
281             /* Evaluate switch function */
282             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
283             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
284             velec            = _mm_mul_ps(velec,sw);
285             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
286
287             /* Update potential sum for this i atom from the interaction with this j atom. */
288             velec            = _mm_and_ps(velec,cutoff_mask);
289             velecsum         = _mm_add_ps(velecsum,velec);
290
291             fscal            = felec;
292
293             fscal            = _mm_and_ps(fscal,cutoff_mask);
294
295              /* Update vectorial force */
296             fix0             = _mm_macc_ps(dx00,fscal,fix0);
297             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
298             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
299
300             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
301             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
302             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
303
304             }
305
306             /**************************
307              * CALCULATE INTERACTIONS *
308              **************************/
309
310             if (gmx_mm_any_lt(rsq10,rcutoff2))
311             {
312
313             r10              = _mm_mul_ps(rsq10,rinv10);
314
315             /* Compute parameters for interactions between i and j atoms */
316             qq10             = _mm_mul_ps(iq1,jq0);
317
318             /* EWALD ELECTROSTATICS */
319
320             /* Analytical PME correction */
321             zeta2            = _mm_mul_ps(beta2,rsq10);
322             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
323             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
324             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
325             felec            = _mm_mul_ps(qq10,felec);
326             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
327             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
328             velec            = _mm_mul_ps(qq10,velec);
329
330             d                = _mm_sub_ps(r10,rswitch);
331             d                = _mm_max_ps(d,_mm_setzero_ps());
332             d2               = _mm_mul_ps(d,d);
333             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
334
335             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
336
337             /* Evaluate switch function */
338             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
339             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
340             velec            = _mm_mul_ps(velec,sw);
341             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
342
343             /* Update potential sum for this i atom from the interaction with this j atom. */
344             velec            = _mm_and_ps(velec,cutoff_mask);
345             velecsum         = _mm_add_ps(velecsum,velec);
346
347             fscal            = felec;
348
349             fscal            = _mm_and_ps(fscal,cutoff_mask);
350
351              /* Update vectorial force */
352             fix1             = _mm_macc_ps(dx10,fscal,fix1);
353             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
354             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
355
356             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
357             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
358             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
359
360             }
361
362             /**************************
363              * CALCULATE INTERACTIONS *
364              **************************/
365
366             if (gmx_mm_any_lt(rsq20,rcutoff2))
367             {
368
369             r20              = _mm_mul_ps(rsq20,rinv20);
370
371             /* Compute parameters for interactions between i and j atoms */
372             qq20             = _mm_mul_ps(iq2,jq0);
373
374             /* EWALD ELECTROSTATICS */
375
376             /* Analytical PME correction */
377             zeta2            = _mm_mul_ps(beta2,rsq20);
378             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
379             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
380             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
381             felec            = _mm_mul_ps(qq20,felec);
382             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
383             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
384             velec            = _mm_mul_ps(qq20,velec);
385
386             d                = _mm_sub_ps(r20,rswitch);
387             d                = _mm_max_ps(d,_mm_setzero_ps());
388             d2               = _mm_mul_ps(d,d);
389             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
390
391             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
392
393             /* Evaluate switch function */
394             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
395             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
396             velec            = _mm_mul_ps(velec,sw);
397             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
398
399             /* Update potential sum for this i atom from the interaction with this j atom. */
400             velec            = _mm_and_ps(velec,cutoff_mask);
401             velecsum         = _mm_add_ps(velecsum,velec);
402
403             fscal            = felec;
404
405             fscal            = _mm_and_ps(fscal,cutoff_mask);
406
407              /* Update vectorial force */
408             fix2             = _mm_macc_ps(dx20,fscal,fix2);
409             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
410             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
411
412             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
413             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
414             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
415
416             }
417
418             fjptrA             = f+j_coord_offsetA;
419             fjptrB             = f+j_coord_offsetB;
420             fjptrC             = f+j_coord_offsetC;
421             fjptrD             = f+j_coord_offsetD;
422
423             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
424
425             /* Inner loop uses 159 flops */
426         }
427
428         if(jidx<j_index_end)
429         {
430
431             /* Get j neighbor index, and coordinate index */
432             jnrlistA         = jjnr[jidx];
433             jnrlistB         = jjnr[jidx+1];
434             jnrlistC         = jjnr[jidx+2];
435             jnrlistD         = jjnr[jidx+3];
436             /* Sign of each element will be negative for non-real atoms.
437              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
438              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
439              */
440             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
441             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
442             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
443             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
444             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
445             j_coord_offsetA  = DIM*jnrA;
446             j_coord_offsetB  = DIM*jnrB;
447             j_coord_offsetC  = DIM*jnrC;
448             j_coord_offsetD  = DIM*jnrD;
449
450             /* load j atom coordinates */
451             gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
452                                               x+j_coord_offsetC,x+j_coord_offsetD,
453                                               &jx0,&jy0,&jz0);
454
455             /* Calculate displacement vector */
456             dx00             = _mm_sub_ps(ix0,jx0);
457             dy00             = _mm_sub_ps(iy0,jy0);
458             dz00             = _mm_sub_ps(iz0,jz0);
459             dx10             = _mm_sub_ps(ix1,jx0);
460             dy10             = _mm_sub_ps(iy1,jy0);
461             dz10             = _mm_sub_ps(iz1,jz0);
462             dx20             = _mm_sub_ps(ix2,jx0);
463             dy20             = _mm_sub_ps(iy2,jy0);
464             dz20             = _mm_sub_ps(iz2,jz0);
465
466             /* Calculate squared distance and things based on it */
467             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
468             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
469             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
470
471             rinv00           = gmx_mm_invsqrt_ps(rsq00);
472             rinv10           = gmx_mm_invsqrt_ps(rsq10);
473             rinv20           = gmx_mm_invsqrt_ps(rsq20);
474
475             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
476             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
477             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
478
479             /* Load parameters for j particles */
480             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
481                                                               charge+jnrC+0,charge+jnrD+0);
482
483             fjx0             = _mm_setzero_ps();
484             fjy0             = _mm_setzero_ps();
485             fjz0             = _mm_setzero_ps();
486
487             /**************************
488              * CALCULATE INTERACTIONS *
489              **************************/
490
491             if (gmx_mm_any_lt(rsq00,rcutoff2))
492             {
493
494             r00              = _mm_mul_ps(rsq00,rinv00);
495             r00              = _mm_andnot_ps(dummy_mask,r00);
496
497             /* Compute parameters for interactions between i and j atoms */
498             qq00             = _mm_mul_ps(iq0,jq0);
499
500             /* EWALD ELECTROSTATICS */
501
502             /* Analytical PME correction */
503             zeta2            = _mm_mul_ps(beta2,rsq00);
504             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
505             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
506             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
507             felec            = _mm_mul_ps(qq00,felec);
508             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
509             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv00);
510             velec            = _mm_mul_ps(qq00,velec);
511
512             d                = _mm_sub_ps(r00,rswitch);
513             d                = _mm_max_ps(d,_mm_setzero_ps());
514             d2               = _mm_mul_ps(d,d);
515             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
516
517             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
518
519             /* Evaluate switch function */
520             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
521             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
522             velec            = _mm_mul_ps(velec,sw);
523             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
524
525             /* Update potential sum for this i atom from the interaction with this j atom. */
526             velec            = _mm_and_ps(velec,cutoff_mask);
527             velec            = _mm_andnot_ps(dummy_mask,velec);
528             velecsum         = _mm_add_ps(velecsum,velec);
529
530             fscal            = felec;
531
532             fscal            = _mm_and_ps(fscal,cutoff_mask);
533
534             fscal            = _mm_andnot_ps(dummy_mask,fscal);
535
536              /* Update vectorial force */
537             fix0             = _mm_macc_ps(dx00,fscal,fix0);
538             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
539             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
540
541             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
542             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
543             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
544
545             }
546
547             /**************************
548              * CALCULATE INTERACTIONS *
549              **************************/
550
551             if (gmx_mm_any_lt(rsq10,rcutoff2))
552             {
553
554             r10              = _mm_mul_ps(rsq10,rinv10);
555             r10              = _mm_andnot_ps(dummy_mask,r10);
556
557             /* Compute parameters for interactions between i and j atoms */
558             qq10             = _mm_mul_ps(iq1,jq0);
559
560             /* EWALD ELECTROSTATICS */
561
562             /* Analytical PME correction */
563             zeta2            = _mm_mul_ps(beta2,rsq10);
564             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
565             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
566             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
567             felec            = _mm_mul_ps(qq10,felec);
568             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
569             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
570             velec            = _mm_mul_ps(qq10,velec);
571
572             d                = _mm_sub_ps(r10,rswitch);
573             d                = _mm_max_ps(d,_mm_setzero_ps());
574             d2               = _mm_mul_ps(d,d);
575             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
576
577             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
578
579             /* Evaluate switch function */
580             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
581             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
582             velec            = _mm_mul_ps(velec,sw);
583             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
584
585             /* Update potential sum for this i atom from the interaction with this j atom. */
586             velec            = _mm_and_ps(velec,cutoff_mask);
587             velec            = _mm_andnot_ps(dummy_mask,velec);
588             velecsum         = _mm_add_ps(velecsum,velec);
589
590             fscal            = felec;
591
592             fscal            = _mm_and_ps(fscal,cutoff_mask);
593
594             fscal            = _mm_andnot_ps(dummy_mask,fscal);
595
596              /* Update vectorial force */
597             fix1             = _mm_macc_ps(dx10,fscal,fix1);
598             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
599             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
600
601             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
602             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
603             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
604
605             }
606
607             /**************************
608              * CALCULATE INTERACTIONS *
609              **************************/
610
611             if (gmx_mm_any_lt(rsq20,rcutoff2))
612             {
613
614             r20              = _mm_mul_ps(rsq20,rinv20);
615             r20              = _mm_andnot_ps(dummy_mask,r20);
616
617             /* Compute parameters for interactions between i and j atoms */
618             qq20             = _mm_mul_ps(iq2,jq0);
619
620             /* EWALD ELECTROSTATICS */
621
622             /* Analytical PME correction */
623             zeta2            = _mm_mul_ps(beta2,rsq20);
624             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
625             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
626             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
627             felec            = _mm_mul_ps(qq20,felec);
628             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
629             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
630             velec            = _mm_mul_ps(qq20,velec);
631
632             d                = _mm_sub_ps(r20,rswitch);
633             d                = _mm_max_ps(d,_mm_setzero_ps());
634             d2               = _mm_mul_ps(d,d);
635             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
636
637             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
638
639             /* Evaluate switch function */
640             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
641             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
642             velec            = _mm_mul_ps(velec,sw);
643             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
644
645             /* Update potential sum for this i atom from the interaction with this j atom. */
646             velec            = _mm_and_ps(velec,cutoff_mask);
647             velec            = _mm_andnot_ps(dummy_mask,velec);
648             velecsum         = _mm_add_ps(velecsum,velec);
649
650             fscal            = felec;
651
652             fscal            = _mm_and_ps(fscal,cutoff_mask);
653
654             fscal            = _mm_andnot_ps(dummy_mask,fscal);
655
656              /* Update vectorial force */
657             fix2             = _mm_macc_ps(dx20,fscal,fix2);
658             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
659             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
660
661             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
662             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
663             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
664
665             }
666
667             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
668             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
669             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
670             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
671
672             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
673
674             /* Inner loop uses 162 flops */
675         }
676
677         /* End of innermost loop */
678
679         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
680                                               f+i_coord_offset,fshift+i_shift_offset);
681
682         ggid                        = gid[iidx];
683         /* Update potential energies */
684         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
685
686         /* Increment number of inner iterations */
687         inneriter                  += j_index_end - j_index_start;
688
689         /* Outer loop uses 19 flops */
690     }
691
692     /* Increment number of outer iterations */
693     outeriter        += nri;
694
695     /* Update outer/inner flops */
696
697     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*162);
698 }
699 /*
700  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_avx_128_fma_single
701  * Electrostatics interaction: Ewald
702  * VdW interaction:            None
703  * Geometry:                   Water3-Particle
704  * Calculate force/pot:        Force
705  */
706 void
707 nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_avx_128_fma_single
708                     (t_nblist                    * gmx_restrict       nlist,
709                      rvec                        * gmx_restrict          xx,
710                      rvec                        * gmx_restrict          ff,
711                      t_forcerec                  * gmx_restrict          fr,
712                      t_mdatoms                   * gmx_restrict     mdatoms,
713                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
714                      t_nrnb                      * gmx_restrict        nrnb)
715 {
716     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
717      * just 0 for non-waters.
718      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
719      * jnr indices corresponding to data put in the four positions in the SIMD register.
720      */
721     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
722     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
723     int              jnrA,jnrB,jnrC,jnrD;
724     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
725     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
726     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
727     real             rcutoff_scalar;
728     real             *shiftvec,*fshift,*x,*f;
729     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
730     real             scratch[4*DIM];
731     __m128           fscal,rcutoff,rcutoff2,jidxall;
732     int              vdwioffset0;
733     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
734     int              vdwioffset1;
735     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
736     int              vdwioffset2;
737     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
738     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
739     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
740     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
741     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
742     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
743     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
744     real             *charge;
745     __m128i          ewitab;
746     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
747     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
748     real             *ewtab;
749     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
750     real             rswitch_scalar,d_scalar;
751     __m128           dummy_mask,cutoff_mask;
752     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
753     __m128           one     = _mm_set1_ps(1.0);
754     __m128           two     = _mm_set1_ps(2.0);
755     x                = xx[0];
756     f                = ff[0];
757
758     nri              = nlist->nri;
759     iinr             = nlist->iinr;
760     jindex           = nlist->jindex;
761     jjnr             = nlist->jjnr;
762     shiftidx         = nlist->shift;
763     gid              = nlist->gid;
764     shiftvec         = fr->shift_vec[0];
765     fshift           = fr->fshift[0];
766     facel            = _mm_set1_ps(fr->epsfac);
767     charge           = mdatoms->chargeA;
768
769     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
770     beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
771     beta2            = _mm_mul_ps(beta,beta);
772     beta3            = _mm_mul_ps(beta,beta2);
773     ewtab            = fr->ic->tabq_coul_FDV0;
774     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
775     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
776
777     /* Setup water-specific parameters */
778     inr              = nlist->iinr[0];
779     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
780     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
781     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
782
783     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
784     rcutoff_scalar   = fr->rcoulomb;
785     rcutoff          = _mm_set1_ps(rcutoff_scalar);
786     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
787
788     rswitch_scalar   = fr->rcoulomb_switch;
789     rswitch          = _mm_set1_ps(rswitch_scalar);
790     /* Setup switch parameters */
791     d_scalar         = rcutoff_scalar-rswitch_scalar;
792     d                = _mm_set1_ps(d_scalar);
793     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
794     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
795     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
796     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
797     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
798     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
799
800     /* Avoid stupid compiler warnings */
801     jnrA = jnrB = jnrC = jnrD = 0;
802     j_coord_offsetA = 0;
803     j_coord_offsetB = 0;
804     j_coord_offsetC = 0;
805     j_coord_offsetD = 0;
806
807     outeriter        = 0;
808     inneriter        = 0;
809
810     for(iidx=0;iidx<4*DIM;iidx++)
811     {
812         scratch[iidx] = 0.0;
813     }
814
815     /* Start outer loop over neighborlists */
816     for(iidx=0; iidx<nri; iidx++)
817     {
818         /* Load shift vector for this list */
819         i_shift_offset   = DIM*shiftidx[iidx];
820
821         /* Load limits for loop over neighbors */
822         j_index_start    = jindex[iidx];
823         j_index_end      = jindex[iidx+1];
824
825         /* Get outer coordinate index */
826         inr              = iinr[iidx];
827         i_coord_offset   = DIM*inr;
828
829         /* Load i particle coords and add shift vector */
830         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
831                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
832
833         fix0             = _mm_setzero_ps();
834         fiy0             = _mm_setzero_ps();
835         fiz0             = _mm_setzero_ps();
836         fix1             = _mm_setzero_ps();
837         fiy1             = _mm_setzero_ps();
838         fiz1             = _mm_setzero_ps();
839         fix2             = _mm_setzero_ps();
840         fiy2             = _mm_setzero_ps();
841         fiz2             = _mm_setzero_ps();
842
843         /* Start inner kernel loop */
844         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
845         {
846
847             /* Get j neighbor index, and coordinate index */
848             jnrA             = jjnr[jidx];
849             jnrB             = jjnr[jidx+1];
850             jnrC             = jjnr[jidx+2];
851             jnrD             = jjnr[jidx+3];
852             j_coord_offsetA  = DIM*jnrA;
853             j_coord_offsetB  = DIM*jnrB;
854             j_coord_offsetC  = DIM*jnrC;
855             j_coord_offsetD  = DIM*jnrD;
856
857             /* load j atom coordinates */
858             gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
859                                               x+j_coord_offsetC,x+j_coord_offsetD,
860                                               &jx0,&jy0,&jz0);
861
862             /* Calculate displacement vector */
863             dx00             = _mm_sub_ps(ix0,jx0);
864             dy00             = _mm_sub_ps(iy0,jy0);
865             dz00             = _mm_sub_ps(iz0,jz0);
866             dx10             = _mm_sub_ps(ix1,jx0);
867             dy10             = _mm_sub_ps(iy1,jy0);
868             dz10             = _mm_sub_ps(iz1,jz0);
869             dx20             = _mm_sub_ps(ix2,jx0);
870             dy20             = _mm_sub_ps(iy2,jy0);
871             dz20             = _mm_sub_ps(iz2,jz0);
872
873             /* Calculate squared distance and things based on it */
874             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
875             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
876             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
877
878             rinv00           = gmx_mm_invsqrt_ps(rsq00);
879             rinv10           = gmx_mm_invsqrt_ps(rsq10);
880             rinv20           = gmx_mm_invsqrt_ps(rsq20);
881
882             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
883             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
884             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
885
886             /* Load parameters for j particles */
887             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
888                                                               charge+jnrC+0,charge+jnrD+0);
889
890             fjx0             = _mm_setzero_ps();
891             fjy0             = _mm_setzero_ps();
892             fjz0             = _mm_setzero_ps();
893
894             /**************************
895              * CALCULATE INTERACTIONS *
896              **************************/
897
898             if (gmx_mm_any_lt(rsq00,rcutoff2))
899             {
900
901             r00              = _mm_mul_ps(rsq00,rinv00);
902
903             /* Compute parameters for interactions between i and j atoms */
904             qq00             = _mm_mul_ps(iq0,jq0);
905
906             /* EWALD ELECTROSTATICS */
907
908             /* Analytical PME correction */
909             zeta2            = _mm_mul_ps(beta2,rsq00);
910             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
911             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
912             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
913             felec            = _mm_mul_ps(qq00,felec);
914             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
915             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv00);
916             velec            = _mm_mul_ps(qq00,velec);
917
918             d                = _mm_sub_ps(r00,rswitch);
919             d                = _mm_max_ps(d,_mm_setzero_ps());
920             d2               = _mm_mul_ps(d,d);
921             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
922
923             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
924
925             /* Evaluate switch function */
926             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
927             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
928             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
929
930             fscal            = felec;
931
932             fscal            = _mm_and_ps(fscal,cutoff_mask);
933
934              /* Update vectorial force */
935             fix0             = _mm_macc_ps(dx00,fscal,fix0);
936             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
937             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
938
939             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
940             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
941             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
942
943             }
944
945             /**************************
946              * CALCULATE INTERACTIONS *
947              **************************/
948
949             if (gmx_mm_any_lt(rsq10,rcutoff2))
950             {
951
952             r10              = _mm_mul_ps(rsq10,rinv10);
953
954             /* Compute parameters for interactions between i and j atoms */
955             qq10             = _mm_mul_ps(iq1,jq0);
956
957             /* EWALD ELECTROSTATICS */
958
959             /* Analytical PME correction */
960             zeta2            = _mm_mul_ps(beta2,rsq10);
961             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
962             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
963             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
964             felec            = _mm_mul_ps(qq10,felec);
965             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
966             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
967             velec            = _mm_mul_ps(qq10,velec);
968
969             d                = _mm_sub_ps(r10,rswitch);
970             d                = _mm_max_ps(d,_mm_setzero_ps());
971             d2               = _mm_mul_ps(d,d);
972             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
973
974             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
975
976             /* Evaluate switch function */
977             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
978             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
979             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
980
981             fscal            = felec;
982
983             fscal            = _mm_and_ps(fscal,cutoff_mask);
984
985              /* Update vectorial force */
986             fix1             = _mm_macc_ps(dx10,fscal,fix1);
987             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
988             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
989
990             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
991             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
992             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
993
994             }
995
996             /**************************
997              * CALCULATE INTERACTIONS *
998              **************************/
999
1000             if (gmx_mm_any_lt(rsq20,rcutoff2))
1001             {
1002
1003             r20              = _mm_mul_ps(rsq20,rinv20);
1004
1005             /* Compute parameters for interactions between i and j atoms */
1006             qq20             = _mm_mul_ps(iq2,jq0);
1007
1008             /* EWALD ELECTROSTATICS */
1009
1010             /* Analytical PME correction */
1011             zeta2            = _mm_mul_ps(beta2,rsq20);
1012             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
1013             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1014             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1015             felec            = _mm_mul_ps(qq20,felec);
1016             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1017             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
1018             velec            = _mm_mul_ps(qq20,velec);
1019
1020             d                = _mm_sub_ps(r20,rswitch);
1021             d                = _mm_max_ps(d,_mm_setzero_ps());
1022             d2               = _mm_mul_ps(d,d);
1023             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1024
1025             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1026
1027             /* Evaluate switch function */
1028             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1029             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
1030             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1031
1032             fscal            = felec;
1033
1034             fscal            = _mm_and_ps(fscal,cutoff_mask);
1035
1036              /* Update vectorial force */
1037             fix2             = _mm_macc_ps(dx20,fscal,fix2);
1038             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
1039             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
1040
1041             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
1042             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
1043             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
1044
1045             }
1046
1047             fjptrA             = f+j_coord_offsetA;
1048             fjptrB             = f+j_coord_offsetB;
1049             fjptrC             = f+j_coord_offsetC;
1050             fjptrD             = f+j_coord_offsetD;
1051
1052             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1053
1054             /* Inner loop uses 150 flops */
1055         }
1056
1057         if(jidx<j_index_end)
1058         {
1059
1060             /* Get j neighbor index, and coordinate index */
1061             jnrlistA         = jjnr[jidx];
1062             jnrlistB         = jjnr[jidx+1];
1063             jnrlistC         = jjnr[jidx+2];
1064             jnrlistD         = jjnr[jidx+3];
1065             /* Sign of each element will be negative for non-real atoms.
1066              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1067              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1068              */
1069             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1070             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1071             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1072             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1073             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1074             j_coord_offsetA  = DIM*jnrA;
1075             j_coord_offsetB  = DIM*jnrB;
1076             j_coord_offsetC  = DIM*jnrC;
1077             j_coord_offsetD  = DIM*jnrD;
1078
1079             /* load j atom coordinates */
1080             gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1081                                               x+j_coord_offsetC,x+j_coord_offsetD,
1082                                               &jx0,&jy0,&jz0);
1083
1084             /* Calculate displacement vector */
1085             dx00             = _mm_sub_ps(ix0,jx0);
1086             dy00             = _mm_sub_ps(iy0,jy0);
1087             dz00             = _mm_sub_ps(iz0,jz0);
1088             dx10             = _mm_sub_ps(ix1,jx0);
1089             dy10             = _mm_sub_ps(iy1,jy0);
1090             dz10             = _mm_sub_ps(iz1,jz0);
1091             dx20             = _mm_sub_ps(ix2,jx0);
1092             dy20             = _mm_sub_ps(iy2,jy0);
1093             dz20             = _mm_sub_ps(iz2,jz0);
1094
1095             /* Calculate squared distance and things based on it */
1096             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1097             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1098             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1099
1100             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1101             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1102             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1103
1104             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1105             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1106             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1107
1108             /* Load parameters for j particles */
1109             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1110                                                               charge+jnrC+0,charge+jnrD+0);
1111
1112             fjx0             = _mm_setzero_ps();
1113             fjy0             = _mm_setzero_ps();
1114             fjz0             = _mm_setzero_ps();
1115
1116             /**************************
1117              * CALCULATE INTERACTIONS *
1118              **************************/
1119
1120             if (gmx_mm_any_lt(rsq00,rcutoff2))
1121             {
1122
1123             r00              = _mm_mul_ps(rsq00,rinv00);
1124             r00              = _mm_andnot_ps(dummy_mask,r00);
1125
1126             /* Compute parameters for interactions between i and j atoms */
1127             qq00             = _mm_mul_ps(iq0,jq0);
1128
1129             /* EWALD ELECTROSTATICS */
1130
1131             /* Analytical PME correction */
1132             zeta2            = _mm_mul_ps(beta2,rsq00);
1133             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
1134             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1135             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1136             felec            = _mm_mul_ps(qq00,felec);
1137             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1138             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv00);
1139             velec            = _mm_mul_ps(qq00,velec);
1140
1141             d                = _mm_sub_ps(r00,rswitch);
1142             d                = _mm_max_ps(d,_mm_setzero_ps());
1143             d2               = _mm_mul_ps(d,d);
1144             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1145
1146             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1147
1148             /* Evaluate switch function */
1149             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1150             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
1151             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1152
1153             fscal            = felec;
1154
1155             fscal            = _mm_and_ps(fscal,cutoff_mask);
1156
1157             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1158
1159              /* Update vectorial force */
1160             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1161             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1162             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1163
1164             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1165             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1166             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1167
1168             }
1169
1170             /**************************
1171              * CALCULATE INTERACTIONS *
1172              **************************/
1173
1174             if (gmx_mm_any_lt(rsq10,rcutoff2))
1175             {
1176
1177             r10              = _mm_mul_ps(rsq10,rinv10);
1178             r10              = _mm_andnot_ps(dummy_mask,r10);
1179
1180             /* Compute parameters for interactions between i and j atoms */
1181             qq10             = _mm_mul_ps(iq1,jq0);
1182
1183             /* EWALD ELECTROSTATICS */
1184
1185             /* Analytical PME correction */
1186             zeta2            = _mm_mul_ps(beta2,rsq10);
1187             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
1188             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1189             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1190             felec            = _mm_mul_ps(qq10,felec);
1191             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1192             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
1193             velec            = _mm_mul_ps(qq10,velec);
1194
1195             d                = _mm_sub_ps(r10,rswitch);
1196             d                = _mm_max_ps(d,_mm_setzero_ps());
1197             d2               = _mm_mul_ps(d,d);
1198             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1199
1200             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1201
1202             /* Evaluate switch function */
1203             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1204             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1205             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1206
1207             fscal            = felec;
1208
1209             fscal            = _mm_and_ps(fscal,cutoff_mask);
1210
1211             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1212
1213              /* Update vectorial force */
1214             fix1             = _mm_macc_ps(dx10,fscal,fix1);
1215             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
1216             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
1217
1218             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
1219             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
1220             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
1221
1222             }
1223
1224             /**************************
1225              * CALCULATE INTERACTIONS *
1226              **************************/
1227
1228             if (gmx_mm_any_lt(rsq20,rcutoff2))
1229             {
1230
1231             r20              = _mm_mul_ps(rsq20,rinv20);
1232             r20              = _mm_andnot_ps(dummy_mask,r20);
1233
1234             /* Compute parameters for interactions between i and j atoms */
1235             qq20             = _mm_mul_ps(iq2,jq0);
1236
1237             /* EWALD ELECTROSTATICS */
1238
1239             /* Analytical PME correction */
1240             zeta2            = _mm_mul_ps(beta2,rsq20);
1241             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
1242             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1243             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1244             felec            = _mm_mul_ps(qq20,felec);
1245             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1246             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
1247             velec            = _mm_mul_ps(qq20,velec);
1248
1249             d                = _mm_sub_ps(r20,rswitch);
1250             d                = _mm_max_ps(d,_mm_setzero_ps());
1251             d2               = _mm_mul_ps(d,d);
1252             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1253
1254             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1255
1256             /* Evaluate switch function */
1257             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1258             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
1259             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1260
1261             fscal            = felec;
1262
1263             fscal            = _mm_and_ps(fscal,cutoff_mask);
1264
1265             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1266
1267              /* Update vectorial force */
1268             fix2             = _mm_macc_ps(dx20,fscal,fix2);
1269             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
1270             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
1271
1272             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
1273             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
1274             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
1275
1276             }
1277
1278             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1279             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1280             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1281             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1282
1283             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1284
1285             /* Inner loop uses 153 flops */
1286         }
1287
1288         /* End of innermost loop */
1289
1290         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1291                                               f+i_coord_offset,fshift+i_shift_offset);
1292
1293         /* Increment number of inner iterations */
1294         inneriter                  += j_index_end - j_index_start;
1295
1296         /* Outer loop uses 18 flops */
1297     }
1298
1299     /* Increment number of outer iterations */
1300     outeriter        += nri;
1301
1302     /* Update outer/inner flops */
1303
1304     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*153);
1305 }