Added option to gmx nmeig to print ZPE.
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_double / nb_kernel_ElecCoul_VdwNone_GeomW4P1_avx_256_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_double kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_avx_256_double.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_avx_256_double
51  * Electrostatics interaction: Coulomb
52  * VdW interaction:            None
53  * Geometry:                   Water4-Particle
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_avx_256_double
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
67      * just 0 for non-waters.
68      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB,jnrC,jnrD;
74     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
76     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81     real             scratch[4*DIM];
82     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83     real *           vdwioffsetptr1;
84     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
85     real *           vdwioffsetptr2;
86     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
87     real *           vdwioffsetptr3;
88     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
89     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
90     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91     __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
92     __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
93     __m256d          dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
94     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
95     real             *charge;
96     __m256d          dummy_mask,cutoff_mask;
97     __m128           tmpmask0,tmpmask1;
98     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
99     __m256d          one     = _mm256_set1_pd(1.0);
100     __m256d          two     = _mm256_set1_pd(2.0);
101     x                = xx[0];
102     f                = ff[0];
103
104     nri              = nlist->nri;
105     iinr             = nlist->iinr;
106     jindex           = nlist->jindex;
107     jjnr             = nlist->jjnr;
108     shiftidx         = nlist->shift;
109     gid              = nlist->gid;
110     shiftvec         = fr->shift_vec[0];
111     fshift           = fr->fshift[0];
112     facel            = _mm256_set1_pd(fr->ic->epsfac);
113     charge           = mdatoms->chargeA;
114
115     /* Setup water-specific parameters */
116     inr              = nlist->iinr[0];
117     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
118     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
119     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
120
121     /* Avoid stupid compiler warnings */
122     jnrA = jnrB = jnrC = jnrD = 0;
123     j_coord_offsetA = 0;
124     j_coord_offsetB = 0;
125     j_coord_offsetC = 0;
126     j_coord_offsetD = 0;
127
128     outeriter        = 0;
129     inneriter        = 0;
130
131     for(iidx=0;iidx<4*DIM;iidx++)
132     {
133         scratch[iidx] = 0.0;
134     }
135
136     /* Start outer loop over neighborlists */
137     for(iidx=0; iidx<nri; iidx++)
138     {
139         /* Load shift vector for this list */
140         i_shift_offset   = DIM*shiftidx[iidx];
141
142         /* Load limits for loop over neighbors */
143         j_index_start    = jindex[iidx];
144         j_index_end      = jindex[iidx+1];
145
146         /* Get outer coordinate index */
147         inr              = iinr[iidx];
148         i_coord_offset   = DIM*inr;
149
150         /* Load i particle coords and add shift vector */
151         gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
152                                                     &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
153
154         fix1             = _mm256_setzero_pd();
155         fiy1             = _mm256_setzero_pd();
156         fiz1             = _mm256_setzero_pd();
157         fix2             = _mm256_setzero_pd();
158         fiy2             = _mm256_setzero_pd();
159         fiz2             = _mm256_setzero_pd();
160         fix3             = _mm256_setzero_pd();
161         fiy3             = _mm256_setzero_pd();
162         fiz3             = _mm256_setzero_pd();
163
164         /* Reset potential sums */
165         velecsum         = _mm256_setzero_pd();
166
167         /* Start inner kernel loop */
168         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
169         {
170
171             /* Get j neighbor index, and coordinate index */
172             jnrA             = jjnr[jidx];
173             jnrB             = jjnr[jidx+1];
174             jnrC             = jjnr[jidx+2];
175             jnrD             = jjnr[jidx+3];
176             j_coord_offsetA  = DIM*jnrA;
177             j_coord_offsetB  = DIM*jnrB;
178             j_coord_offsetC  = DIM*jnrC;
179             j_coord_offsetD  = DIM*jnrD;
180
181             /* load j atom coordinates */
182             gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
183                                                  x+j_coord_offsetC,x+j_coord_offsetD,
184                                                  &jx0,&jy0,&jz0);
185
186             /* Calculate displacement vector */
187             dx10             = _mm256_sub_pd(ix1,jx0);
188             dy10             = _mm256_sub_pd(iy1,jy0);
189             dz10             = _mm256_sub_pd(iz1,jz0);
190             dx20             = _mm256_sub_pd(ix2,jx0);
191             dy20             = _mm256_sub_pd(iy2,jy0);
192             dz20             = _mm256_sub_pd(iz2,jz0);
193             dx30             = _mm256_sub_pd(ix3,jx0);
194             dy30             = _mm256_sub_pd(iy3,jy0);
195             dz30             = _mm256_sub_pd(iz3,jz0);
196
197             /* Calculate squared distance and things based on it */
198             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
199             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
200             rsq30            = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
201
202             rinv10           = avx256_invsqrt_d(rsq10);
203             rinv20           = avx256_invsqrt_d(rsq20);
204             rinv30           = avx256_invsqrt_d(rsq30);
205
206             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
207             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
208             rinvsq30         = _mm256_mul_pd(rinv30,rinv30);
209
210             /* Load parameters for j particles */
211             jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
212                                                                  charge+jnrC+0,charge+jnrD+0);
213
214             fjx0             = _mm256_setzero_pd();
215             fjy0             = _mm256_setzero_pd();
216             fjz0             = _mm256_setzero_pd();
217
218             /**************************
219              * CALCULATE INTERACTIONS *
220              **************************/
221
222             /* Compute parameters for interactions between i and j atoms */
223             qq10             = _mm256_mul_pd(iq1,jq0);
224
225             /* COULOMB ELECTROSTATICS */
226             velec            = _mm256_mul_pd(qq10,rinv10);
227             felec            = _mm256_mul_pd(velec,rinvsq10);
228
229             /* Update potential sum for this i atom from the interaction with this j atom. */
230             velecsum         = _mm256_add_pd(velecsum,velec);
231
232             fscal            = felec;
233
234             /* Calculate temporary vectorial force */
235             tx               = _mm256_mul_pd(fscal,dx10);
236             ty               = _mm256_mul_pd(fscal,dy10);
237             tz               = _mm256_mul_pd(fscal,dz10);
238
239             /* Update vectorial force */
240             fix1             = _mm256_add_pd(fix1,tx);
241             fiy1             = _mm256_add_pd(fiy1,ty);
242             fiz1             = _mm256_add_pd(fiz1,tz);
243
244             fjx0             = _mm256_add_pd(fjx0,tx);
245             fjy0             = _mm256_add_pd(fjy0,ty);
246             fjz0             = _mm256_add_pd(fjz0,tz);
247
248             /**************************
249              * CALCULATE INTERACTIONS *
250              **************************/
251
252             /* Compute parameters for interactions between i and j atoms */
253             qq20             = _mm256_mul_pd(iq2,jq0);
254
255             /* COULOMB ELECTROSTATICS */
256             velec            = _mm256_mul_pd(qq20,rinv20);
257             felec            = _mm256_mul_pd(velec,rinvsq20);
258
259             /* Update potential sum for this i atom from the interaction with this j atom. */
260             velecsum         = _mm256_add_pd(velecsum,velec);
261
262             fscal            = felec;
263
264             /* Calculate temporary vectorial force */
265             tx               = _mm256_mul_pd(fscal,dx20);
266             ty               = _mm256_mul_pd(fscal,dy20);
267             tz               = _mm256_mul_pd(fscal,dz20);
268
269             /* Update vectorial force */
270             fix2             = _mm256_add_pd(fix2,tx);
271             fiy2             = _mm256_add_pd(fiy2,ty);
272             fiz2             = _mm256_add_pd(fiz2,tz);
273
274             fjx0             = _mm256_add_pd(fjx0,tx);
275             fjy0             = _mm256_add_pd(fjy0,ty);
276             fjz0             = _mm256_add_pd(fjz0,tz);
277
278             /**************************
279              * CALCULATE INTERACTIONS *
280              **************************/
281
282             /* Compute parameters for interactions between i and j atoms */
283             qq30             = _mm256_mul_pd(iq3,jq0);
284
285             /* COULOMB ELECTROSTATICS */
286             velec            = _mm256_mul_pd(qq30,rinv30);
287             felec            = _mm256_mul_pd(velec,rinvsq30);
288
289             /* Update potential sum for this i atom from the interaction with this j atom. */
290             velecsum         = _mm256_add_pd(velecsum,velec);
291
292             fscal            = felec;
293
294             /* Calculate temporary vectorial force */
295             tx               = _mm256_mul_pd(fscal,dx30);
296             ty               = _mm256_mul_pd(fscal,dy30);
297             tz               = _mm256_mul_pd(fscal,dz30);
298
299             /* Update vectorial force */
300             fix3             = _mm256_add_pd(fix3,tx);
301             fiy3             = _mm256_add_pd(fiy3,ty);
302             fiz3             = _mm256_add_pd(fiz3,tz);
303
304             fjx0             = _mm256_add_pd(fjx0,tx);
305             fjy0             = _mm256_add_pd(fjy0,ty);
306             fjz0             = _mm256_add_pd(fjz0,tz);
307
308             fjptrA             = f+j_coord_offsetA;
309             fjptrB             = f+j_coord_offsetB;
310             fjptrC             = f+j_coord_offsetC;
311             fjptrD             = f+j_coord_offsetD;
312
313             gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
314
315             /* Inner loop uses 84 flops */
316         }
317
318         if(jidx<j_index_end)
319         {
320
321             /* Get j neighbor index, and coordinate index */
322             jnrlistA         = jjnr[jidx];
323             jnrlistB         = jjnr[jidx+1];
324             jnrlistC         = jjnr[jidx+2];
325             jnrlistD         = jjnr[jidx+3];
326             /* Sign of each element will be negative for non-real atoms.
327              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
328              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
329              */
330             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
331
332             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
333             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
334             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
335
336             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
337             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
338             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
339             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
340             j_coord_offsetA  = DIM*jnrA;
341             j_coord_offsetB  = DIM*jnrB;
342             j_coord_offsetC  = DIM*jnrC;
343             j_coord_offsetD  = DIM*jnrD;
344
345             /* load j atom coordinates */
346             gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
347                                                  x+j_coord_offsetC,x+j_coord_offsetD,
348                                                  &jx0,&jy0,&jz0);
349
350             /* Calculate displacement vector */
351             dx10             = _mm256_sub_pd(ix1,jx0);
352             dy10             = _mm256_sub_pd(iy1,jy0);
353             dz10             = _mm256_sub_pd(iz1,jz0);
354             dx20             = _mm256_sub_pd(ix2,jx0);
355             dy20             = _mm256_sub_pd(iy2,jy0);
356             dz20             = _mm256_sub_pd(iz2,jz0);
357             dx30             = _mm256_sub_pd(ix3,jx0);
358             dy30             = _mm256_sub_pd(iy3,jy0);
359             dz30             = _mm256_sub_pd(iz3,jz0);
360
361             /* Calculate squared distance and things based on it */
362             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
363             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
364             rsq30            = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
365
366             rinv10           = avx256_invsqrt_d(rsq10);
367             rinv20           = avx256_invsqrt_d(rsq20);
368             rinv30           = avx256_invsqrt_d(rsq30);
369
370             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
371             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
372             rinvsq30         = _mm256_mul_pd(rinv30,rinv30);
373
374             /* Load parameters for j particles */
375             jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
376                                                                  charge+jnrC+0,charge+jnrD+0);
377
378             fjx0             = _mm256_setzero_pd();
379             fjy0             = _mm256_setzero_pd();
380             fjz0             = _mm256_setzero_pd();
381
382             /**************************
383              * CALCULATE INTERACTIONS *
384              **************************/
385
386             /* Compute parameters for interactions between i and j atoms */
387             qq10             = _mm256_mul_pd(iq1,jq0);
388
389             /* COULOMB ELECTROSTATICS */
390             velec            = _mm256_mul_pd(qq10,rinv10);
391             felec            = _mm256_mul_pd(velec,rinvsq10);
392
393             /* Update potential sum for this i atom from the interaction with this j atom. */
394             velec            = _mm256_andnot_pd(dummy_mask,velec);
395             velecsum         = _mm256_add_pd(velecsum,velec);
396
397             fscal            = felec;
398
399             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
400
401             /* Calculate temporary vectorial force */
402             tx               = _mm256_mul_pd(fscal,dx10);
403             ty               = _mm256_mul_pd(fscal,dy10);
404             tz               = _mm256_mul_pd(fscal,dz10);
405
406             /* Update vectorial force */
407             fix1             = _mm256_add_pd(fix1,tx);
408             fiy1             = _mm256_add_pd(fiy1,ty);
409             fiz1             = _mm256_add_pd(fiz1,tz);
410
411             fjx0             = _mm256_add_pd(fjx0,tx);
412             fjy0             = _mm256_add_pd(fjy0,ty);
413             fjz0             = _mm256_add_pd(fjz0,tz);
414
415             /**************************
416              * CALCULATE INTERACTIONS *
417              **************************/
418
419             /* Compute parameters for interactions between i and j atoms */
420             qq20             = _mm256_mul_pd(iq2,jq0);
421
422             /* COULOMB ELECTROSTATICS */
423             velec            = _mm256_mul_pd(qq20,rinv20);
424             felec            = _mm256_mul_pd(velec,rinvsq20);
425
426             /* Update potential sum for this i atom from the interaction with this j atom. */
427             velec            = _mm256_andnot_pd(dummy_mask,velec);
428             velecsum         = _mm256_add_pd(velecsum,velec);
429
430             fscal            = felec;
431
432             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
433
434             /* Calculate temporary vectorial force */
435             tx               = _mm256_mul_pd(fscal,dx20);
436             ty               = _mm256_mul_pd(fscal,dy20);
437             tz               = _mm256_mul_pd(fscal,dz20);
438
439             /* Update vectorial force */
440             fix2             = _mm256_add_pd(fix2,tx);
441             fiy2             = _mm256_add_pd(fiy2,ty);
442             fiz2             = _mm256_add_pd(fiz2,tz);
443
444             fjx0             = _mm256_add_pd(fjx0,tx);
445             fjy0             = _mm256_add_pd(fjy0,ty);
446             fjz0             = _mm256_add_pd(fjz0,tz);
447
448             /**************************
449              * CALCULATE INTERACTIONS *
450              **************************/
451
452             /* Compute parameters for interactions between i and j atoms */
453             qq30             = _mm256_mul_pd(iq3,jq0);
454
455             /* COULOMB ELECTROSTATICS */
456             velec            = _mm256_mul_pd(qq30,rinv30);
457             felec            = _mm256_mul_pd(velec,rinvsq30);
458
459             /* Update potential sum for this i atom from the interaction with this j atom. */
460             velec            = _mm256_andnot_pd(dummy_mask,velec);
461             velecsum         = _mm256_add_pd(velecsum,velec);
462
463             fscal            = felec;
464
465             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
466
467             /* Calculate temporary vectorial force */
468             tx               = _mm256_mul_pd(fscal,dx30);
469             ty               = _mm256_mul_pd(fscal,dy30);
470             tz               = _mm256_mul_pd(fscal,dz30);
471
472             /* Update vectorial force */
473             fix3             = _mm256_add_pd(fix3,tx);
474             fiy3             = _mm256_add_pd(fiy3,ty);
475             fiz3             = _mm256_add_pd(fiz3,tz);
476
477             fjx0             = _mm256_add_pd(fjx0,tx);
478             fjy0             = _mm256_add_pd(fjy0,ty);
479             fjz0             = _mm256_add_pd(fjz0,tz);
480
481             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
482             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
483             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
484             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
485
486             gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
487
488             /* Inner loop uses 84 flops */
489         }
490
491         /* End of innermost loop */
492
493         gmx_mm256_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
494                                                  f+i_coord_offset+DIM,fshift+i_shift_offset);
495
496         ggid                        = gid[iidx];
497         /* Update potential energies */
498         gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
499
500         /* Increment number of inner iterations */
501         inneriter                  += j_index_end - j_index_start;
502
503         /* Outer loop uses 19 flops */
504     }
505
506     /* Increment number of outer iterations */
507     outeriter        += nri;
508
509     /* Update outer/inner flops */
510
511     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*84);
512 }
513 /*
514  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_avx_256_double
515  * Electrostatics interaction: Coulomb
516  * VdW interaction:            None
517  * Geometry:                   Water4-Particle
518  * Calculate force/pot:        Force
519  */
520 void
521 nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_avx_256_double
522                     (t_nblist                    * gmx_restrict       nlist,
523                      rvec                        * gmx_restrict          xx,
524                      rvec                        * gmx_restrict          ff,
525                      struct t_forcerec           * gmx_restrict          fr,
526                      t_mdatoms                   * gmx_restrict     mdatoms,
527                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
528                      t_nrnb                      * gmx_restrict        nrnb)
529 {
530     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
531      * just 0 for non-waters.
532      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
533      * jnr indices corresponding to data put in the four positions in the SIMD register.
534      */
535     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
536     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
537     int              jnrA,jnrB,jnrC,jnrD;
538     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
539     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
540     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
541     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
542     real             rcutoff_scalar;
543     real             *shiftvec,*fshift,*x,*f;
544     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
545     real             scratch[4*DIM];
546     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
547     real *           vdwioffsetptr1;
548     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
549     real *           vdwioffsetptr2;
550     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
551     real *           vdwioffsetptr3;
552     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
553     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
554     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
555     __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
556     __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
557     __m256d          dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
558     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
559     real             *charge;
560     __m256d          dummy_mask,cutoff_mask;
561     __m128           tmpmask0,tmpmask1;
562     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
563     __m256d          one     = _mm256_set1_pd(1.0);
564     __m256d          two     = _mm256_set1_pd(2.0);
565     x                = xx[0];
566     f                = ff[0];
567
568     nri              = nlist->nri;
569     iinr             = nlist->iinr;
570     jindex           = nlist->jindex;
571     jjnr             = nlist->jjnr;
572     shiftidx         = nlist->shift;
573     gid              = nlist->gid;
574     shiftvec         = fr->shift_vec[0];
575     fshift           = fr->fshift[0];
576     facel            = _mm256_set1_pd(fr->ic->epsfac);
577     charge           = mdatoms->chargeA;
578
579     /* Setup water-specific parameters */
580     inr              = nlist->iinr[0];
581     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
582     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
583     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
584
585     /* Avoid stupid compiler warnings */
586     jnrA = jnrB = jnrC = jnrD = 0;
587     j_coord_offsetA = 0;
588     j_coord_offsetB = 0;
589     j_coord_offsetC = 0;
590     j_coord_offsetD = 0;
591
592     outeriter        = 0;
593     inneriter        = 0;
594
595     for(iidx=0;iidx<4*DIM;iidx++)
596     {
597         scratch[iidx] = 0.0;
598     }
599
600     /* Start outer loop over neighborlists */
601     for(iidx=0; iidx<nri; iidx++)
602     {
603         /* Load shift vector for this list */
604         i_shift_offset   = DIM*shiftidx[iidx];
605
606         /* Load limits for loop over neighbors */
607         j_index_start    = jindex[iidx];
608         j_index_end      = jindex[iidx+1];
609
610         /* Get outer coordinate index */
611         inr              = iinr[iidx];
612         i_coord_offset   = DIM*inr;
613
614         /* Load i particle coords and add shift vector */
615         gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
616                                                     &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
617
618         fix1             = _mm256_setzero_pd();
619         fiy1             = _mm256_setzero_pd();
620         fiz1             = _mm256_setzero_pd();
621         fix2             = _mm256_setzero_pd();
622         fiy2             = _mm256_setzero_pd();
623         fiz2             = _mm256_setzero_pd();
624         fix3             = _mm256_setzero_pd();
625         fiy3             = _mm256_setzero_pd();
626         fiz3             = _mm256_setzero_pd();
627
628         /* Start inner kernel loop */
629         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
630         {
631
632             /* Get j neighbor index, and coordinate index */
633             jnrA             = jjnr[jidx];
634             jnrB             = jjnr[jidx+1];
635             jnrC             = jjnr[jidx+2];
636             jnrD             = jjnr[jidx+3];
637             j_coord_offsetA  = DIM*jnrA;
638             j_coord_offsetB  = DIM*jnrB;
639             j_coord_offsetC  = DIM*jnrC;
640             j_coord_offsetD  = DIM*jnrD;
641
642             /* load j atom coordinates */
643             gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
644                                                  x+j_coord_offsetC,x+j_coord_offsetD,
645                                                  &jx0,&jy0,&jz0);
646
647             /* Calculate displacement vector */
648             dx10             = _mm256_sub_pd(ix1,jx0);
649             dy10             = _mm256_sub_pd(iy1,jy0);
650             dz10             = _mm256_sub_pd(iz1,jz0);
651             dx20             = _mm256_sub_pd(ix2,jx0);
652             dy20             = _mm256_sub_pd(iy2,jy0);
653             dz20             = _mm256_sub_pd(iz2,jz0);
654             dx30             = _mm256_sub_pd(ix3,jx0);
655             dy30             = _mm256_sub_pd(iy3,jy0);
656             dz30             = _mm256_sub_pd(iz3,jz0);
657
658             /* Calculate squared distance and things based on it */
659             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
660             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
661             rsq30            = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
662
663             rinv10           = avx256_invsqrt_d(rsq10);
664             rinv20           = avx256_invsqrt_d(rsq20);
665             rinv30           = avx256_invsqrt_d(rsq30);
666
667             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
668             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
669             rinvsq30         = _mm256_mul_pd(rinv30,rinv30);
670
671             /* Load parameters for j particles */
672             jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
673                                                                  charge+jnrC+0,charge+jnrD+0);
674
675             fjx0             = _mm256_setzero_pd();
676             fjy0             = _mm256_setzero_pd();
677             fjz0             = _mm256_setzero_pd();
678
679             /**************************
680              * CALCULATE INTERACTIONS *
681              **************************/
682
683             /* Compute parameters for interactions between i and j atoms */
684             qq10             = _mm256_mul_pd(iq1,jq0);
685
686             /* COULOMB ELECTROSTATICS */
687             velec            = _mm256_mul_pd(qq10,rinv10);
688             felec            = _mm256_mul_pd(velec,rinvsq10);
689
690             fscal            = felec;
691
692             /* Calculate temporary vectorial force */
693             tx               = _mm256_mul_pd(fscal,dx10);
694             ty               = _mm256_mul_pd(fscal,dy10);
695             tz               = _mm256_mul_pd(fscal,dz10);
696
697             /* Update vectorial force */
698             fix1             = _mm256_add_pd(fix1,tx);
699             fiy1             = _mm256_add_pd(fiy1,ty);
700             fiz1             = _mm256_add_pd(fiz1,tz);
701
702             fjx0             = _mm256_add_pd(fjx0,tx);
703             fjy0             = _mm256_add_pd(fjy0,ty);
704             fjz0             = _mm256_add_pd(fjz0,tz);
705
706             /**************************
707              * CALCULATE INTERACTIONS *
708              **************************/
709
710             /* Compute parameters for interactions between i and j atoms */
711             qq20             = _mm256_mul_pd(iq2,jq0);
712
713             /* COULOMB ELECTROSTATICS */
714             velec            = _mm256_mul_pd(qq20,rinv20);
715             felec            = _mm256_mul_pd(velec,rinvsq20);
716
717             fscal            = felec;
718
719             /* Calculate temporary vectorial force */
720             tx               = _mm256_mul_pd(fscal,dx20);
721             ty               = _mm256_mul_pd(fscal,dy20);
722             tz               = _mm256_mul_pd(fscal,dz20);
723
724             /* Update vectorial force */
725             fix2             = _mm256_add_pd(fix2,tx);
726             fiy2             = _mm256_add_pd(fiy2,ty);
727             fiz2             = _mm256_add_pd(fiz2,tz);
728
729             fjx0             = _mm256_add_pd(fjx0,tx);
730             fjy0             = _mm256_add_pd(fjy0,ty);
731             fjz0             = _mm256_add_pd(fjz0,tz);
732
733             /**************************
734              * CALCULATE INTERACTIONS *
735              **************************/
736
737             /* Compute parameters for interactions between i and j atoms */
738             qq30             = _mm256_mul_pd(iq3,jq0);
739
740             /* COULOMB ELECTROSTATICS */
741             velec            = _mm256_mul_pd(qq30,rinv30);
742             felec            = _mm256_mul_pd(velec,rinvsq30);
743
744             fscal            = felec;
745
746             /* Calculate temporary vectorial force */
747             tx               = _mm256_mul_pd(fscal,dx30);
748             ty               = _mm256_mul_pd(fscal,dy30);
749             tz               = _mm256_mul_pd(fscal,dz30);
750
751             /* Update vectorial force */
752             fix3             = _mm256_add_pd(fix3,tx);
753             fiy3             = _mm256_add_pd(fiy3,ty);
754             fiz3             = _mm256_add_pd(fiz3,tz);
755
756             fjx0             = _mm256_add_pd(fjx0,tx);
757             fjy0             = _mm256_add_pd(fjy0,ty);
758             fjz0             = _mm256_add_pd(fjz0,tz);
759
760             fjptrA             = f+j_coord_offsetA;
761             fjptrB             = f+j_coord_offsetB;
762             fjptrC             = f+j_coord_offsetC;
763             fjptrD             = f+j_coord_offsetD;
764
765             gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
766
767             /* Inner loop uses 81 flops */
768         }
769
770         if(jidx<j_index_end)
771         {
772
773             /* Get j neighbor index, and coordinate index */
774             jnrlistA         = jjnr[jidx];
775             jnrlistB         = jjnr[jidx+1];
776             jnrlistC         = jjnr[jidx+2];
777             jnrlistD         = jjnr[jidx+3];
778             /* Sign of each element will be negative for non-real atoms.
779              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
780              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
781              */
782             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
783
784             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
785             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
786             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
787
788             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
789             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
790             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
791             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
792             j_coord_offsetA  = DIM*jnrA;
793             j_coord_offsetB  = DIM*jnrB;
794             j_coord_offsetC  = DIM*jnrC;
795             j_coord_offsetD  = DIM*jnrD;
796
797             /* load j atom coordinates */
798             gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
799                                                  x+j_coord_offsetC,x+j_coord_offsetD,
800                                                  &jx0,&jy0,&jz0);
801
802             /* Calculate displacement vector */
803             dx10             = _mm256_sub_pd(ix1,jx0);
804             dy10             = _mm256_sub_pd(iy1,jy0);
805             dz10             = _mm256_sub_pd(iz1,jz0);
806             dx20             = _mm256_sub_pd(ix2,jx0);
807             dy20             = _mm256_sub_pd(iy2,jy0);
808             dz20             = _mm256_sub_pd(iz2,jz0);
809             dx30             = _mm256_sub_pd(ix3,jx0);
810             dy30             = _mm256_sub_pd(iy3,jy0);
811             dz30             = _mm256_sub_pd(iz3,jz0);
812
813             /* Calculate squared distance and things based on it */
814             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
815             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
816             rsq30            = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
817
818             rinv10           = avx256_invsqrt_d(rsq10);
819             rinv20           = avx256_invsqrt_d(rsq20);
820             rinv30           = avx256_invsqrt_d(rsq30);
821
822             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
823             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
824             rinvsq30         = _mm256_mul_pd(rinv30,rinv30);
825
826             /* Load parameters for j particles */
827             jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
828                                                                  charge+jnrC+0,charge+jnrD+0);
829
830             fjx0             = _mm256_setzero_pd();
831             fjy0             = _mm256_setzero_pd();
832             fjz0             = _mm256_setzero_pd();
833
834             /**************************
835              * CALCULATE INTERACTIONS *
836              **************************/
837
838             /* Compute parameters for interactions between i and j atoms */
839             qq10             = _mm256_mul_pd(iq1,jq0);
840
841             /* COULOMB ELECTROSTATICS */
842             velec            = _mm256_mul_pd(qq10,rinv10);
843             felec            = _mm256_mul_pd(velec,rinvsq10);
844
845             fscal            = felec;
846
847             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
848
849             /* Calculate temporary vectorial force */
850             tx               = _mm256_mul_pd(fscal,dx10);
851             ty               = _mm256_mul_pd(fscal,dy10);
852             tz               = _mm256_mul_pd(fscal,dz10);
853
854             /* Update vectorial force */
855             fix1             = _mm256_add_pd(fix1,tx);
856             fiy1             = _mm256_add_pd(fiy1,ty);
857             fiz1             = _mm256_add_pd(fiz1,tz);
858
859             fjx0             = _mm256_add_pd(fjx0,tx);
860             fjy0             = _mm256_add_pd(fjy0,ty);
861             fjz0             = _mm256_add_pd(fjz0,tz);
862
863             /**************************
864              * CALCULATE INTERACTIONS *
865              **************************/
866
867             /* Compute parameters for interactions between i and j atoms */
868             qq20             = _mm256_mul_pd(iq2,jq0);
869
870             /* COULOMB ELECTROSTATICS */
871             velec            = _mm256_mul_pd(qq20,rinv20);
872             felec            = _mm256_mul_pd(velec,rinvsq20);
873
874             fscal            = felec;
875
876             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
877
878             /* Calculate temporary vectorial force */
879             tx               = _mm256_mul_pd(fscal,dx20);
880             ty               = _mm256_mul_pd(fscal,dy20);
881             tz               = _mm256_mul_pd(fscal,dz20);
882
883             /* Update vectorial force */
884             fix2             = _mm256_add_pd(fix2,tx);
885             fiy2             = _mm256_add_pd(fiy2,ty);
886             fiz2             = _mm256_add_pd(fiz2,tz);
887
888             fjx0             = _mm256_add_pd(fjx0,tx);
889             fjy0             = _mm256_add_pd(fjy0,ty);
890             fjz0             = _mm256_add_pd(fjz0,tz);
891
892             /**************************
893              * CALCULATE INTERACTIONS *
894              **************************/
895
896             /* Compute parameters for interactions between i and j atoms */
897             qq30             = _mm256_mul_pd(iq3,jq0);
898
899             /* COULOMB ELECTROSTATICS */
900             velec            = _mm256_mul_pd(qq30,rinv30);
901             felec            = _mm256_mul_pd(velec,rinvsq30);
902
903             fscal            = felec;
904
905             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
906
907             /* Calculate temporary vectorial force */
908             tx               = _mm256_mul_pd(fscal,dx30);
909             ty               = _mm256_mul_pd(fscal,dy30);
910             tz               = _mm256_mul_pd(fscal,dz30);
911
912             /* Update vectorial force */
913             fix3             = _mm256_add_pd(fix3,tx);
914             fiy3             = _mm256_add_pd(fiy3,ty);
915             fiz3             = _mm256_add_pd(fiz3,tz);
916
917             fjx0             = _mm256_add_pd(fjx0,tx);
918             fjy0             = _mm256_add_pd(fjy0,ty);
919             fjz0             = _mm256_add_pd(fjz0,tz);
920
921             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
922             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
923             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
924             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
925
926             gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
927
928             /* Inner loop uses 81 flops */
929         }
930
931         /* End of innermost loop */
932
933         gmx_mm256_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
934                                                  f+i_coord_offset+DIM,fshift+i_shift_offset);
935
936         /* Increment number of inner iterations */
937         inneriter                  += j_index_end - j_index_start;
938
939         /* Outer loop uses 18 flops */
940     }
941
942     /* Increment number of outer iterations */
943     outeriter        += nri;
944
945     /* Update outer/inner flops */
946
947     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*81);
948 }