Use full path for legacyheaders
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_single / nb_kernel_ElecRF_VdwLJ_GeomW4P1_avx_256_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_single kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
46
47 #include "gromacs/simd/math_x86_avx_256_single.h"
48 #include "kernelutil_x86_avx_256_single.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_avx_256_single
52  * Electrostatics interaction: ReactionField
53  * VdW interaction:            LennardJones
54  * Geometry:                   Water4-Particle
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_avx_256_single
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrE,jnrF,jnrG,jnrH;
76     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
77     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
78     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
80     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
81     real             rcutoff_scalar;
82     real             *shiftvec,*fshift,*x,*f;
83     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
84     real             scratch[4*DIM];
85     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
86     real *           vdwioffsetptr0;
87     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
88     real *           vdwioffsetptr1;
89     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90     real *           vdwioffsetptr2;
91     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92     real *           vdwioffsetptr3;
93     __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
94     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
95     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
96     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
97     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
98     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
99     __m256           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
100     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
101     real             *charge;
102     int              nvdwtype;
103     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
104     int              *vdwtype;
105     real             *vdwparam;
106     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
107     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
108     __m256           dummy_mask,cutoff_mask;
109     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
110     __m256           one     = _mm256_set1_ps(1.0);
111     __m256           two     = _mm256_set1_ps(2.0);
112     x                = xx[0];
113     f                = ff[0];
114
115     nri              = nlist->nri;
116     iinr             = nlist->iinr;
117     jindex           = nlist->jindex;
118     jjnr             = nlist->jjnr;
119     shiftidx         = nlist->shift;
120     gid              = nlist->gid;
121     shiftvec         = fr->shift_vec[0];
122     fshift           = fr->fshift[0];
123     facel            = _mm256_set1_ps(fr->epsfac);
124     charge           = mdatoms->chargeA;
125     krf              = _mm256_set1_ps(fr->ic->k_rf);
126     krf2             = _mm256_set1_ps(fr->ic->k_rf*2.0);
127     crf              = _mm256_set1_ps(fr->ic->c_rf);
128     nvdwtype         = fr->ntype;
129     vdwparam         = fr->nbfp;
130     vdwtype          = mdatoms->typeA;
131
132     /* Setup water-specific parameters */
133     inr              = nlist->iinr[0];
134     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
135     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
136     iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
137     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
138
139     /* Avoid stupid compiler warnings */
140     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
141     j_coord_offsetA = 0;
142     j_coord_offsetB = 0;
143     j_coord_offsetC = 0;
144     j_coord_offsetD = 0;
145     j_coord_offsetE = 0;
146     j_coord_offsetF = 0;
147     j_coord_offsetG = 0;
148     j_coord_offsetH = 0;
149
150     outeriter        = 0;
151     inneriter        = 0;
152
153     for(iidx=0;iidx<4*DIM;iidx++)
154     {
155         scratch[iidx] = 0.0;
156     }
157
158     /* Start outer loop over neighborlists */
159     for(iidx=0; iidx<nri; iidx++)
160     {
161         /* Load shift vector for this list */
162         i_shift_offset   = DIM*shiftidx[iidx];
163
164         /* Load limits for loop over neighbors */
165         j_index_start    = jindex[iidx];
166         j_index_end      = jindex[iidx+1];
167
168         /* Get outer coordinate index */
169         inr              = iinr[iidx];
170         i_coord_offset   = DIM*inr;
171
172         /* Load i particle coords and add shift vector */
173         gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
174                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
175
176         fix0             = _mm256_setzero_ps();
177         fiy0             = _mm256_setzero_ps();
178         fiz0             = _mm256_setzero_ps();
179         fix1             = _mm256_setzero_ps();
180         fiy1             = _mm256_setzero_ps();
181         fiz1             = _mm256_setzero_ps();
182         fix2             = _mm256_setzero_ps();
183         fiy2             = _mm256_setzero_ps();
184         fiz2             = _mm256_setzero_ps();
185         fix3             = _mm256_setzero_ps();
186         fiy3             = _mm256_setzero_ps();
187         fiz3             = _mm256_setzero_ps();
188
189         /* Reset potential sums */
190         velecsum         = _mm256_setzero_ps();
191         vvdwsum          = _mm256_setzero_ps();
192
193         /* Start inner kernel loop */
194         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
195         {
196
197             /* Get j neighbor index, and coordinate index */
198             jnrA             = jjnr[jidx];
199             jnrB             = jjnr[jidx+1];
200             jnrC             = jjnr[jidx+2];
201             jnrD             = jjnr[jidx+3];
202             jnrE             = jjnr[jidx+4];
203             jnrF             = jjnr[jidx+5];
204             jnrG             = jjnr[jidx+6];
205             jnrH             = jjnr[jidx+7];
206             j_coord_offsetA  = DIM*jnrA;
207             j_coord_offsetB  = DIM*jnrB;
208             j_coord_offsetC  = DIM*jnrC;
209             j_coord_offsetD  = DIM*jnrD;
210             j_coord_offsetE  = DIM*jnrE;
211             j_coord_offsetF  = DIM*jnrF;
212             j_coord_offsetG  = DIM*jnrG;
213             j_coord_offsetH  = DIM*jnrH;
214
215             /* load j atom coordinates */
216             gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
217                                                  x+j_coord_offsetC,x+j_coord_offsetD,
218                                                  x+j_coord_offsetE,x+j_coord_offsetF,
219                                                  x+j_coord_offsetG,x+j_coord_offsetH,
220                                                  &jx0,&jy0,&jz0);
221
222             /* Calculate displacement vector */
223             dx00             = _mm256_sub_ps(ix0,jx0);
224             dy00             = _mm256_sub_ps(iy0,jy0);
225             dz00             = _mm256_sub_ps(iz0,jz0);
226             dx10             = _mm256_sub_ps(ix1,jx0);
227             dy10             = _mm256_sub_ps(iy1,jy0);
228             dz10             = _mm256_sub_ps(iz1,jz0);
229             dx20             = _mm256_sub_ps(ix2,jx0);
230             dy20             = _mm256_sub_ps(iy2,jy0);
231             dz20             = _mm256_sub_ps(iz2,jz0);
232             dx30             = _mm256_sub_ps(ix3,jx0);
233             dy30             = _mm256_sub_ps(iy3,jy0);
234             dz30             = _mm256_sub_ps(iz3,jz0);
235
236             /* Calculate squared distance and things based on it */
237             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
238             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
239             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
240             rsq30            = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
241
242             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
243             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
244             rinv30           = gmx_mm256_invsqrt_ps(rsq30);
245
246             rinvsq00         = gmx_mm256_inv_ps(rsq00);
247             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
248             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
249             rinvsq30         = _mm256_mul_ps(rinv30,rinv30);
250
251             /* Load parameters for j particles */
252             jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
253                                                                  charge+jnrC+0,charge+jnrD+0,
254                                                                  charge+jnrE+0,charge+jnrF+0,
255                                                                  charge+jnrG+0,charge+jnrH+0);
256             vdwjidx0A        = 2*vdwtype[jnrA+0];
257             vdwjidx0B        = 2*vdwtype[jnrB+0];
258             vdwjidx0C        = 2*vdwtype[jnrC+0];
259             vdwjidx0D        = 2*vdwtype[jnrD+0];
260             vdwjidx0E        = 2*vdwtype[jnrE+0];
261             vdwjidx0F        = 2*vdwtype[jnrF+0];
262             vdwjidx0G        = 2*vdwtype[jnrG+0];
263             vdwjidx0H        = 2*vdwtype[jnrH+0];
264
265             fjx0             = _mm256_setzero_ps();
266             fjy0             = _mm256_setzero_ps();
267             fjz0             = _mm256_setzero_ps();
268
269             /**************************
270              * CALCULATE INTERACTIONS *
271              **************************/
272
273             /* Compute parameters for interactions between i and j atoms */
274             gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
275                                             vdwioffsetptr0+vdwjidx0B,
276                                             vdwioffsetptr0+vdwjidx0C,
277                                             vdwioffsetptr0+vdwjidx0D,
278                                             vdwioffsetptr0+vdwjidx0E,
279                                             vdwioffsetptr0+vdwjidx0F,
280                                             vdwioffsetptr0+vdwjidx0G,
281                                             vdwioffsetptr0+vdwjidx0H,
282                                             &c6_00,&c12_00);
283
284             /* LENNARD-JONES DISPERSION/REPULSION */
285
286             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
287             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
288             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
289             vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
290             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
291
292             /* Update potential sum for this i atom from the interaction with this j atom. */
293             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
294
295             fscal            = fvdw;
296
297             /* Calculate temporary vectorial force */
298             tx               = _mm256_mul_ps(fscal,dx00);
299             ty               = _mm256_mul_ps(fscal,dy00);
300             tz               = _mm256_mul_ps(fscal,dz00);
301
302             /* Update vectorial force */
303             fix0             = _mm256_add_ps(fix0,tx);
304             fiy0             = _mm256_add_ps(fiy0,ty);
305             fiz0             = _mm256_add_ps(fiz0,tz);
306
307             fjx0             = _mm256_add_ps(fjx0,tx);
308             fjy0             = _mm256_add_ps(fjy0,ty);
309             fjz0             = _mm256_add_ps(fjz0,tz);
310
311             /**************************
312              * CALCULATE INTERACTIONS *
313              **************************/
314
315             /* Compute parameters for interactions between i and j atoms */
316             qq10             = _mm256_mul_ps(iq1,jq0);
317
318             /* REACTION-FIELD ELECTROSTATICS */
319             velec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_add_ps(rinv10,_mm256_mul_ps(krf,rsq10)),crf));
320             felec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
321
322             /* Update potential sum for this i atom from the interaction with this j atom. */
323             velecsum         = _mm256_add_ps(velecsum,velec);
324
325             fscal            = felec;
326
327             /* Calculate temporary vectorial force */
328             tx               = _mm256_mul_ps(fscal,dx10);
329             ty               = _mm256_mul_ps(fscal,dy10);
330             tz               = _mm256_mul_ps(fscal,dz10);
331
332             /* Update vectorial force */
333             fix1             = _mm256_add_ps(fix1,tx);
334             fiy1             = _mm256_add_ps(fiy1,ty);
335             fiz1             = _mm256_add_ps(fiz1,tz);
336
337             fjx0             = _mm256_add_ps(fjx0,tx);
338             fjy0             = _mm256_add_ps(fjy0,ty);
339             fjz0             = _mm256_add_ps(fjz0,tz);
340
341             /**************************
342              * CALCULATE INTERACTIONS *
343              **************************/
344
345             /* Compute parameters for interactions between i and j atoms */
346             qq20             = _mm256_mul_ps(iq2,jq0);
347
348             /* REACTION-FIELD ELECTROSTATICS */
349             velec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_add_ps(rinv20,_mm256_mul_ps(krf,rsq20)),crf));
350             felec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
351
352             /* Update potential sum for this i atom from the interaction with this j atom. */
353             velecsum         = _mm256_add_ps(velecsum,velec);
354
355             fscal            = felec;
356
357             /* Calculate temporary vectorial force */
358             tx               = _mm256_mul_ps(fscal,dx20);
359             ty               = _mm256_mul_ps(fscal,dy20);
360             tz               = _mm256_mul_ps(fscal,dz20);
361
362             /* Update vectorial force */
363             fix2             = _mm256_add_ps(fix2,tx);
364             fiy2             = _mm256_add_ps(fiy2,ty);
365             fiz2             = _mm256_add_ps(fiz2,tz);
366
367             fjx0             = _mm256_add_ps(fjx0,tx);
368             fjy0             = _mm256_add_ps(fjy0,ty);
369             fjz0             = _mm256_add_ps(fjz0,tz);
370
371             /**************************
372              * CALCULATE INTERACTIONS *
373              **************************/
374
375             /* Compute parameters for interactions between i and j atoms */
376             qq30             = _mm256_mul_ps(iq3,jq0);
377
378             /* REACTION-FIELD ELECTROSTATICS */
379             velec            = _mm256_mul_ps(qq30,_mm256_sub_ps(_mm256_add_ps(rinv30,_mm256_mul_ps(krf,rsq30)),crf));
380             felec            = _mm256_mul_ps(qq30,_mm256_sub_ps(_mm256_mul_ps(rinv30,rinvsq30),krf2));
381
382             /* Update potential sum for this i atom from the interaction with this j atom. */
383             velecsum         = _mm256_add_ps(velecsum,velec);
384
385             fscal            = felec;
386
387             /* Calculate temporary vectorial force */
388             tx               = _mm256_mul_ps(fscal,dx30);
389             ty               = _mm256_mul_ps(fscal,dy30);
390             tz               = _mm256_mul_ps(fscal,dz30);
391
392             /* Update vectorial force */
393             fix3             = _mm256_add_ps(fix3,tx);
394             fiy3             = _mm256_add_ps(fiy3,ty);
395             fiz3             = _mm256_add_ps(fiz3,tz);
396
397             fjx0             = _mm256_add_ps(fjx0,tx);
398             fjy0             = _mm256_add_ps(fjy0,ty);
399             fjz0             = _mm256_add_ps(fjz0,tz);
400
401             fjptrA             = f+j_coord_offsetA;
402             fjptrB             = f+j_coord_offsetB;
403             fjptrC             = f+j_coord_offsetC;
404             fjptrD             = f+j_coord_offsetD;
405             fjptrE             = f+j_coord_offsetE;
406             fjptrF             = f+j_coord_offsetF;
407             fjptrG             = f+j_coord_offsetG;
408             fjptrH             = f+j_coord_offsetH;
409
410             gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
411
412             /* Inner loop uses 131 flops */
413         }
414
415         if(jidx<j_index_end)
416         {
417
418             /* Get j neighbor index, and coordinate index */
419             jnrlistA         = jjnr[jidx];
420             jnrlistB         = jjnr[jidx+1];
421             jnrlistC         = jjnr[jidx+2];
422             jnrlistD         = jjnr[jidx+3];
423             jnrlistE         = jjnr[jidx+4];
424             jnrlistF         = jjnr[jidx+5];
425             jnrlistG         = jjnr[jidx+6];
426             jnrlistH         = jjnr[jidx+7];
427             /* Sign of each element will be negative for non-real atoms.
428              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
429              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
430              */
431             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
432                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
433                                             
434             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
435             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
436             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
437             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
438             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
439             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
440             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
441             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
442             j_coord_offsetA  = DIM*jnrA;
443             j_coord_offsetB  = DIM*jnrB;
444             j_coord_offsetC  = DIM*jnrC;
445             j_coord_offsetD  = DIM*jnrD;
446             j_coord_offsetE  = DIM*jnrE;
447             j_coord_offsetF  = DIM*jnrF;
448             j_coord_offsetG  = DIM*jnrG;
449             j_coord_offsetH  = DIM*jnrH;
450
451             /* load j atom coordinates */
452             gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
453                                                  x+j_coord_offsetC,x+j_coord_offsetD,
454                                                  x+j_coord_offsetE,x+j_coord_offsetF,
455                                                  x+j_coord_offsetG,x+j_coord_offsetH,
456                                                  &jx0,&jy0,&jz0);
457
458             /* Calculate displacement vector */
459             dx00             = _mm256_sub_ps(ix0,jx0);
460             dy00             = _mm256_sub_ps(iy0,jy0);
461             dz00             = _mm256_sub_ps(iz0,jz0);
462             dx10             = _mm256_sub_ps(ix1,jx0);
463             dy10             = _mm256_sub_ps(iy1,jy0);
464             dz10             = _mm256_sub_ps(iz1,jz0);
465             dx20             = _mm256_sub_ps(ix2,jx0);
466             dy20             = _mm256_sub_ps(iy2,jy0);
467             dz20             = _mm256_sub_ps(iz2,jz0);
468             dx30             = _mm256_sub_ps(ix3,jx0);
469             dy30             = _mm256_sub_ps(iy3,jy0);
470             dz30             = _mm256_sub_ps(iz3,jz0);
471
472             /* Calculate squared distance and things based on it */
473             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
474             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
475             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
476             rsq30            = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
477
478             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
479             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
480             rinv30           = gmx_mm256_invsqrt_ps(rsq30);
481
482             rinvsq00         = gmx_mm256_inv_ps(rsq00);
483             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
484             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
485             rinvsq30         = _mm256_mul_ps(rinv30,rinv30);
486
487             /* Load parameters for j particles */
488             jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
489                                                                  charge+jnrC+0,charge+jnrD+0,
490                                                                  charge+jnrE+0,charge+jnrF+0,
491                                                                  charge+jnrG+0,charge+jnrH+0);
492             vdwjidx0A        = 2*vdwtype[jnrA+0];
493             vdwjidx0B        = 2*vdwtype[jnrB+0];
494             vdwjidx0C        = 2*vdwtype[jnrC+0];
495             vdwjidx0D        = 2*vdwtype[jnrD+0];
496             vdwjidx0E        = 2*vdwtype[jnrE+0];
497             vdwjidx0F        = 2*vdwtype[jnrF+0];
498             vdwjidx0G        = 2*vdwtype[jnrG+0];
499             vdwjidx0H        = 2*vdwtype[jnrH+0];
500
501             fjx0             = _mm256_setzero_ps();
502             fjy0             = _mm256_setzero_ps();
503             fjz0             = _mm256_setzero_ps();
504
505             /**************************
506              * CALCULATE INTERACTIONS *
507              **************************/
508
509             /* Compute parameters for interactions between i and j atoms */
510             gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
511                                             vdwioffsetptr0+vdwjidx0B,
512                                             vdwioffsetptr0+vdwjidx0C,
513                                             vdwioffsetptr0+vdwjidx0D,
514                                             vdwioffsetptr0+vdwjidx0E,
515                                             vdwioffsetptr0+vdwjidx0F,
516                                             vdwioffsetptr0+vdwjidx0G,
517                                             vdwioffsetptr0+vdwjidx0H,
518                                             &c6_00,&c12_00);
519
520             /* LENNARD-JONES DISPERSION/REPULSION */
521
522             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
523             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
524             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
525             vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
526             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
527
528             /* Update potential sum for this i atom from the interaction with this j atom. */
529             vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
530             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
531
532             fscal            = fvdw;
533
534             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
535
536             /* Calculate temporary vectorial force */
537             tx               = _mm256_mul_ps(fscal,dx00);
538             ty               = _mm256_mul_ps(fscal,dy00);
539             tz               = _mm256_mul_ps(fscal,dz00);
540
541             /* Update vectorial force */
542             fix0             = _mm256_add_ps(fix0,tx);
543             fiy0             = _mm256_add_ps(fiy0,ty);
544             fiz0             = _mm256_add_ps(fiz0,tz);
545
546             fjx0             = _mm256_add_ps(fjx0,tx);
547             fjy0             = _mm256_add_ps(fjy0,ty);
548             fjz0             = _mm256_add_ps(fjz0,tz);
549
550             /**************************
551              * CALCULATE INTERACTIONS *
552              **************************/
553
554             /* Compute parameters for interactions between i and j atoms */
555             qq10             = _mm256_mul_ps(iq1,jq0);
556
557             /* REACTION-FIELD ELECTROSTATICS */
558             velec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_add_ps(rinv10,_mm256_mul_ps(krf,rsq10)),crf));
559             felec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
560
561             /* Update potential sum for this i atom from the interaction with this j atom. */
562             velec            = _mm256_andnot_ps(dummy_mask,velec);
563             velecsum         = _mm256_add_ps(velecsum,velec);
564
565             fscal            = felec;
566
567             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
568
569             /* Calculate temporary vectorial force */
570             tx               = _mm256_mul_ps(fscal,dx10);
571             ty               = _mm256_mul_ps(fscal,dy10);
572             tz               = _mm256_mul_ps(fscal,dz10);
573
574             /* Update vectorial force */
575             fix1             = _mm256_add_ps(fix1,tx);
576             fiy1             = _mm256_add_ps(fiy1,ty);
577             fiz1             = _mm256_add_ps(fiz1,tz);
578
579             fjx0             = _mm256_add_ps(fjx0,tx);
580             fjy0             = _mm256_add_ps(fjy0,ty);
581             fjz0             = _mm256_add_ps(fjz0,tz);
582
583             /**************************
584              * CALCULATE INTERACTIONS *
585              **************************/
586
587             /* Compute parameters for interactions between i and j atoms */
588             qq20             = _mm256_mul_ps(iq2,jq0);
589
590             /* REACTION-FIELD ELECTROSTATICS */
591             velec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_add_ps(rinv20,_mm256_mul_ps(krf,rsq20)),crf));
592             felec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
593
594             /* Update potential sum for this i atom from the interaction with this j atom. */
595             velec            = _mm256_andnot_ps(dummy_mask,velec);
596             velecsum         = _mm256_add_ps(velecsum,velec);
597
598             fscal            = felec;
599
600             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
601
602             /* Calculate temporary vectorial force */
603             tx               = _mm256_mul_ps(fscal,dx20);
604             ty               = _mm256_mul_ps(fscal,dy20);
605             tz               = _mm256_mul_ps(fscal,dz20);
606
607             /* Update vectorial force */
608             fix2             = _mm256_add_ps(fix2,tx);
609             fiy2             = _mm256_add_ps(fiy2,ty);
610             fiz2             = _mm256_add_ps(fiz2,tz);
611
612             fjx0             = _mm256_add_ps(fjx0,tx);
613             fjy0             = _mm256_add_ps(fjy0,ty);
614             fjz0             = _mm256_add_ps(fjz0,tz);
615
616             /**************************
617              * CALCULATE INTERACTIONS *
618              **************************/
619
620             /* Compute parameters for interactions between i and j atoms */
621             qq30             = _mm256_mul_ps(iq3,jq0);
622
623             /* REACTION-FIELD ELECTROSTATICS */
624             velec            = _mm256_mul_ps(qq30,_mm256_sub_ps(_mm256_add_ps(rinv30,_mm256_mul_ps(krf,rsq30)),crf));
625             felec            = _mm256_mul_ps(qq30,_mm256_sub_ps(_mm256_mul_ps(rinv30,rinvsq30),krf2));
626
627             /* Update potential sum for this i atom from the interaction with this j atom. */
628             velec            = _mm256_andnot_ps(dummy_mask,velec);
629             velecsum         = _mm256_add_ps(velecsum,velec);
630
631             fscal            = felec;
632
633             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
634
635             /* Calculate temporary vectorial force */
636             tx               = _mm256_mul_ps(fscal,dx30);
637             ty               = _mm256_mul_ps(fscal,dy30);
638             tz               = _mm256_mul_ps(fscal,dz30);
639
640             /* Update vectorial force */
641             fix3             = _mm256_add_ps(fix3,tx);
642             fiy3             = _mm256_add_ps(fiy3,ty);
643             fiz3             = _mm256_add_ps(fiz3,tz);
644
645             fjx0             = _mm256_add_ps(fjx0,tx);
646             fjy0             = _mm256_add_ps(fjy0,ty);
647             fjz0             = _mm256_add_ps(fjz0,tz);
648
649             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
650             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
651             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
652             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
653             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
654             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
655             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
656             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
657
658             gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
659
660             /* Inner loop uses 131 flops */
661         }
662
663         /* End of innermost loop */
664
665         gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
666                                                  f+i_coord_offset,fshift+i_shift_offset);
667
668         ggid                        = gid[iidx];
669         /* Update potential energies */
670         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
671         gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
672
673         /* Increment number of inner iterations */
674         inneriter                  += j_index_end - j_index_start;
675
676         /* Outer loop uses 26 flops */
677     }
678
679     /* Increment number of outer iterations */
680     outeriter        += nri;
681
682     /* Update outer/inner flops */
683
684     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*131);
685 }
686 /*
687  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_avx_256_single
688  * Electrostatics interaction: ReactionField
689  * VdW interaction:            LennardJones
690  * Geometry:                   Water4-Particle
691  * Calculate force/pot:        Force
692  */
693 void
694 nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_avx_256_single
695                     (t_nblist                    * gmx_restrict       nlist,
696                      rvec                        * gmx_restrict          xx,
697                      rvec                        * gmx_restrict          ff,
698                      t_forcerec                  * gmx_restrict          fr,
699                      t_mdatoms                   * gmx_restrict     mdatoms,
700                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
701                      t_nrnb                      * gmx_restrict        nrnb)
702 {
703     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
704      * just 0 for non-waters.
705      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
706      * jnr indices corresponding to data put in the four positions in the SIMD register.
707      */
708     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
709     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
710     int              jnrA,jnrB,jnrC,jnrD;
711     int              jnrE,jnrF,jnrG,jnrH;
712     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
713     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
714     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
715     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
716     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
717     real             rcutoff_scalar;
718     real             *shiftvec,*fshift,*x,*f;
719     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
720     real             scratch[4*DIM];
721     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
722     real *           vdwioffsetptr0;
723     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
724     real *           vdwioffsetptr1;
725     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
726     real *           vdwioffsetptr2;
727     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
728     real *           vdwioffsetptr3;
729     __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
730     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
731     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
732     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
733     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
734     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
735     __m256           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
736     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
737     real             *charge;
738     int              nvdwtype;
739     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
740     int              *vdwtype;
741     real             *vdwparam;
742     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
743     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
744     __m256           dummy_mask,cutoff_mask;
745     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
746     __m256           one     = _mm256_set1_ps(1.0);
747     __m256           two     = _mm256_set1_ps(2.0);
748     x                = xx[0];
749     f                = ff[0];
750
751     nri              = nlist->nri;
752     iinr             = nlist->iinr;
753     jindex           = nlist->jindex;
754     jjnr             = nlist->jjnr;
755     shiftidx         = nlist->shift;
756     gid              = nlist->gid;
757     shiftvec         = fr->shift_vec[0];
758     fshift           = fr->fshift[0];
759     facel            = _mm256_set1_ps(fr->epsfac);
760     charge           = mdatoms->chargeA;
761     krf              = _mm256_set1_ps(fr->ic->k_rf);
762     krf2             = _mm256_set1_ps(fr->ic->k_rf*2.0);
763     crf              = _mm256_set1_ps(fr->ic->c_rf);
764     nvdwtype         = fr->ntype;
765     vdwparam         = fr->nbfp;
766     vdwtype          = mdatoms->typeA;
767
768     /* Setup water-specific parameters */
769     inr              = nlist->iinr[0];
770     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
771     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
772     iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
773     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
774
775     /* Avoid stupid compiler warnings */
776     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
777     j_coord_offsetA = 0;
778     j_coord_offsetB = 0;
779     j_coord_offsetC = 0;
780     j_coord_offsetD = 0;
781     j_coord_offsetE = 0;
782     j_coord_offsetF = 0;
783     j_coord_offsetG = 0;
784     j_coord_offsetH = 0;
785
786     outeriter        = 0;
787     inneriter        = 0;
788
789     for(iidx=0;iidx<4*DIM;iidx++)
790     {
791         scratch[iidx] = 0.0;
792     }
793
794     /* Start outer loop over neighborlists */
795     for(iidx=0; iidx<nri; iidx++)
796     {
797         /* Load shift vector for this list */
798         i_shift_offset   = DIM*shiftidx[iidx];
799
800         /* Load limits for loop over neighbors */
801         j_index_start    = jindex[iidx];
802         j_index_end      = jindex[iidx+1];
803
804         /* Get outer coordinate index */
805         inr              = iinr[iidx];
806         i_coord_offset   = DIM*inr;
807
808         /* Load i particle coords and add shift vector */
809         gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
810                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
811
812         fix0             = _mm256_setzero_ps();
813         fiy0             = _mm256_setzero_ps();
814         fiz0             = _mm256_setzero_ps();
815         fix1             = _mm256_setzero_ps();
816         fiy1             = _mm256_setzero_ps();
817         fiz1             = _mm256_setzero_ps();
818         fix2             = _mm256_setzero_ps();
819         fiy2             = _mm256_setzero_ps();
820         fiz2             = _mm256_setzero_ps();
821         fix3             = _mm256_setzero_ps();
822         fiy3             = _mm256_setzero_ps();
823         fiz3             = _mm256_setzero_ps();
824
825         /* Start inner kernel loop */
826         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
827         {
828
829             /* Get j neighbor index, and coordinate index */
830             jnrA             = jjnr[jidx];
831             jnrB             = jjnr[jidx+1];
832             jnrC             = jjnr[jidx+2];
833             jnrD             = jjnr[jidx+3];
834             jnrE             = jjnr[jidx+4];
835             jnrF             = jjnr[jidx+5];
836             jnrG             = jjnr[jidx+6];
837             jnrH             = jjnr[jidx+7];
838             j_coord_offsetA  = DIM*jnrA;
839             j_coord_offsetB  = DIM*jnrB;
840             j_coord_offsetC  = DIM*jnrC;
841             j_coord_offsetD  = DIM*jnrD;
842             j_coord_offsetE  = DIM*jnrE;
843             j_coord_offsetF  = DIM*jnrF;
844             j_coord_offsetG  = DIM*jnrG;
845             j_coord_offsetH  = DIM*jnrH;
846
847             /* load j atom coordinates */
848             gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
849                                                  x+j_coord_offsetC,x+j_coord_offsetD,
850                                                  x+j_coord_offsetE,x+j_coord_offsetF,
851                                                  x+j_coord_offsetG,x+j_coord_offsetH,
852                                                  &jx0,&jy0,&jz0);
853
854             /* Calculate displacement vector */
855             dx00             = _mm256_sub_ps(ix0,jx0);
856             dy00             = _mm256_sub_ps(iy0,jy0);
857             dz00             = _mm256_sub_ps(iz0,jz0);
858             dx10             = _mm256_sub_ps(ix1,jx0);
859             dy10             = _mm256_sub_ps(iy1,jy0);
860             dz10             = _mm256_sub_ps(iz1,jz0);
861             dx20             = _mm256_sub_ps(ix2,jx0);
862             dy20             = _mm256_sub_ps(iy2,jy0);
863             dz20             = _mm256_sub_ps(iz2,jz0);
864             dx30             = _mm256_sub_ps(ix3,jx0);
865             dy30             = _mm256_sub_ps(iy3,jy0);
866             dz30             = _mm256_sub_ps(iz3,jz0);
867
868             /* Calculate squared distance and things based on it */
869             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
870             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
871             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
872             rsq30            = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
873
874             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
875             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
876             rinv30           = gmx_mm256_invsqrt_ps(rsq30);
877
878             rinvsq00         = gmx_mm256_inv_ps(rsq00);
879             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
880             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
881             rinvsq30         = _mm256_mul_ps(rinv30,rinv30);
882
883             /* Load parameters for j particles */
884             jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
885                                                                  charge+jnrC+0,charge+jnrD+0,
886                                                                  charge+jnrE+0,charge+jnrF+0,
887                                                                  charge+jnrG+0,charge+jnrH+0);
888             vdwjidx0A        = 2*vdwtype[jnrA+0];
889             vdwjidx0B        = 2*vdwtype[jnrB+0];
890             vdwjidx0C        = 2*vdwtype[jnrC+0];
891             vdwjidx0D        = 2*vdwtype[jnrD+0];
892             vdwjidx0E        = 2*vdwtype[jnrE+0];
893             vdwjidx0F        = 2*vdwtype[jnrF+0];
894             vdwjidx0G        = 2*vdwtype[jnrG+0];
895             vdwjidx0H        = 2*vdwtype[jnrH+0];
896
897             fjx0             = _mm256_setzero_ps();
898             fjy0             = _mm256_setzero_ps();
899             fjz0             = _mm256_setzero_ps();
900
901             /**************************
902              * CALCULATE INTERACTIONS *
903              **************************/
904
905             /* Compute parameters for interactions between i and j atoms */
906             gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
907                                             vdwioffsetptr0+vdwjidx0B,
908                                             vdwioffsetptr0+vdwjidx0C,
909                                             vdwioffsetptr0+vdwjidx0D,
910                                             vdwioffsetptr0+vdwjidx0E,
911                                             vdwioffsetptr0+vdwjidx0F,
912                                             vdwioffsetptr0+vdwjidx0G,
913                                             vdwioffsetptr0+vdwjidx0H,
914                                             &c6_00,&c12_00);
915
916             /* LENNARD-JONES DISPERSION/REPULSION */
917
918             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
919             fvdw             = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
920
921             fscal            = fvdw;
922
923             /* Calculate temporary vectorial force */
924             tx               = _mm256_mul_ps(fscal,dx00);
925             ty               = _mm256_mul_ps(fscal,dy00);
926             tz               = _mm256_mul_ps(fscal,dz00);
927
928             /* Update vectorial force */
929             fix0             = _mm256_add_ps(fix0,tx);
930             fiy0             = _mm256_add_ps(fiy0,ty);
931             fiz0             = _mm256_add_ps(fiz0,tz);
932
933             fjx0             = _mm256_add_ps(fjx0,tx);
934             fjy0             = _mm256_add_ps(fjy0,ty);
935             fjz0             = _mm256_add_ps(fjz0,tz);
936
937             /**************************
938              * CALCULATE INTERACTIONS *
939              **************************/
940
941             /* Compute parameters for interactions between i and j atoms */
942             qq10             = _mm256_mul_ps(iq1,jq0);
943
944             /* REACTION-FIELD ELECTROSTATICS */
945             felec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
946
947             fscal            = felec;
948
949             /* Calculate temporary vectorial force */
950             tx               = _mm256_mul_ps(fscal,dx10);
951             ty               = _mm256_mul_ps(fscal,dy10);
952             tz               = _mm256_mul_ps(fscal,dz10);
953
954             /* Update vectorial force */
955             fix1             = _mm256_add_ps(fix1,tx);
956             fiy1             = _mm256_add_ps(fiy1,ty);
957             fiz1             = _mm256_add_ps(fiz1,tz);
958
959             fjx0             = _mm256_add_ps(fjx0,tx);
960             fjy0             = _mm256_add_ps(fjy0,ty);
961             fjz0             = _mm256_add_ps(fjz0,tz);
962
963             /**************************
964              * CALCULATE INTERACTIONS *
965              **************************/
966
967             /* Compute parameters for interactions between i and j atoms */
968             qq20             = _mm256_mul_ps(iq2,jq0);
969
970             /* REACTION-FIELD ELECTROSTATICS */
971             felec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
972
973             fscal            = felec;
974
975             /* Calculate temporary vectorial force */
976             tx               = _mm256_mul_ps(fscal,dx20);
977             ty               = _mm256_mul_ps(fscal,dy20);
978             tz               = _mm256_mul_ps(fscal,dz20);
979
980             /* Update vectorial force */
981             fix2             = _mm256_add_ps(fix2,tx);
982             fiy2             = _mm256_add_ps(fiy2,ty);
983             fiz2             = _mm256_add_ps(fiz2,tz);
984
985             fjx0             = _mm256_add_ps(fjx0,tx);
986             fjy0             = _mm256_add_ps(fjy0,ty);
987             fjz0             = _mm256_add_ps(fjz0,tz);
988
989             /**************************
990              * CALCULATE INTERACTIONS *
991              **************************/
992
993             /* Compute parameters for interactions between i and j atoms */
994             qq30             = _mm256_mul_ps(iq3,jq0);
995
996             /* REACTION-FIELD ELECTROSTATICS */
997             felec            = _mm256_mul_ps(qq30,_mm256_sub_ps(_mm256_mul_ps(rinv30,rinvsq30),krf2));
998
999             fscal            = felec;
1000
1001             /* Calculate temporary vectorial force */
1002             tx               = _mm256_mul_ps(fscal,dx30);
1003             ty               = _mm256_mul_ps(fscal,dy30);
1004             tz               = _mm256_mul_ps(fscal,dz30);
1005
1006             /* Update vectorial force */
1007             fix3             = _mm256_add_ps(fix3,tx);
1008             fiy3             = _mm256_add_ps(fiy3,ty);
1009             fiz3             = _mm256_add_ps(fiz3,tz);
1010
1011             fjx0             = _mm256_add_ps(fjx0,tx);
1012             fjy0             = _mm256_add_ps(fjy0,ty);
1013             fjz0             = _mm256_add_ps(fjz0,tz);
1014
1015             fjptrA             = f+j_coord_offsetA;
1016             fjptrB             = f+j_coord_offsetB;
1017             fjptrC             = f+j_coord_offsetC;
1018             fjptrD             = f+j_coord_offsetD;
1019             fjptrE             = f+j_coord_offsetE;
1020             fjptrF             = f+j_coord_offsetF;
1021             fjptrG             = f+j_coord_offsetG;
1022             fjptrH             = f+j_coord_offsetH;
1023
1024             gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
1025
1026             /* Inner loop uses 111 flops */
1027         }
1028
1029         if(jidx<j_index_end)
1030         {
1031
1032             /* Get j neighbor index, and coordinate index */
1033             jnrlistA         = jjnr[jidx];
1034             jnrlistB         = jjnr[jidx+1];
1035             jnrlistC         = jjnr[jidx+2];
1036             jnrlistD         = jjnr[jidx+3];
1037             jnrlistE         = jjnr[jidx+4];
1038             jnrlistF         = jjnr[jidx+5];
1039             jnrlistG         = jjnr[jidx+6];
1040             jnrlistH         = jjnr[jidx+7];
1041             /* Sign of each element will be negative for non-real atoms.
1042              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1043              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1044              */
1045             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1046                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1047                                             
1048             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1049             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1050             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1051             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1052             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
1053             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
1054             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
1055             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
1056             j_coord_offsetA  = DIM*jnrA;
1057             j_coord_offsetB  = DIM*jnrB;
1058             j_coord_offsetC  = DIM*jnrC;
1059             j_coord_offsetD  = DIM*jnrD;
1060             j_coord_offsetE  = DIM*jnrE;
1061             j_coord_offsetF  = DIM*jnrF;
1062             j_coord_offsetG  = DIM*jnrG;
1063             j_coord_offsetH  = DIM*jnrH;
1064
1065             /* load j atom coordinates */
1066             gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1067                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1068                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1069                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1070                                                  &jx0,&jy0,&jz0);
1071
1072             /* Calculate displacement vector */
1073             dx00             = _mm256_sub_ps(ix0,jx0);
1074             dy00             = _mm256_sub_ps(iy0,jy0);
1075             dz00             = _mm256_sub_ps(iz0,jz0);
1076             dx10             = _mm256_sub_ps(ix1,jx0);
1077             dy10             = _mm256_sub_ps(iy1,jy0);
1078             dz10             = _mm256_sub_ps(iz1,jz0);
1079             dx20             = _mm256_sub_ps(ix2,jx0);
1080             dy20             = _mm256_sub_ps(iy2,jy0);
1081             dz20             = _mm256_sub_ps(iz2,jz0);
1082             dx30             = _mm256_sub_ps(ix3,jx0);
1083             dy30             = _mm256_sub_ps(iy3,jy0);
1084             dz30             = _mm256_sub_ps(iz3,jz0);
1085
1086             /* Calculate squared distance and things based on it */
1087             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1088             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1089             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1090             rsq30            = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
1091
1092             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
1093             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
1094             rinv30           = gmx_mm256_invsqrt_ps(rsq30);
1095
1096             rinvsq00         = gmx_mm256_inv_ps(rsq00);
1097             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
1098             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
1099             rinvsq30         = _mm256_mul_ps(rinv30,rinv30);
1100
1101             /* Load parameters for j particles */
1102             jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1103                                                                  charge+jnrC+0,charge+jnrD+0,
1104                                                                  charge+jnrE+0,charge+jnrF+0,
1105                                                                  charge+jnrG+0,charge+jnrH+0);
1106             vdwjidx0A        = 2*vdwtype[jnrA+0];
1107             vdwjidx0B        = 2*vdwtype[jnrB+0];
1108             vdwjidx0C        = 2*vdwtype[jnrC+0];
1109             vdwjidx0D        = 2*vdwtype[jnrD+0];
1110             vdwjidx0E        = 2*vdwtype[jnrE+0];
1111             vdwjidx0F        = 2*vdwtype[jnrF+0];
1112             vdwjidx0G        = 2*vdwtype[jnrG+0];
1113             vdwjidx0H        = 2*vdwtype[jnrH+0];
1114
1115             fjx0             = _mm256_setzero_ps();
1116             fjy0             = _mm256_setzero_ps();
1117             fjz0             = _mm256_setzero_ps();
1118
1119             /**************************
1120              * CALCULATE INTERACTIONS *
1121              **************************/
1122
1123             /* Compute parameters for interactions between i and j atoms */
1124             gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
1125                                             vdwioffsetptr0+vdwjidx0B,
1126                                             vdwioffsetptr0+vdwjidx0C,
1127                                             vdwioffsetptr0+vdwjidx0D,
1128                                             vdwioffsetptr0+vdwjidx0E,
1129                                             vdwioffsetptr0+vdwjidx0F,
1130                                             vdwioffsetptr0+vdwjidx0G,
1131                                             vdwioffsetptr0+vdwjidx0H,
1132                                             &c6_00,&c12_00);
1133
1134             /* LENNARD-JONES DISPERSION/REPULSION */
1135
1136             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1137             fvdw             = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1138
1139             fscal            = fvdw;
1140
1141             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1142
1143             /* Calculate temporary vectorial force */
1144             tx               = _mm256_mul_ps(fscal,dx00);
1145             ty               = _mm256_mul_ps(fscal,dy00);
1146             tz               = _mm256_mul_ps(fscal,dz00);
1147
1148             /* Update vectorial force */
1149             fix0             = _mm256_add_ps(fix0,tx);
1150             fiy0             = _mm256_add_ps(fiy0,ty);
1151             fiz0             = _mm256_add_ps(fiz0,tz);
1152
1153             fjx0             = _mm256_add_ps(fjx0,tx);
1154             fjy0             = _mm256_add_ps(fjy0,ty);
1155             fjz0             = _mm256_add_ps(fjz0,tz);
1156
1157             /**************************
1158              * CALCULATE INTERACTIONS *
1159              **************************/
1160
1161             /* Compute parameters for interactions between i and j atoms */
1162             qq10             = _mm256_mul_ps(iq1,jq0);
1163
1164             /* REACTION-FIELD ELECTROSTATICS */
1165             felec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
1166
1167             fscal            = felec;
1168
1169             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1170
1171             /* Calculate temporary vectorial force */
1172             tx               = _mm256_mul_ps(fscal,dx10);
1173             ty               = _mm256_mul_ps(fscal,dy10);
1174             tz               = _mm256_mul_ps(fscal,dz10);
1175
1176             /* Update vectorial force */
1177             fix1             = _mm256_add_ps(fix1,tx);
1178             fiy1             = _mm256_add_ps(fiy1,ty);
1179             fiz1             = _mm256_add_ps(fiz1,tz);
1180
1181             fjx0             = _mm256_add_ps(fjx0,tx);
1182             fjy0             = _mm256_add_ps(fjy0,ty);
1183             fjz0             = _mm256_add_ps(fjz0,tz);
1184
1185             /**************************
1186              * CALCULATE INTERACTIONS *
1187              **************************/
1188
1189             /* Compute parameters for interactions between i and j atoms */
1190             qq20             = _mm256_mul_ps(iq2,jq0);
1191
1192             /* REACTION-FIELD ELECTROSTATICS */
1193             felec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
1194
1195             fscal            = felec;
1196
1197             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1198
1199             /* Calculate temporary vectorial force */
1200             tx               = _mm256_mul_ps(fscal,dx20);
1201             ty               = _mm256_mul_ps(fscal,dy20);
1202             tz               = _mm256_mul_ps(fscal,dz20);
1203
1204             /* Update vectorial force */
1205             fix2             = _mm256_add_ps(fix2,tx);
1206             fiy2             = _mm256_add_ps(fiy2,ty);
1207             fiz2             = _mm256_add_ps(fiz2,tz);
1208
1209             fjx0             = _mm256_add_ps(fjx0,tx);
1210             fjy0             = _mm256_add_ps(fjy0,ty);
1211             fjz0             = _mm256_add_ps(fjz0,tz);
1212
1213             /**************************
1214              * CALCULATE INTERACTIONS *
1215              **************************/
1216
1217             /* Compute parameters for interactions between i and j atoms */
1218             qq30             = _mm256_mul_ps(iq3,jq0);
1219
1220             /* REACTION-FIELD ELECTROSTATICS */
1221             felec            = _mm256_mul_ps(qq30,_mm256_sub_ps(_mm256_mul_ps(rinv30,rinvsq30),krf2));
1222
1223             fscal            = felec;
1224
1225             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1226
1227             /* Calculate temporary vectorial force */
1228             tx               = _mm256_mul_ps(fscal,dx30);
1229             ty               = _mm256_mul_ps(fscal,dy30);
1230             tz               = _mm256_mul_ps(fscal,dz30);
1231
1232             /* Update vectorial force */
1233             fix3             = _mm256_add_ps(fix3,tx);
1234             fiy3             = _mm256_add_ps(fiy3,ty);
1235             fiz3             = _mm256_add_ps(fiz3,tz);
1236
1237             fjx0             = _mm256_add_ps(fjx0,tx);
1238             fjy0             = _mm256_add_ps(fjy0,ty);
1239             fjz0             = _mm256_add_ps(fjz0,tz);
1240
1241             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1242             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1243             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1244             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1245             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1246             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1247             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1248             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1249
1250             gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
1251
1252             /* Inner loop uses 111 flops */
1253         }
1254
1255         /* End of innermost loop */
1256
1257         gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1258                                                  f+i_coord_offset,fshift+i_shift_offset);
1259
1260         /* Increment number of inner iterations */
1261         inneriter                  += j_index_end - j_index_start;
1262
1263         /* Outer loop uses 24 flops */
1264     }
1265
1266     /* Increment number of outer iterations */
1267     outeriter        += nri;
1268
1269     /* Update outer/inner flops */
1270
1271     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*111);
1272 }