Added option to gmx nmeig to print ZPE.
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_single / nb_kernel_ElecRFCut_VdwNone_GeomW4W4_avx_256_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_single kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_avx_256_single.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_avx_256_single
51  * Electrostatics interaction: ReactionField
52  * VdW interaction:            None
53  * Geometry:                   Water4-Water4
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_avx_256_single
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
67      * just 0 for non-waters.
68      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB,jnrC,jnrD;
74     int              jnrE,jnrF,jnrG,jnrH;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
77     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
79     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
80     real             rcutoff_scalar;
81     real             *shiftvec,*fshift,*x,*f;
82     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
83     real             scratch[4*DIM];
84     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85     real *           vdwioffsetptr1;
86     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87     real *           vdwioffsetptr2;
88     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89     real *           vdwioffsetptr3;
90     __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
92     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
94     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
96     __m256           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
97     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
98     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
99     __m256           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
100     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
101     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
102     __m256           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
103     __m256           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
104     __m256           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
105     __m256           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
106     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
107     real             *charge;
108     __m256           dummy_mask,cutoff_mask;
109     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
110     __m256           one     = _mm256_set1_ps(1.0);
111     __m256           two     = _mm256_set1_ps(2.0);
112     x                = xx[0];
113     f                = ff[0];
114
115     nri              = nlist->nri;
116     iinr             = nlist->iinr;
117     jindex           = nlist->jindex;
118     jjnr             = nlist->jjnr;
119     shiftidx         = nlist->shift;
120     gid              = nlist->gid;
121     shiftvec         = fr->shift_vec[0];
122     fshift           = fr->fshift[0];
123     facel            = _mm256_set1_ps(fr->ic->epsfac);
124     charge           = mdatoms->chargeA;
125     krf              = _mm256_set1_ps(fr->ic->k_rf);
126     krf2             = _mm256_set1_ps(fr->ic->k_rf*2.0);
127     crf              = _mm256_set1_ps(fr->ic->c_rf);
128
129     /* Setup water-specific parameters */
130     inr              = nlist->iinr[0];
131     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
132     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
133     iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
134
135     jq1              = _mm256_set1_ps(charge[inr+1]);
136     jq2              = _mm256_set1_ps(charge[inr+2]);
137     jq3              = _mm256_set1_ps(charge[inr+3]);
138     qq11             = _mm256_mul_ps(iq1,jq1);
139     qq12             = _mm256_mul_ps(iq1,jq2);
140     qq13             = _mm256_mul_ps(iq1,jq3);
141     qq21             = _mm256_mul_ps(iq2,jq1);
142     qq22             = _mm256_mul_ps(iq2,jq2);
143     qq23             = _mm256_mul_ps(iq2,jq3);
144     qq31             = _mm256_mul_ps(iq3,jq1);
145     qq32             = _mm256_mul_ps(iq3,jq2);
146     qq33             = _mm256_mul_ps(iq3,jq3);
147
148     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
149     rcutoff_scalar   = fr->ic->rcoulomb;
150     rcutoff          = _mm256_set1_ps(rcutoff_scalar);
151     rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
152
153     /* Avoid stupid compiler warnings */
154     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
155     j_coord_offsetA = 0;
156     j_coord_offsetB = 0;
157     j_coord_offsetC = 0;
158     j_coord_offsetD = 0;
159     j_coord_offsetE = 0;
160     j_coord_offsetF = 0;
161     j_coord_offsetG = 0;
162     j_coord_offsetH = 0;
163
164     outeriter        = 0;
165     inneriter        = 0;
166
167     for(iidx=0;iidx<4*DIM;iidx++)
168     {
169         scratch[iidx] = 0.0;
170     }
171
172     /* Start outer loop over neighborlists */
173     for(iidx=0; iidx<nri; iidx++)
174     {
175         /* Load shift vector for this list */
176         i_shift_offset   = DIM*shiftidx[iidx];
177
178         /* Load limits for loop over neighbors */
179         j_index_start    = jindex[iidx];
180         j_index_end      = jindex[iidx+1];
181
182         /* Get outer coordinate index */
183         inr              = iinr[iidx];
184         i_coord_offset   = DIM*inr;
185
186         /* Load i particle coords and add shift vector */
187         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
188                                                     &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
189
190         fix1             = _mm256_setzero_ps();
191         fiy1             = _mm256_setzero_ps();
192         fiz1             = _mm256_setzero_ps();
193         fix2             = _mm256_setzero_ps();
194         fiy2             = _mm256_setzero_ps();
195         fiz2             = _mm256_setzero_ps();
196         fix3             = _mm256_setzero_ps();
197         fiy3             = _mm256_setzero_ps();
198         fiz3             = _mm256_setzero_ps();
199
200         /* Reset potential sums */
201         velecsum         = _mm256_setzero_ps();
202
203         /* Start inner kernel loop */
204         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
205         {
206
207             /* Get j neighbor index, and coordinate index */
208             jnrA             = jjnr[jidx];
209             jnrB             = jjnr[jidx+1];
210             jnrC             = jjnr[jidx+2];
211             jnrD             = jjnr[jidx+3];
212             jnrE             = jjnr[jidx+4];
213             jnrF             = jjnr[jidx+5];
214             jnrG             = jjnr[jidx+6];
215             jnrH             = jjnr[jidx+7];
216             j_coord_offsetA  = DIM*jnrA;
217             j_coord_offsetB  = DIM*jnrB;
218             j_coord_offsetC  = DIM*jnrC;
219             j_coord_offsetD  = DIM*jnrD;
220             j_coord_offsetE  = DIM*jnrE;
221             j_coord_offsetF  = DIM*jnrF;
222             j_coord_offsetG  = DIM*jnrG;
223             j_coord_offsetH  = DIM*jnrH;
224
225             /* load j atom coordinates */
226             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
227                                                  x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
228                                                  x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
229                                                  x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
230                                                  &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
231
232             /* Calculate displacement vector */
233             dx11             = _mm256_sub_ps(ix1,jx1);
234             dy11             = _mm256_sub_ps(iy1,jy1);
235             dz11             = _mm256_sub_ps(iz1,jz1);
236             dx12             = _mm256_sub_ps(ix1,jx2);
237             dy12             = _mm256_sub_ps(iy1,jy2);
238             dz12             = _mm256_sub_ps(iz1,jz2);
239             dx13             = _mm256_sub_ps(ix1,jx3);
240             dy13             = _mm256_sub_ps(iy1,jy3);
241             dz13             = _mm256_sub_ps(iz1,jz3);
242             dx21             = _mm256_sub_ps(ix2,jx1);
243             dy21             = _mm256_sub_ps(iy2,jy1);
244             dz21             = _mm256_sub_ps(iz2,jz1);
245             dx22             = _mm256_sub_ps(ix2,jx2);
246             dy22             = _mm256_sub_ps(iy2,jy2);
247             dz22             = _mm256_sub_ps(iz2,jz2);
248             dx23             = _mm256_sub_ps(ix2,jx3);
249             dy23             = _mm256_sub_ps(iy2,jy3);
250             dz23             = _mm256_sub_ps(iz2,jz3);
251             dx31             = _mm256_sub_ps(ix3,jx1);
252             dy31             = _mm256_sub_ps(iy3,jy1);
253             dz31             = _mm256_sub_ps(iz3,jz1);
254             dx32             = _mm256_sub_ps(ix3,jx2);
255             dy32             = _mm256_sub_ps(iy3,jy2);
256             dz32             = _mm256_sub_ps(iz3,jz2);
257             dx33             = _mm256_sub_ps(ix3,jx3);
258             dy33             = _mm256_sub_ps(iy3,jy3);
259             dz33             = _mm256_sub_ps(iz3,jz3);
260
261             /* Calculate squared distance and things based on it */
262             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
263             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
264             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
265             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
266             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
267             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
268             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
269             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
270             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
271
272             rinv11           = avx256_invsqrt_f(rsq11);
273             rinv12           = avx256_invsqrt_f(rsq12);
274             rinv13           = avx256_invsqrt_f(rsq13);
275             rinv21           = avx256_invsqrt_f(rsq21);
276             rinv22           = avx256_invsqrt_f(rsq22);
277             rinv23           = avx256_invsqrt_f(rsq23);
278             rinv31           = avx256_invsqrt_f(rsq31);
279             rinv32           = avx256_invsqrt_f(rsq32);
280             rinv33           = avx256_invsqrt_f(rsq33);
281
282             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
283             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
284             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
285             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
286             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
287             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
288             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
289             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
290             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
291
292             fjx1             = _mm256_setzero_ps();
293             fjy1             = _mm256_setzero_ps();
294             fjz1             = _mm256_setzero_ps();
295             fjx2             = _mm256_setzero_ps();
296             fjy2             = _mm256_setzero_ps();
297             fjz2             = _mm256_setzero_ps();
298             fjx3             = _mm256_setzero_ps();
299             fjy3             = _mm256_setzero_ps();
300             fjz3             = _mm256_setzero_ps();
301
302             /**************************
303              * CALCULATE INTERACTIONS *
304              **************************/
305
306             if (gmx_mm256_any_lt(rsq11,rcutoff2))
307             {
308
309             /* REACTION-FIELD ELECTROSTATICS */
310             velec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
311             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
312
313             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
314
315             /* Update potential sum for this i atom from the interaction with this j atom. */
316             velec            = _mm256_and_ps(velec,cutoff_mask);
317             velecsum         = _mm256_add_ps(velecsum,velec);
318
319             fscal            = felec;
320
321             fscal            = _mm256_and_ps(fscal,cutoff_mask);
322
323             /* Calculate temporary vectorial force */
324             tx               = _mm256_mul_ps(fscal,dx11);
325             ty               = _mm256_mul_ps(fscal,dy11);
326             tz               = _mm256_mul_ps(fscal,dz11);
327
328             /* Update vectorial force */
329             fix1             = _mm256_add_ps(fix1,tx);
330             fiy1             = _mm256_add_ps(fiy1,ty);
331             fiz1             = _mm256_add_ps(fiz1,tz);
332
333             fjx1             = _mm256_add_ps(fjx1,tx);
334             fjy1             = _mm256_add_ps(fjy1,ty);
335             fjz1             = _mm256_add_ps(fjz1,tz);
336
337             }
338
339             /**************************
340              * CALCULATE INTERACTIONS *
341              **************************/
342
343             if (gmx_mm256_any_lt(rsq12,rcutoff2))
344             {
345
346             /* REACTION-FIELD ELECTROSTATICS */
347             velec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
348             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
349
350             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
351
352             /* Update potential sum for this i atom from the interaction with this j atom. */
353             velec            = _mm256_and_ps(velec,cutoff_mask);
354             velecsum         = _mm256_add_ps(velecsum,velec);
355
356             fscal            = felec;
357
358             fscal            = _mm256_and_ps(fscal,cutoff_mask);
359
360             /* Calculate temporary vectorial force */
361             tx               = _mm256_mul_ps(fscal,dx12);
362             ty               = _mm256_mul_ps(fscal,dy12);
363             tz               = _mm256_mul_ps(fscal,dz12);
364
365             /* Update vectorial force */
366             fix1             = _mm256_add_ps(fix1,tx);
367             fiy1             = _mm256_add_ps(fiy1,ty);
368             fiz1             = _mm256_add_ps(fiz1,tz);
369
370             fjx2             = _mm256_add_ps(fjx2,tx);
371             fjy2             = _mm256_add_ps(fjy2,ty);
372             fjz2             = _mm256_add_ps(fjz2,tz);
373
374             }
375
376             /**************************
377              * CALCULATE INTERACTIONS *
378              **************************/
379
380             if (gmx_mm256_any_lt(rsq13,rcutoff2))
381             {
382
383             /* REACTION-FIELD ELECTROSTATICS */
384             velec            = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_add_ps(rinv13,_mm256_mul_ps(krf,rsq13)),crf));
385             felec            = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
386
387             cutoff_mask      = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
388
389             /* Update potential sum for this i atom from the interaction with this j atom. */
390             velec            = _mm256_and_ps(velec,cutoff_mask);
391             velecsum         = _mm256_add_ps(velecsum,velec);
392
393             fscal            = felec;
394
395             fscal            = _mm256_and_ps(fscal,cutoff_mask);
396
397             /* Calculate temporary vectorial force */
398             tx               = _mm256_mul_ps(fscal,dx13);
399             ty               = _mm256_mul_ps(fscal,dy13);
400             tz               = _mm256_mul_ps(fscal,dz13);
401
402             /* Update vectorial force */
403             fix1             = _mm256_add_ps(fix1,tx);
404             fiy1             = _mm256_add_ps(fiy1,ty);
405             fiz1             = _mm256_add_ps(fiz1,tz);
406
407             fjx3             = _mm256_add_ps(fjx3,tx);
408             fjy3             = _mm256_add_ps(fjy3,ty);
409             fjz3             = _mm256_add_ps(fjz3,tz);
410
411             }
412
413             /**************************
414              * CALCULATE INTERACTIONS *
415              **************************/
416
417             if (gmx_mm256_any_lt(rsq21,rcutoff2))
418             {
419
420             /* REACTION-FIELD ELECTROSTATICS */
421             velec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
422             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
423
424             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
425
426             /* Update potential sum for this i atom from the interaction with this j atom. */
427             velec            = _mm256_and_ps(velec,cutoff_mask);
428             velecsum         = _mm256_add_ps(velecsum,velec);
429
430             fscal            = felec;
431
432             fscal            = _mm256_and_ps(fscal,cutoff_mask);
433
434             /* Calculate temporary vectorial force */
435             tx               = _mm256_mul_ps(fscal,dx21);
436             ty               = _mm256_mul_ps(fscal,dy21);
437             tz               = _mm256_mul_ps(fscal,dz21);
438
439             /* Update vectorial force */
440             fix2             = _mm256_add_ps(fix2,tx);
441             fiy2             = _mm256_add_ps(fiy2,ty);
442             fiz2             = _mm256_add_ps(fiz2,tz);
443
444             fjx1             = _mm256_add_ps(fjx1,tx);
445             fjy1             = _mm256_add_ps(fjy1,ty);
446             fjz1             = _mm256_add_ps(fjz1,tz);
447
448             }
449
450             /**************************
451              * CALCULATE INTERACTIONS *
452              **************************/
453
454             if (gmx_mm256_any_lt(rsq22,rcutoff2))
455             {
456
457             /* REACTION-FIELD ELECTROSTATICS */
458             velec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
459             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
460
461             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
462
463             /* Update potential sum for this i atom from the interaction with this j atom. */
464             velec            = _mm256_and_ps(velec,cutoff_mask);
465             velecsum         = _mm256_add_ps(velecsum,velec);
466
467             fscal            = felec;
468
469             fscal            = _mm256_and_ps(fscal,cutoff_mask);
470
471             /* Calculate temporary vectorial force */
472             tx               = _mm256_mul_ps(fscal,dx22);
473             ty               = _mm256_mul_ps(fscal,dy22);
474             tz               = _mm256_mul_ps(fscal,dz22);
475
476             /* Update vectorial force */
477             fix2             = _mm256_add_ps(fix2,tx);
478             fiy2             = _mm256_add_ps(fiy2,ty);
479             fiz2             = _mm256_add_ps(fiz2,tz);
480
481             fjx2             = _mm256_add_ps(fjx2,tx);
482             fjy2             = _mm256_add_ps(fjy2,ty);
483             fjz2             = _mm256_add_ps(fjz2,tz);
484
485             }
486
487             /**************************
488              * CALCULATE INTERACTIONS *
489              **************************/
490
491             if (gmx_mm256_any_lt(rsq23,rcutoff2))
492             {
493
494             /* REACTION-FIELD ELECTROSTATICS */
495             velec            = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_add_ps(rinv23,_mm256_mul_ps(krf,rsq23)),crf));
496             felec            = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
497
498             cutoff_mask      = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
499
500             /* Update potential sum for this i atom from the interaction with this j atom. */
501             velec            = _mm256_and_ps(velec,cutoff_mask);
502             velecsum         = _mm256_add_ps(velecsum,velec);
503
504             fscal            = felec;
505
506             fscal            = _mm256_and_ps(fscal,cutoff_mask);
507
508             /* Calculate temporary vectorial force */
509             tx               = _mm256_mul_ps(fscal,dx23);
510             ty               = _mm256_mul_ps(fscal,dy23);
511             tz               = _mm256_mul_ps(fscal,dz23);
512
513             /* Update vectorial force */
514             fix2             = _mm256_add_ps(fix2,tx);
515             fiy2             = _mm256_add_ps(fiy2,ty);
516             fiz2             = _mm256_add_ps(fiz2,tz);
517
518             fjx3             = _mm256_add_ps(fjx3,tx);
519             fjy3             = _mm256_add_ps(fjy3,ty);
520             fjz3             = _mm256_add_ps(fjz3,tz);
521
522             }
523
524             /**************************
525              * CALCULATE INTERACTIONS *
526              **************************/
527
528             if (gmx_mm256_any_lt(rsq31,rcutoff2))
529             {
530
531             /* REACTION-FIELD ELECTROSTATICS */
532             velec            = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_add_ps(rinv31,_mm256_mul_ps(krf,rsq31)),crf));
533             felec            = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
534
535             cutoff_mask      = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
536
537             /* Update potential sum for this i atom from the interaction with this j atom. */
538             velec            = _mm256_and_ps(velec,cutoff_mask);
539             velecsum         = _mm256_add_ps(velecsum,velec);
540
541             fscal            = felec;
542
543             fscal            = _mm256_and_ps(fscal,cutoff_mask);
544
545             /* Calculate temporary vectorial force */
546             tx               = _mm256_mul_ps(fscal,dx31);
547             ty               = _mm256_mul_ps(fscal,dy31);
548             tz               = _mm256_mul_ps(fscal,dz31);
549
550             /* Update vectorial force */
551             fix3             = _mm256_add_ps(fix3,tx);
552             fiy3             = _mm256_add_ps(fiy3,ty);
553             fiz3             = _mm256_add_ps(fiz3,tz);
554
555             fjx1             = _mm256_add_ps(fjx1,tx);
556             fjy1             = _mm256_add_ps(fjy1,ty);
557             fjz1             = _mm256_add_ps(fjz1,tz);
558
559             }
560
561             /**************************
562              * CALCULATE INTERACTIONS *
563              **************************/
564
565             if (gmx_mm256_any_lt(rsq32,rcutoff2))
566             {
567
568             /* REACTION-FIELD ELECTROSTATICS */
569             velec            = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_add_ps(rinv32,_mm256_mul_ps(krf,rsq32)),crf));
570             felec            = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
571
572             cutoff_mask      = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
573
574             /* Update potential sum for this i atom from the interaction with this j atom. */
575             velec            = _mm256_and_ps(velec,cutoff_mask);
576             velecsum         = _mm256_add_ps(velecsum,velec);
577
578             fscal            = felec;
579
580             fscal            = _mm256_and_ps(fscal,cutoff_mask);
581
582             /* Calculate temporary vectorial force */
583             tx               = _mm256_mul_ps(fscal,dx32);
584             ty               = _mm256_mul_ps(fscal,dy32);
585             tz               = _mm256_mul_ps(fscal,dz32);
586
587             /* Update vectorial force */
588             fix3             = _mm256_add_ps(fix3,tx);
589             fiy3             = _mm256_add_ps(fiy3,ty);
590             fiz3             = _mm256_add_ps(fiz3,tz);
591
592             fjx2             = _mm256_add_ps(fjx2,tx);
593             fjy2             = _mm256_add_ps(fjy2,ty);
594             fjz2             = _mm256_add_ps(fjz2,tz);
595
596             }
597
598             /**************************
599              * CALCULATE INTERACTIONS *
600              **************************/
601
602             if (gmx_mm256_any_lt(rsq33,rcutoff2))
603             {
604
605             /* REACTION-FIELD ELECTROSTATICS */
606             velec            = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_add_ps(rinv33,_mm256_mul_ps(krf,rsq33)),crf));
607             felec            = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
608
609             cutoff_mask      = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
610
611             /* Update potential sum for this i atom from the interaction with this j atom. */
612             velec            = _mm256_and_ps(velec,cutoff_mask);
613             velecsum         = _mm256_add_ps(velecsum,velec);
614
615             fscal            = felec;
616
617             fscal            = _mm256_and_ps(fscal,cutoff_mask);
618
619             /* Calculate temporary vectorial force */
620             tx               = _mm256_mul_ps(fscal,dx33);
621             ty               = _mm256_mul_ps(fscal,dy33);
622             tz               = _mm256_mul_ps(fscal,dz33);
623
624             /* Update vectorial force */
625             fix3             = _mm256_add_ps(fix3,tx);
626             fiy3             = _mm256_add_ps(fiy3,ty);
627             fiz3             = _mm256_add_ps(fiz3,tz);
628
629             fjx3             = _mm256_add_ps(fjx3,tx);
630             fjy3             = _mm256_add_ps(fjy3,ty);
631             fjz3             = _mm256_add_ps(fjz3,tz);
632
633             }
634
635             fjptrA             = f+j_coord_offsetA;
636             fjptrB             = f+j_coord_offsetB;
637             fjptrC             = f+j_coord_offsetC;
638             fjptrD             = f+j_coord_offsetD;
639             fjptrE             = f+j_coord_offsetE;
640             fjptrF             = f+j_coord_offsetF;
641             fjptrG             = f+j_coord_offsetG;
642             fjptrH             = f+j_coord_offsetH;
643
644             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
645                                                       fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
646                                                       fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
647
648             /* Inner loop uses 324 flops */
649         }
650
651         if(jidx<j_index_end)
652         {
653
654             /* Get j neighbor index, and coordinate index */
655             jnrlistA         = jjnr[jidx];
656             jnrlistB         = jjnr[jidx+1];
657             jnrlistC         = jjnr[jidx+2];
658             jnrlistD         = jjnr[jidx+3];
659             jnrlistE         = jjnr[jidx+4];
660             jnrlistF         = jjnr[jidx+5];
661             jnrlistG         = jjnr[jidx+6];
662             jnrlistH         = jjnr[jidx+7];
663             /* Sign of each element will be negative for non-real atoms.
664              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
665              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
666              */
667             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
668                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
669                                             
670             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
671             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
672             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
673             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
674             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
675             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
676             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
677             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
678             j_coord_offsetA  = DIM*jnrA;
679             j_coord_offsetB  = DIM*jnrB;
680             j_coord_offsetC  = DIM*jnrC;
681             j_coord_offsetD  = DIM*jnrD;
682             j_coord_offsetE  = DIM*jnrE;
683             j_coord_offsetF  = DIM*jnrF;
684             j_coord_offsetG  = DIM*jnrG;
685             j_coord_offsetH  = DIM*jnrH;
686
687             /* load j atom coordinates */
688             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
689                                                  x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
690                                                  x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
691                                                  x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
692                                                  &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
693
694             /* Calculate displacement vector */
695             dx11             = _mm256_sub_ps(ix1,jx1);
696             dy11             = _mm256_sub_ps(iy1,jy1);
697             dz11             = _mm256_sub_ps(iz1,jz1);
698             dx12             = _mm256_sub_ps(ix1,jx2);
699             dy12             = _mm256_sub_ps(iy1,jy2);
700             dz12             = _mm256_sub_ps(iz1,jz2);
701             dx13             = _mm256_sub_ps(ix1,jx3);
702             dy13             = _mm256_sub_ps(iy1,jy3);
703             dz13             = _mm256_sub_ps(iz1,jz3);
704             dx21             = _mm256_sub_ps(ix2,jx1);
705             dy21             = _mm256_sub_ps(iy2,jy1);
706             dz21             = _mm256_sub_ps(iz2,jz1);
707             dx22             = _mm256_sub_ps(ix2,jx2);
708             dy22             = _mm256_sub_ps(iy2,jy2);
709             dz22             = _mm256_sub_ps(iz2,jz2);
710             dx23             = _mm256_sub_ps(ix2,jx3);
711             dy23             = _mm256_sub_ps(iy2,jy3);
712             dz23             = _mm256_sub_ps(iz2,jz3);
713             dx31             = _mm256_sub_ps(ix3,jx1);
714             dy31             = _mm256_sub_ps(iy3,jy1);
715             dz31             = _mm256_sub_ps(iz3,jz1);
716             dx32             = _mm256_sub_ps(ix3,jx2);
717             dy32             = _mm256_sub_ps(iy3,jy2);
718             dz32             = _mm256_sub_ps(iz3,jz2);
719             dx33             = _mm256_sub_ps(ix3,jx3);
720             dy33             = _mm256_sub_ps(iy3,jy3);
721             dz33             = _mm256_sub_ps(iz3,jz3);
722
723             /* Calculate squared distance and things based on it */
724             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
725             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
726             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
727             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
728             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
729             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
730             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
731             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
732             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
733
734             rinv11           = avx256_invsqrt_f(rsq11);
735             rinv12           = avx256_invsqrt_f(rsq12);
736             rinv13           = avx256_invsqrt_f(rsq13);
737             rinv21           = avx256_invsqrt_f(rsq21);
738             rinv22           = avx256_invsqrt_f(rsq22);
739             rinv23           = avx256_invsqrt_f(rsq23);
740             rinv31           = avx256_invsqrt_f(rsq31);
741             rinv32           = avx256_invsqrt_f(rsq32);
742             rinv33           = avx256_invsqrt_f(rsq33);
743
744             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
745             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
746             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
747             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
748             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
749             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
750             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
751             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
752             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
753
754             fjx1             = _mm256_setzero_ps();
755             fjy1             = _mm256_setzero_ps();
756             fjz1             = _mm256_setzero_ps();
757             fjx2             = _mm256_setzero_ps();
758             fjy2             = _mm256_setzero_ps();
759             fjz2             = _mm256_setzero_ps();
760             fjx3             = _mm256_setzero_ps();
761             fjy3             = _mm256_setzero_ps();
762             fjz3             = _mm256_setzero_ps();
763
764             /**************************
765              * CALCULATE INTERACTIONS *
766              **************************/
767
768             if (gmx_mm256_any_lt(rsq11,rcutoff2))
769             {
770
771             /* REACTION-FIELD ELECTROSTATICS */
772             velec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
773             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
774
775             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
776
777             /* Update potential sum for this i atom from the interaction with this j atom. */
778             velec            = _mm256_and_ps(velec,cutoff_mask);
779             velec            = _mm256_andnot_ps(dummy_mask,velec);
780             velecsum         = _mm256_add_ps(velecsum,velec);
781
782             fscal            = felec;
783
784             fscal            = _mm256_and_ps(fscal,cutoff_mask);
785
786             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
787
788             /* Calculate temporary vectorial force */
789             tx               = _mm256_mul_ps(fscal,dx11);
790             ty               = _mm256_mul_ps(fscal,dy11);
791             tz               = _mm256_mul_ps(fscal,dz11);
792
793             /* Update vectorial force */
794             fix1             = _mm256_add_ps(fix1,tx);
795             fiy1             = _mm256_add_ps(fiy1,ty);
796             fiz1             = _mm256_add_ps(fiz1,tz);
797
798             fjx1             = _mm256_add_ps(fjx1,tx);
799             fjy1             = _mm256_add_ps(fjy1,ty);
800             fjz1             = _mm256_add_ps(fjz1,tz);
801
802             }
803
804             /**************************
805              * CALCULATE INTERACTIONS *
806              **************************/
807
808             if (gmx_mm256_any_lt(rsq12,rcutoff2))
809             {
810
811             /* REACTION-FIELD ELECTROSTATICS */
812             velec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
813             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
814
815             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
816
817             /* Update potential sum for this i atom from the interaction with this j atom. */
818             velec            = _mm256_and_ps(velec,cutoff_mask);
819             velec            = _mm256_andnot_ps(dummy_mask,velec);
820             velecsum         = _mm256_add_ps(velecsum,velec);
821
822             fscal            = felec;
823
824             fscal            = _mm256_and_ps(fscal,cutoff_mask);
825
826             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
827
828             /* Calculate temporary vectorial force */
829             tx               = _mm256_mul_ps(fscal,dx12);
830             ty               = _mm256_mul_ps(fscal,dy12);
831             tz               = _mm256_mul_ps(fscal,dz12);
832
833             /* Update vectorial force */
834             fix1             = _mm256_add_ps(fix1,tx);
835             fiy1             = _mm256_add_ps(fiy1,ty);
836             fiz1             = _mm256_add_ps(fiz1,tz);
837
838             fjx2             = _mm256_add_ps(fjx2,tx);
839             fjy2             = _mm256_add_ps(fjy2,ty);
840             fjz2             = _mm256_add_ps(fjz2,tz);
841
842             }
843
844             /**************************
845              * CALCULATE INTERACTIONS *
846              **************************/
847
848             if (gmx_mm256_any_lt(rsq13,rcutoff2))
849             {
850
851             /* REACTION-FIELD ELECTROSTATICS */
852             velec            = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_add_ps(rinv13,_mm256_mul_ps(krf,rsq13)),crf));
853             felec            = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
854
855             cutoff_mask      = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
856
857             /* Update potential sum for this i atom from the interaction with this j atom. */
858             velec            = _mm256_and_ps(velec,cutoff_mask);
859             velec            = _mm256_andnot_ps(dummy_mask,velec);
860             velecsum         = _mm256_add_ps(velecsum,velec);
861
862             fscal            = felec;
863
864             fscal            = _mm256_and_ps(fscal,cutoff_mask);
865
866             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
867
868             /* Calculate temporary vectorial force */
869             tx               = _mm256_mul_ps(fscal,dx13);
870             ty               = _mm256_mul_ps(fscal,dy13);
871             tz               = _mm256_mul_ps(fscal,dz13);
872
873             /* Update vectorial force */
874             fix1             = _mm256_add_ps(fix1,tx);
875             fiy1             = _mm256_add_ps(fiy1,ty);
876             fiz1             = _mm256_add_ps(fiz1,tz);
877
878             fjx3             = _mm256_add_ps(fjx3,tx);
879             fjy3             = _mm256_add_ps(fjy3,ty);
880             fjz3             = _mm256_add_ps(fjz3,tz);
881
882             }
883
884             /**************************
885              * CALCULATE INTERACTIONS *
886              **************************/
887
888             if (gmx_mm256_any_lt(rsq21,rcutoff2))
889             {
890
891             /* REACTION-FIELD ELECTROSTATICS */
892             velec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
893             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
894
895             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
896
897             /* Update potential sum for this i atom from the interaction with this j atom. */
898             velec            = _mm256_and_ps(velec,cutoff_mask);
899             velec            = _mm256_andnot_ps(dummy_mask,velec);
900             velecsum         = _mm256_add_ps(velecsum,velec);
901
902             fscal            = felec;
903
904             fscal            = _mm256_and_ps(fscal,cutoff_mask);
905
906             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
907
908             /* Calculate temporary vectorial force */
909             tx               = _mm256_mul_ps(fscal,dx21);
910             ty               = _mm256_mul_ps(fscal,dy21);
911             tz               = _mm256_mul_ps(fscal,dz21);
912
913             /* Update vectorial force */
914             fix2             = _mm256_add_ps(fix2,tx);
915             fiy2             = _mm256_add_ps(fiy2,ty);
916             fiz2             = _mm256_add_ps(fiz2,tz);
917
918             fjx1             = _mm256_add_ps(fjx1,tx);
919             fjy1             = _mm256_add_ps(fjy1,ty);
920             fjz1             = _mm256_add_ps(fjz1,tz);
921
922             }
923
924             /**************************
925              * CALCULATE INTERACTIONS *
926              **************************/
927
928             if (gmx_mm256_any_lt(rsq22,rcutoff2))
929             {
930
931             /* REACTION-FIELD ELECTROSTATICS */
932             velec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
933             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
934
935             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
936
937             /* Update potential sum for this i atom from the interaction with this j atom. */
938             velec            = _mm256_and_ps(velec,cutoff_mask);
939             velec            = _mm256_andnot_ps(dummy_mask,velec);
940             velecsum         = _mm256_add_ps(velecsum,velec);
941
942             fscal            = felec;
943
944             fscal            = _mm256_and_ps(fscal,cutoff_mask);
945
946             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
947
948             /* Calculate temporary vectorial force */
949             tx               = _mm256_mul_ps(fscal,dx22);
950             ty               = _mm256_mul_ps(fscal,dy22);
951             tz               = _mm256_mul_ps(fscal,dz22);
952
953             /* Update vectorial force */
954             fix2             = _mm256_add_ps(fix2,tx);
955             fiy2             = _mm256_add_ps(fiy2,ty);
956             fiz2             = _mm256_add_ps(fiz2,tz);
957
958             fjx2             = _mm256_add_ps(fjx2,tx);
959             fjy2             = _mm256_add_ps(fjy2,ty);
960             fjz2             = _mm256_add_ps(fjz2,tz);
961
962             }
963
964             /**************************
965              * CALCULATE INTERACTIONS *
966              **************************/
967
968             if (gmx_mm256_any_lt(rsq23,rcutoff2))
969             {
970
971             /* REACTION-FIELD ELECTROSTATICS */
972             velec            = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_add_ps(rinv23,_mm256_mul_ps(krf,rsq23)),crf));
973             felec            = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
974
975             cutoff_mask      = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
976
977             /* Update potential sum for this i atom from the interaction with this j atom. */
978             velec            = _mm256_and_ps(velec,cutoff_mask);
979             velec            = _mm256_andnot_ps(dummy_mask,velec);
980             velecsum         = _mm256_add_ps(velecsum,velec);
981
982             fscal            = felec;
983
984             fscal            = _mm256_and_ps(fscal,cutoff_mask);
985
986             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
987
988             /* Calculate temporary vectorial force */
989             tx               = _mm256_mul_ps(fscal,dx23);
990             ty               = _mm256_mul_ps(fscal,dy23);
991             tz               = _mm256_mul_ps(fscal,dz23);
992
993             /* Update vectorial force */
994             fix2             = _mm256_add_ps(fix2,tx);
995             fiy2             = _mm256_add_ps(fiy2,ty);
996             fiz2             = _mm256_add_ps(fiz2,tz);
997
998             fjx3             = _mm256_add_ps(fjx3,tx);
999             fjy3             = _mm256_add_ps(fjy3,ty);
1000             fjz3             = _mm256_add_ps(fjz3,tz);
1001
1002             }
1003
1004             /**************************
1005              * CALCULATE INTERACTIONS *
1006              **************************/
1007
1008             if (gmx_mm256_any_lt(rsq31,rcutoff2))
1009             {
1010
1011             /* REACTION-FIELD ELECTROSTATICS */
1012             velec            = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_add_ps(rinv31,_mm256_mul_ps(krf,rsq31)),crf));
1013             felec            = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
1014
1015             cutoff_mask      = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
1016
1017             /* Update potential sum for this i atom from the interaction with this j atom. */
1018             velec            = _mm256_and_ps(velec,cutoff_mask);
1019             velec            = _mm256_andnot_ps(dummy_mask,velec);
1020             velecsum         = _mm256_add_ps(velecsum,velec);
1021
1022             fscal            = felec;
1023
1024             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1025
1026             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1027
1028             /* Calculate temporary vectorial force */
1029             tx               = _mm256_mul_ps(fscal,dx31);
1030             ty               = _mm256_mul_ps(fscal,dy31);
1031             tz               = _mm256_mul_ps(fscal,dz31);
1032
1033             /* Update vectorial force */
1034             fix3             = _mm256_add_ps(fix3,tx);
1035             fiy3             = _mm256_add_ps(fiy3,ty);
1036             fiz3             = _mm256_add_ps(fiz3,tz);
1037
1038             fjx1             = _mm256_add_ps(fjx1,tx);
1039             fjy1             = _mm256_add_ps(fjy1,ty);
1040             fjz1             = _mm256_add_ps(fjz1,tz);
1041
1042             }
1043
1044             /**************************
1045              * CALCULATE INTERACTIONS *
1046              **************************/
1047
1048             if (gmx_mm256_any_lt(rsq32,rcutoff2))
1049             {
1050
1051             /* REACTION-FIELD ELECTROSTATICS */
1052             velec            = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_add_ps(rinv32,_mm256_mul_ps(krf,rsq32)),crf));
1053             felec            = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
1054
1055             cutoff_mask      = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
1056
1057             /* Update potential sum for this i atom from the interaction with this j atom. */
1058             velec            = _mm256_and_ps(velec,cutoff_mask);
1059             velec            = _mm256_andnot_ps(dummy_mask,velec);
1060             velecsum         = _mm256_add_ps(velecsum,velec);
1061
1062             fscal            = felec;
1063
1064             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1065
1066             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1067
1068             /* Calculate temporary vectorial force */
1069             tx               = _mm256_mul_ps(fscal,dx32);
1070             ty               = _mm256_mul_ps(fscal,dy32);
1071             tz               = _mm256_mul_ps(fscal,dz32);
1072
1073             /* Update vectorial force */
1074             fix3             = _mm256_add_ps(fix3,tx);
1075             fiy3             = _mm256_add_ps(fiy3,ty);
1076             fiz3             = _mm256_add_ps(fiz3,tz);
1077
1078             fjx2             = _mm256_add_ps(fjx2,tx);
1079             fjy2             = _mm256_add_ps(fjy2,ty);
1080             fjz2             = _mm256_add_ps(fjz2,tz);
1081
1082             }
1083
1084             /**************************
1085              * CALCULATE INTERACTIONS *
1086              **************************/
1087
1088             if (gmx_mm256_any_lt(rsq33,rcutoff2))
1089             {
1090
1091             /* REACTION-FIELD ELECTROSTATICS */
1092             velec            = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_add_ps(rinv33,_mm256_mul_ps(krf,rsq33)),crf));
1093             felec            = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
1094
1095             cutoff_mask      = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
1096
1097             /* Update potential sum for this i atom from the interaction with this j atom. */
1098             velec            = _mm256_and_ps(velec,cutoff_mask);
1099             velec            = _mm256_andnot_ps(dummy_mask,velec);
1100             velecsum         = _mm256_add_ps(velecsum,velec);
1101
1102             fscal            = felec;
1103
1104             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1105
1106             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1107
1108             /* Calculate temporary vectorial force */
1109             tx               = _mm256_mul_ps(fscal,dx33);
1110             ty               = _mm256_mul_ps(fscal,dy33);
1111             tz               = _mm256_mul_ps(fscal,dz33);
1112
1113             /* Update vectorial force */
1114             fix3             = _mm256_add_ps(fix3,tx);
1115             fiy3             = _mm256_add_ps(fiy3,ty);
1116             fiz3             = _mm256_add_ps(fiz3,tz);
1117
1118             fjx3             = _mm256_add_ps(fjx3,tx);
1119             fjy3             = _mm256_add_ps(fjy3,ty);
1120             fjz3             = _mm256_add_ps(fjz3,tz);
1121
1122             }
1123
1124             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1125             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1126             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1127             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1128             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1129             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1130             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1131             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1132
1133             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1134                                                       fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
1135                                                       fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1136
1137             /* Inner loop uses 324 flops */
1138         }
1139
1140         /* End of innermost loop */
1141
1142         gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1143                                                  f+i_coord_offset+DIM,fshift+i_shift_offset);
1144
1145         ggid                        = gid[iidx];
1146         /* Update potential energies */
1147         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1148
1149         /* Increment number of inner iterations */
1150         inneriter                  += j_index_end - j_index_start;
1151
1152         /* Outer loop uses 19 flops */
1153     }
1154
1155     /* Increment number of outer iterations */
1156     outeriter        += nri;
1157
1158     /* Update outer/inner flops */
1159
1160     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*324);
1161 }
1162 /*
1163  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_avx_256_single
1164  * Electrostatics interaction: ReactionField
1165  * VdW interaction:            None
1166  * Geometry:                   Water4-Water4
1167  * Calculate force/pot:        Force
1168  */
1169 void
1170 nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_avx_256_single
1171                     (t_nblist                    * gmx_restrict       nlist,
1172                      rvec                        * gmx_restrict          xx,
1173                      rvec                        * gmx_restrict          ff,
1174                      struct t_forcerec           * gmx_restrict          fr,
1175                      t_mdatoms                   * gmx_restrict     mdatoms,
1176                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1177                      t_nrnb                      * gmx_restrict        nrnb)
1178 {
1179     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1180      * just 0 for non-waters.
1181      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1182      * jnr indices corresponding to data put in the four positions in the SIMD register.
1183      */
1184     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1185     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1186     int              jnrA,jnrB,jnrC,jnrD;
1187     int              jnrE,jnrF,jnrG,jnrH;
1188     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1189     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1190     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1191     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1192     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1193     real             rcutoff_scalar;
1194     real             *shiftvec,*fshift,*x,*f;
1195     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1196     real             scratch[4*DIM];
1197     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1198     real *           vdwioffsetptr1;
1199     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1200     real *           vdwioffsetptr2;
1201     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1202     real *           vdwioffsetptr3;
1203     __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1204     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1205     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1206     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1207     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1208     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
1209     __m256           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1210     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1211     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1212     __m256           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1213     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1214     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1215     __m256           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1216     __m256           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1217     __m256           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1218     __m256           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1219     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
1220     real             *charge;
1221     __m256           dummy_mask,cutoff_mask;
1222     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1223     __m256           one     = _mm256_set1_ps(1.0);
1224     __m256           two     = _mm256_set1_ps(2.0);
1225     x                = xx[0];
1226     f                = ff[0];
1227
1228     nri              = nlist->nri;
1229     iinr             = nlist->iinr;
1230     jindex           = nlist->jindex;
1231     jjnr             = nlist->jjnr;
1232     shiftidx         = nlist->shift;
1233     gid              = nlist->gid;
1234     shiftvec         = fr->shift_vec[0];
1235     fshift           = fr->fshift[0];
1236     facel            = _mm256_set1_ps(fr->ic->epsfac);
1237     charge           = mdatoms->chargeA;
1238     krf              = _mm256_set1_ps(fr->ic->k_rf);
1239     krf2             = _mm256_set1_ps(fr->ic->k_rf*2.0);
1240     crf              = _mm256_set1_ps(fr->ic->c_rf);
1241
1242     /* Setup water-specific parameters */
1243     inr              = nlist->iinr[0];
1244     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1245     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1246     iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
1247
1248     jq1              = _mm256_set1_ps(charge[inr+1]);
1249     jq2              = _mm256_set1_ps(charge[inr+2]);
1250     jq3              = _mm256_set1_ps(charge[inr+3]);
1251     qq11             = _mm256_mul_ps(iq1,jq1);
1252     qq12             = _mm256_mul_ps(iq1,jq2);
1253     qq13             = _mm256_mul_ps(iq1,jq3);
1254     qq21             = _mm256_mul_ps(iq2,jq1);
1255     qq22             = _mm256_mul_ps(iq2,jq2);
1256     qq23             = _mm256_mul_ps(iq2,jq3);
1257     qq31             = _mm256_mul_ps(iq3,jq1);
1258     qq32             = _mm256_mul_ps(iq3,jq2);
1259     qq33             = _mm256_mul_ps(iq3,jq3);
1260
1261     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1262     rcutoff_scalar   = fr->ic->rcoulomb;
1263     rcutoff          = _mm256_set1_ps(rcutoff_scalar);
1264     rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
1265
1266     /* Avoid stupid compiler warnings */
1267     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1268     j_coord_offsetA = 0;
1269     j_coord_offsetB = 0;
1270     j_coord_offsetC = 0;
1271     j_coord_offsetD = 0;
1272     j_coord_offsetE = 0;
1273     j_coord_offsetF = 0;
1274     j_coord_offsetG = 0;
1275     j_coord_offsetH = 0;
1276
1277     outeriter        = 0;
1278     inneriter        = 0;
1279
1280     for(iidx=0;iidx<4*DIM;iidx++)
1281     {
1282         scratch[iidx] = 0.0;
1283     }
1284
1285     /* Start outer loop over neighborlists */
1286     for(iidx=0; iidx<nri; iidx++)
1287     {
1288         /* Load shift vector for this list */
1289         i_shift_offset   = DIM*shiftidx[iidx];
1290
1291         /* Load limits for loop over neighbors */
1292         j_index_start    = jindex[iidx];
1293         j_index_end      = jindex[iidx+1];
1294
1295         /* Get outer coordinate index */
1296         inr              = iinr[iidx];
1297         i_coord_offset   = DIM*inr;
1298
1299         /* Load i particle coords and add shift vector */
1300         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1301                                                     &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1302
1303         fix1             = _mm256_setzero_ps();
1304         fiy1             = _mm256_setzero_ps();
1305         fiz1             = _mm256_setzero_ps();
1306         fix2             = _mm256_setzero_ps();
1307         fiy2             = _mm256_setzero_ps();
1308         fiz2             = _mm256_setzero_ps();
1309         fix3             = _mm256_setzero_ps();
1310         fiy3             = _mm256_setzero_ps();
1311         fiz3             = _mm256_setzero_ps();
1312
1313         /* Start inner kernel loop */
1314         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1315         {
1316
1317             /* Get j neighbor index, and coordinate index */
1318             jnrA             = jjnr[jidx];
1319             jnrB             = jjnr[jidx+1];
1320             jnrC             = jjnr[jidx+2];
1321             jnrD             = jjnr[jidx+3];
1322             jnrE             = jjnr[jidx+4];
1323             jnrF             = jjnr[jidx+5];
1324             jnrG             = jjnr[jidx+6];
1325             jnrH             = jjnr[jidx+7];
1326             j_coord_offsetA  = DIM*jnrA;
1327             j_coord_offsetB  = DIM*jnrB;
1328             j_coord_offsetC  = DIM*jnrC;
1329             j_coord_offsetD  = DIM*jnrD;
1330             j_coord_offsetE  = DIM*jnrE;
1331             j_coord_offsetF  = DIM*jnrF;
1332             j_coord_offsetG  = DIM*jnrG;
1333             j_coord_offsetH  = DIM*jnrH;
1334
1335             /* load j atom coordinates */
1336             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1337                                                  x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1338                                                  x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
1339                                                  x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
1340                                                  &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1341
1342             /* Calculate displacement vector */
1343             dx11             = _mm256_sub_ps(ix1,jx1);
1344             dy11             = _mm256_sub_ps(iy1,jy1);
1345             dz11             = _mm256_sub_ps(iz1,jz1);
1346             dx12             = _mm256_sub_ps(ix1,jx2);
1347             dy12             = _mm256_sub_ps(iy1,jy2);
1348             dz12             = _mm256_sub_ps(iz1,jz2);
1349             dx13             = _mm256_sub_ps(ix1,jx3);
1350             dy13             = _mm256_sub_ps(iy1,jy3);
1351             dz13             = _mm256_sub_ps(iz1,jz3);
1352             dx21             = _mm256_sub_ps(ix2,jx1);
1353             dy21             = _mm256_sub_ps(iy2,jy1);
1354             dz21             = _mm256_sub_ps(iz2,jz1);
1355             dx22             = _mm256_sub_ps(ix2,jx2);
1356             dy22             = _mm256_sub_ps(iy2,jy2);
1357             dz22             = _mm256_sub_ps(iz2,jz2);
1358             dx23             = _mm256_sub_ps(ix2,jx3);
1359             dy23             = _mm256_sub_ps(iy2,jy3);
1360             dz23             = _mm256_sub_ps(iz2,jz3);
1361             dx31             = _mm256_sub_ps(ix3,jx1);
1362             dy31             = _mm256_sub_ps(iy3,jy1);
1363             dz31             = _mm256_sub_ps(iz3,jz1);
1364             dx32             = _mm256_sub_ps(ix3,jx2);
1365             dy32             = _mm256_sub_ps(iy3,jy2);
1366             dz32             = _mm256_sub_ps(iz3,jz2);
1367             dx33             = _mm256_sub_ps(ix3,jx3);
1368             dy33             = _mm256_sub_ps(iy3,jy3);
1369             dz33             = _mm256_sub_ps(iz3,jz3);
1370
1371             /* Calculate squared distance and things based on it */
1372             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1373             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1374             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1375             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1376             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1377             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1378             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1379             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1380             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1381
1382             rinv11           = avx256_invsqrt_f(rsq11);
1383             rinv12           = avx256_invsqrt_f(rsq12);
1384             rinv13           = avx256_invsqrt_f(rsq13);
1385             rinv21           = avx256_invsqrt_f(rsq21);
1386             rinv22           = avx256_invsqrt_f(rsq22);
1387             rinv23           = avx256_invsqrt_f(rsq23);
1388             rinv31           = avx256_invsqrt_f(rsq31);
1389             rinv32           = avx256_invsqrt_f(rsq32);
1390             rinv33           = avx256_invsqrt_f(rsq33);
1391
1392             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1393             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1394             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
1395             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1396             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1397             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
1398             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
1399             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
1400             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
1401
1402             fjx1             = _mm256_setzero_ps();
1403             fjy1             = _mm256_setzero_ps();
1404             fjz1             = _mm256_setzero_ps();
1405             fjx2             = _mm256_setzero_ps();
1406             fjy2             = _mm256_setzero_ps();
1407             fjz2             = _mm256_setzero_ps();
1408             fjx3             = _mm256_setzero_ps();
1409             fjy3             = _mm256_setzero_ps();
1410             fjz3             = _mm256_setzero_ps();
1411
1412             /**************************
1413              * CALCULATE INTERACTIONS *
1414              **************************/
1415
1416             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1417             {
1418
1419             /* REACTION-FIELD ELECTROSTATICS */
1420             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
1421
1422             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1423
1424             fscal            = felec;
1425
1426             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1427
1428             /* Calculate temporary vectorial force */
1429             tx               = _mm256_mul_ps(fscal,dx11);
1430             ty               = _mm256_mul_ps(fscal,dy11);
1431             tz               = _mm256_mul_ps(fscal,dz11);
1432
1433             /* Update vectorial force */
1434             fix1             = _mm256_add_ps(fix1,tx);
1435             fiy1             = _mm256_add_ps(fiy1,ty);
1436             fiz1             = _mm256_add_ps(fiz1,tz);
1437
1438             fjx1             = _mm256_add_ps(fjx1,tx);
1439             fjy1             = _mm256_add_ps(fjy1,ty);
1440             fjz1             = _mm256_add_ps(fjz1,tz);
1441
1442             }
1443
1444             /**************************
1445              * CALCULATE INTERACTIONS *
1446              **************************/
1447
1448             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1449             {
1450
1451             /* REACTION-FIELD ELECTROSTATICS */
1452             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1453
1454             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1455
1456             fscal            = felec;
1457
1458             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1459
1460             /* Calculate temporary vectorial force */
1461             tx               = _mm256_mul_ps(fscal,dx12);
1462             ty               = _mm256_mul_ps(fscal,dy12);
1463             tz               = _mm256_mul_ps(fscal,dz12);
1464
1465             /* Update vectorial force */
1466             fix1             = _mm256_add_ps(fix1,tx);
1467             fiy1             = _mm256_add_ps(fiy1,ty);
1468             fiz1             = _mm256_add_ps(fiz1,tz);
1469
1470             fjx2             = _mm256_add_ps(fjx2,tx);
1471             fjy2             = _mm256_add_ps(fjy2,ty);
1472             fjz2             = _mm256_add_ps(fjz2,tz);
1473
1474             }
1475
1476             /**************************
1477              * CALCULATE INTERACTIONS *
1478              **************************/
1479
1480             if (gmx_mm256_any_lt(rsq13,rcutoff2))
1481             {
1482
1483             /* REACTION-FIELD ELECTROSTATICS */
1484             felec            = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
1485
1486             cutoff_mask      = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
1487
1488             fscal            = felec;
1489
1490             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1491
1492             /* Calculate temporary vectorial force */
1493             tx               = _mm256_mul_ps(fscal,dx13);
1494             ty               = _mm256_mul_ps(fscal,dy13);
1495             tz               = _mm256_mul_ps(fscal,dz13);
1496
1497             /* Update vectorial force */
1498             fix1             = _mm256_add_ps(fix1,tx);
1499             fiy1             = _mm256_add_ps(fiy1,ty);
1500             fiz1             = _mm256_add_ps(fiz1,tz);
1501
1502             fjx3             = _mm256_add_ps(fjx3,tx);
1503             fjy3             = _mm256_add_ps(fjy3,ty);
1504             fjz3             = _mm256_add_ps(fjz3,tz);
1505
1506             }
1507
1508             /**************************
1509              * CALCULATE INTERACTIONS *
1510              **************************/
1511
1512             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1513             {
1514
1515             /* REACTION-FIELD ELECTROSTATICS */
1516             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1517
1518             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1519
1520             fscal            = felec;
1521
1522             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1523
1524             /* Calculate temporary vectorial force */
1525             tx               = _mm256_mul_ps(fscal,dx21);
1526             ty               = _mm256_mul_ps(fscal,dy21);
1527             tz               = _mm256_mul_ps(fscal,dz21);
1528
1529             /* Update vectorial force */
1530             fix2             = _mm256_add_ps(fix2,tx);
1531             fiy2             = _mm256_add_ps(fiy2,ty);
1532             fiz2             = _mm256_add_ps(fiz2,tz);
1533
1534             fjx1             = _mm256_add_ps(fjx1,tx);
1535             fjy1             = _mm256_add_ps(fjy1,ty);
1536             fjz1             = _mm256_add_ps(fjz1,tz);
1537
1538             }
1539
1540             /**************************
1541              * CALCULATE INTERACTIONS *
1542              **************************/
1543
1544             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1545             {
1546
1547             /* REACTION-FIELD ELECTROSTATICS */
1548             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1549
1550             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1551
1552             fscal            = felec;
1553
1554             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1555
1556             /* Calculate temporary vectorial force */
1557             tx               = _mm256_mul_ps(fscal,dx22);
1558             ty               = _mm256_mul_ps(fscal,dy22);
1559             tz               = _mm256_mul_ps(fscal,dz22);
1560
1561             /* Update vectorial force */
1562             fix2             = _mm256_add_ps(fix2,tx);
1563             fiy2             = _mm256_add_ps(fiy2,ty);
1564             fiz2             = _mm256_add_ps(fiz2,tz);
1565
1566             fjx2             = _mm256_add_ps(fjx2,tx);
1567             fjy2             = _mm256_add_ps(fjy2,ty);
1568             fjz2             = _mm256_add_ps(fjz2,tz);
1569
1570             }
1571
1572             /**************************
1573              * CALCULATE INTERACTIONS *
1574              **************************/
1575
1576             if (gmx_mm256_any_lt(rsq23,rcutoff2))
1577             {
1578
1579             /* REACTION-FIELD ELECTROSTATICS */
1580             felec            = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
1581
1582             cutoff_mask      = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
1583
1584             fscal            = felec;
1585
1586             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1587
1588             /* Calculate temporary vectorial force */
1589             tx               = _mm256_mul_ps(fscal,dx23);
1590             ty               = _mm256_mul_ps(fscal,dy23);
1591             tz               = _mm256_mul_ps(fscal,dz23);
1592
1593             /* Update vectorial force */
1594             fix2             = _mm256_add_ps(fix2,tx);
1595             fiy2             = _mm256_add_ps(fiy2,ty);
1596             fiz2             = _mm256_add_ps(fiz2,tz);
1597
1598             fjx3             = _mm256_add_ps(fjx3,tx);
1599             fjy3             = _mm256_add_ps(fjy3,ty);
1600             fjz3             = _mm256_add_ps(fjz3,tz);
1601
1602             }
1603
1604             /**************************
1605              * CALCULATE INTERACTIONS *
1606              **************************/
1607
1608             if (gmx_mm256_any_lt(rsq31,rcutoff2))
1609             {
1610
1611             /* REACTION-FIELD ELECTROSTATICS */
1612             felec            = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
1613
1614             cutoff_mask      = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
1615
1616             fscal            = felec;
1617
1618             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1619
1620             /* Calculate temporary vectorial force */
1621             tx               = _mm256_mul_ps(fscal,dx31);
1622             ty               = _mm256_mul_ps(fscal,dy31);
1623             tz               = _mm256_mul_ps(fscal,dz31);
1624
1625             /* Update vectorial force */
1626             fix3             = _mm256_add_ps(fix3,tx);
1627             fiy3             = _mm256_add_ps(fiy3,ty);
1628             fiz3             = _mm256_add_ps(fiz3,tz);
1629
1630             fjx1             = _mm256_add_ps(fjx1,tx);
1631             fjy1             = _mm256_add_ps(fjy1,ty);
1632             fjz1             = _mm256_add_ps(fjz1,tz);
1633
1634             }
1635
1636             /**************************
1637              * CALCULATE INTERACTIONS *
1638              **************************/
1639
1640             if (gmx_mm256_any_lt(rsq32,rcutoff2))
1641             {
1642
1643             /* REACTION-FIELD ELECTROSTATICS */
1644             felec            = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
1645
1646             cutoff_mask      = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
1647
1648             fscal            = felec;
1649
1650             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1651
1652             /* Calculate temporary vectorial force */
1653             tx               = _mm256_mul_ps(fscal,dx32);
1654             ty               = _mm256_mul_ps(fscal,dy32);
1655             tz               = _mm256_mul_ps(fscal,dz32);
1656
1657             /* Update vectorial force */
1658             fix3             = _mm256_add_ps(fix3,tx);
1659             fiy3             = _mm256_add_ps(fiy3,ty);
1660             fiz3             = _mm256_add_ps(fiz3,tz);
1661
1662             fjx2             = _mm256_add_ps(fjx2,tx);
1663             fjy2             = _mm256_add_ps(fjy2,ty);
1664             fjz2             = _mm256_add_ps(fjz2,tz);
1665
1666             }
1667
1668             /**************************
1669              * CALCULATE INTERACTIONS *
1670              **************************/
1671
1672             if (gmx_mm256_any_lt(rsq33,rcutoff2))
1673             {
1674
1675             /* REACTION-FIELD ELECTROSTATICS */
1676             felec            = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
1677
1678             cutoff_mask      = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
1679
1680             fscal            = felec;
1681
1682             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1683
1684             /* Calculate temporary vectorial force */
1685             tx               = _mm256_mul_ps(fscal,dx33);
1686             ty               = _mm256_mul_ps(fscal,dy33);
1687             tz               = _mm256_mul_ps(fscal,dz33);
1688
1689             /* Update vectorial force */
1690             fix3             = _mm256_add_ps(fix3,tx);
1691             fiy3             = _mm256_add_ps(fiy3,ty);
1692             fiz3             = _mm256_add_ps(fiz3,tz);
1693
1694             fjx3             = _mm256_add_ps(fjx3,tx);
1695             fjy3             = _mm256_add_ps(fjy3,ty);
1696             fjz3             = _mm256_add_ps(fjz3,tz);
1697
1698             }
1699
1700             fjptrA             = f+j_coord_offsetA;
1701             fjptrB             = f+j_coord_offsetB;
1702             fjptrC             = f+j_coord_offsetC;
1703             fjptrD             = f+j_coord_offsetD;
1704             fjptrE             = f+j_coord_offsetE;
1705             fjptrF             = f+j_coord_offsetF;
1706             fjptrG             = f+j_coord_offsetG;
1707             fjptrH             = f+j_coord_offsetH;
1708
1709             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1710                                                       fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
1711                                                       fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1712
1713             /* Inner loop uses 270 flops */
1714         }
1715
1716         if(jidx<j_index_end)
1717         {
1718
1719             /* Get j neighbor index, and coordinate index */
1720             jnrlistA         = jjnr[jidx];
1721             jnrlistB         = jjnr[jidx+1];
1722             jnrlistC         = jjnr[jidx+2];
1723             jnrlistD         = jjnr[jidx+3];
1724             jnrlistE         = jjnr[jidx+4];
1725             jnrlistF         = jjnr[jidx+5];
1726             jnrlistG         = jjnr[jidx+6];
1727             jnrlistH         = jjnr[jidx+7];
1728             /* Sign of each element will be negative for non-real atoms.
1729              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1730              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1731              */
1732             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1733                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1734                                             
1735             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1736             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1737             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1738             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1739             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
1740             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
1741             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
1742             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
1743             j_coord_offsetA  = DIM*jnrA;
1744             j_coord_offsetB  = DIM*jnrB;
1745             j_coord_offsetC  = DIM*jnrC;
1746             j_coord_offsetD  = DIM*jnrD;
1747             j_coord_offsetE  = DIM*jnrE;
1748             j_coord_offsetF  = DIM*jnrF;
1749             j_coord_offsetG  = DIM*jnrG;
1750             j_coord_offsetH  = DIM*jnrH;
1751
1752             /* load j atom coordinates */
1753             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1754                                                  x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1755                                                  x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
1756                                                  x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
1757                                                  &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1758
1759             /* Calculate displacement vector */
1760             dx11             = _mm256_sub_ps(ix1,jx1);
1761             dy11             = _mm256_sub_ps(iy1,jy1);
1762             dz11             = _mm256_sub_ps(iz1,jz1);
1763             dx12             = _mm256_sub_ps(ix1,jx2);
1764             dy12             = _mm256_sub_ps(iy1,jy2);
1765             dz12             = _mm256_sub_ps(iz1,jz2);
1766             dx13             = _mm256_sub_ps(ix1,jx3);
1767             dy13             = _mm256_sub_ps(iy1,jy3);
1768             dz13             = _mm256_sub_ps(iz1,jz3);
1769             dx21             = _mm256_sub_ps(ix2,jx1);
1770             dy21             = _mm256_sub_ps(iy2,jy1);
1771             dz21             = _mm256_sub_ps(iz2,jz1);
1772             dx22             = _mm256_sub_ps(ix2,jx2);
1773             dy22             = _mm256_sub_ps(iy2,jy2);
1774             dz22             = _mm256_sub_ps(iz2,jz2);
1775             dx23             = _mm256_sub_ps(ix2,jx3);
1776             dy23             = _mm256_sub_ps(iy2,jy3);
1777             dz23             = _mm256_sub_ps(iz2,jz3);
1778             dx31             = _mm256_sub_ps(ix3,jx1);
1779             dy31             = _mm256_sub_ps(iy3,jy1);
1780             dz31             = _mm256_sub_ps(iz3,jz1);
1781             dx32             = _mm256_sub_ps(ix3,jx2);
1782             dy32             = _mm256_sub_ps(iy3,jy2);
1783             dz32             = _mm256_sub_ps(iz3,jz2);
1784             dx33             = _mm256_sub_ps(ix3,jx3);
1785             dy33             = _mm256_sub_ps(iy3,jy3);
1786             dz33             = _mm256_sub_ps(iz3,jz3);
1787
1788             /* Calculate squared distance and things based on it */
1789             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1790             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1791             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1792             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1793             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1794             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1795             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1796             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1797             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1798
1799             rinv11           = avx256_invsqrt_f(rsq11);
1800             rinv12           = avx256_invsqrt_f(rsq12);
1801             rinv13           = avx256_invsqrt_f(rsq13);
1802             rinv21           = avx256_invsqrt_f(rsq21);
1803             rinv22           = avx256_invsqrt_f(rsq22);
1804             rinv23           = avx256_invsqrt_f(rsq23);
1805             rinv31           = avx256_invsqrt_f(rsq31);
1806             rinv32           = avx256_invsqrt_f(rsq32);
1807             rinv33           = avx256_invsqrt_f(rsq33);
1808
1809             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1810             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1811             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
1812             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1813             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1814             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
1815             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
1816             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
1817             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
1818
1819             fjx1             = _mm256_setzero_ps();
1820             fjy1             = _mm256_setzero_ps();
1821             fjz1             = _mm256_setzero_ps();
1822             fjx2             = _mm256_setzero_ps();
1823             fjy2             = _mm256_setzero_ps();
1824             fjz2             = _mm256_setzero_ps();
1825             fjx3             = _mm256_setzero_ps();
1826             fjy3             = _mm256_setzero_ps();
1827             fjz3             = _mm256_setzero_ps();
1828
1829             /**************************
1830              * CALCULATE INTERACTIONS *
1831              **************************/
1832
1833             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1834             {
1835
1836             /* REACTION-FIELD ELECTROSTATICS */
1837             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
1838
1839             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1840
1841             fscal            = felec;
1842
1843             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1844
1845             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1846
1847             /* Calculate temporary vectorial force */
1848             tx               = _mm256_mul_ps(fscal,dx11);
1849             ty               = _mm256_mul_ps(fscal,dy11);
1850             tz               = _mm256_mul_ps(fscal,dz11);
1851
1852             /* Update vectorial force */
1853             fix1             = _mm256_add_ps(fix1,tx);
1854             fiy1             = _mm256_add_ps(fiy1,ty);
1855             fiz1             = _mm256_add_ps(fiz1,tz);
1856
1857             fjx1             = _mm256_add_ps(fjx1,tx);
1858             fjy1             = _mm256_add_ps(fjy1,ty);
1859             fjz1             = _mm256_add_ps(fjz1,tz);
1860
1861             }
1862
1863             /**************************
1864              * CALCULATE INTERACTIONS *
1865              **************************/
1866
1867             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1868             {
1869
1870             /* REACTION-FIELD ELECTROSTATICS */
1871             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1872
1873             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1874
1875             fscal            = felec;
1876
1877             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1878
1879             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1880
1881             /* Calculate temporary vectorial force */
1882             tx               = _mm256_mul_ps(fscal,dx12);
1883             ty               = _mm256_mul_ps(fscal,dy12);
1884             tz               = _mm256_mul_ps(fscal,dz12);
1885
1886             /* Update vectorial force */
1887             fix1             = _mm256_add_ps(fix1,tx);
1888             fiy1             = _mm256_add_ps(fiy1,ty);
1889             fiz1             = _mm256_add_ps(fiz1,tz);
1890
1891             fjx2             = _mm256_add_ps(fjx2,tx);
1892             fjy2             = _mm256_add_ps(fjy2,ty);
1893             fjz2             = _mm256_add_ps(fjz2,tz);
1894
1895             }
1896
1897             /**************************
1898              * CALCULATE INTERACTIONS *
1899              **************************/
1900
1901             if (gmx_mm256_any_lt(rsq13,rcutoff2))
1902             {
1903
1904             /* REACTION-FIELD ELECTROSTATICS */
1905             felec            = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
1906
1907             cutoff_mask      = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
1908
1909             fscal            = felec;
1910
1911             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1912
1913             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1914
1915             /* Calculate temporary vectorial force */
1916             tx               = _mm256_mul_ps(fscal,dx13);
1917             ty               = _mm256_mul_ps(fscal,dy13);
1918             tz               = _mm256_mul_ps(fscal,dz13);
1919
1920             /* Update vectorial force */
1921             fix1             = _mm256_add_ps(fix1,tx);
1922             fiy1             = _mm256_add_ps(fiy1,ty);
1923             fiz1             = _mm256_add_ps(fiz1,tz);
1924
1925             fjx3             = _mm256_add_ps(fjx3,tx);
1926             fjy3             = _mm256_add_ps(fjy3,ty);
1927             fjz3             = _mm256_add_ps(fjz3,tz);
1928
1929             }
1930
1931             /**************************
1932              * CALCULATE INTERACTIONS *
1933              **************************/
1934
1935             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1936             {
1937
1938             /* REACTION-FIELD ELECTROSTATICS */
1939             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1940
1941             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1942
1943             fscal            = felec;
1944
1945             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1946
1947             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1948
1949             /* Calculate temporary vectorial force */
1950             tx               = _mm256_mul_ps(fscal,dx21);
1951             ty               = _mm256_mul_ps(fscal,dy21);
1952             tz               = _mm256_mul_ps(fscal,dz21);
1953
1954             /* Update vectorial force */
1955             fix2             = _mm256_add_ps(fix2,tx);
1956             fiy2             = _mm256_add_ps(fiy2,ty);
1957             fiz2             = _mm256_add_ps(fiz2,tz);
1958
1959             fjx1             = _mm256_add_ps(fjx1,tx);
1960             fjy1             = _mm256_add_ps(fjy1,ty);
1961             fjz1             = _mm256_add_ps(fjz1,tz);
1962
1963             }
1964
1965             /**************************
1966              * CALCULATE INTERACTIONS *
1967              **************************/
1968
1969             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1970             {
1971
1972             /* REACTION-FIELD ELECTROSTATICS */
1973             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1974
1975             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1976
1977             fscal            = felec;
1978
1979             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1980
1981             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1982
1983             /* Calculate temporary vectorial force */
1984             tx               = _mm256_mul_ps(fscal,dx22);
1985             ty               = _mm256_mul_ps(fscal,dy22);
1986             tz               = _mm256_mul_ps(fscal,dz22);
1987
1988             /* Update vectorial force */
1989             fix2             = _mm256_add_ps(fix2,tx);
1990             fiy2             = _mm256_add_ps(fiy2,ty);
1991             fiz2             = _mm256_add_ps(fiz2,tz);
1992
1993             fjx2             = _mm256_add_ps(fjx2,tx);
1994             fjy2             = _mm256_add_ps(fjy2,ty);
1995             fjz2             = _mm256_add_ps(fjz2,tz);
1996
1997             }
1998
1999             /**************************
2000              * CALCULATE INTERACTIONS *
2001              **************************/
2002
2003             if (gmx_mm256_any_lt(rsq23,rcutoff2))
2004             {
2005
2006             /* REACTION-FIELD ELECTROSTATICS */
2007             felec            = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
2008
2009             cutoff_mask      = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
2010
2011             fscal            = felec;
2012
2013             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2014
2015             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2016
2017             /* Calculate temporary vectorial force */
2018             tx               = _mm256_mul_ps(fscal,dx23);
2019             ty               = _mm256_mul_ps(fscal,dy23);
2020             tz               = _mm256_mul_ps(fscal,dz23);
2021
2022             /* Update vectorial force */
2023             fix2             = _mm256_add_ps(fix2,tx);
2024             fiy2             = _mm256_add_ps(fiy2,ty);
2025             fiz2             = _mm256_add_ps(fiz2,tz);
2026
2027             fjx3             = _mm256_add_ps(fjx3,tx);
2028             fjy3             = _mm256_add_ps(fjy3,ty);
2029             fjz3             = _mm256_add_ps(fjz3,tz);
2030
2031             }
2032
2033             /**************************
2034              * CALCULATE INTERACTIONS *
2035              **************************/
2036
2037             if (gmx_mm256_any_lt(rsq31,rcutoff2))
2038             {
2039
2040             /* REACTION-FIELD ELECTROSTATICS */
2041             felec            = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
2042
2043             cutoff_mask      = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
2044
2045             fscal            = felec;
2046
2047             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2048
2049             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2050
2051             /* Calculate temporary vectorial force */
2052             tx               = _mm256_mul_ps(fscal,dx31);
2053             ty               = _mm256_mul_ps(fscal,dy31);
2054             tz               = _mm256_mul_ps(fscal,dz31);
2055
2056             /* Update vectorial force */
2057             fix3             = _mm256_add_ps(fix3,tx);
2058             fiy3             = _mm256_add_ps(fiy3,ty);
2059             fiz3             = _mm256_add_ps(fiz3,tz);
2060
2061             fjx1             = _mm256_add_ps(fjx1,tx);
2062             fjy1             = _mm256_add_ps(fjy1,ty);
2063             fjz1             = _mm256_add_ps(fjz1,tz);
2064
2065             }
2066
2067             /**************************
2068              * CALCULATE INTERACTIONS *
2069              **************************/
2070
2071             if (gmx_mm256_any_lt(rsq32,rcutoff2))
2072             {
2073
2074             /* REACTION-FIELD ELECTROSTATICS */
2075             felec            = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
2076
2077             cutoff_mask      = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
2078
2079             fscal            = felec;
2080
2081             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2082
2083             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2084
2085             /* Calculate temporary vectorial force */
2086             tx               = _mm256_mul_ps(fscal,dx32);
2087             ty               = _mm256_mul_ps(fscal,dy32);
2088             tz               = _mm256_mul_ps(fscal,dz32);
2089
2090             /* Update vectorial force */
2091             fix3             = _mm256_add_ps(fix3,tx);
2092             fiy3             = _mm256_add_ps(fiy3,ty);
2093             fiz3             = _mm256_add_ps(fiz3,tz);
2094
2095             fjx2             = _mm256_add_ps(fjx2,tx);
2096             fjy2             = _mm256_add_ps(fjy2,ty);
2097             fjz2             = _mm256_add_ps(fjz2,tz);
2098
2099             }
2100
2101             /**************************
2102              * CALCULATE INTERACTIONS *
2103              **************************/
2104
2105             if (gmx_mm256_any_lt(rsq33,rcutoff2))
2106             {
2107
2108             /* REACTION-FIELD ELECTROSTATICS */
2109             felec            = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
2110
2111             cutoff_mask      = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
2112
2113             fscal            = felec;
2114
2115             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2116
2117             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2118
2119             /* Calculate temporary vectorial force */
2120             tx               = _mm256_mul_ps(fscal,dx33);
2121             ty               = _mm256_mul_ps(fscal,dy33);
2122             tz               = _mm256_mul_ps(fscal,dz33);
2123
2124             /* Update vectorial force */
2125             fix3             = _mm256_add_ps(fix3,tx);
2126             fiy3             = _mm256_add_ps(fiy3,ty);
2127             fiz3             = _mm256_add_ps(fiz3,tz);
2128
2129             fjx3             = _mm256_add_ps(fjx3,tx);
2130             fjy3             = _mm256_add_ps(fjy3,ty);
2131             fjz3             = _mm256_add_ps(fjz3,tz);
2132
2133             }
2134
2135             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2136             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2137             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2138             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2139             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2140             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2141             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2142             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2143
2144             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2145                                                       fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
2146                                                       fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2147
2148             /* Inner loop uses 270 flops */
2149         }
2150
2151         /* End of innermost loop */
2152
2153         gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2154                                                  f+i_coord_offset+DIM,fshift+i_shift_offset);
2155
2156         /* Increment number of inner iterations */
2157         inneriter                  += j_index_end - j_index_start;
2158
2159         /* Outer loop uses 18 flops */
2160     }
2161
2162     /* Increment number of outer iterations */
2163     outeriter        += nri;
2164
2165     /* Update outer/inner flops */
2166
2167     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*270);
2168 }