Created SIMD module
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_single / nb_kernel_ElecRFCut_VdwNone_GeomW4W4_avx_256_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_single kernel generator.
37  */
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "vec.h"
47 #include "nrnb.h"
48
49 #include "gromacs/simd/math_x86_avx_256_single.h"
50 #include "kernelutil_x86_avx_256_single.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_avx_256_single
54  * Electrostatics interaction: ReactionField
55  * VdW interaction:            None
56  * Geometry:                   Water4-Water4
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_avx_256_single
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
70      * just 0 for non-waters.
71      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB,jnrC,jnrD;
77     int              jnrE,jnrF,jnrG,jnrH;
78     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
79     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
80     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
81     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
82     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
83     real             rcutoff_scalar;
84     real             *shiftvec,*fshift,*x,*f;
85     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
86     real             scratch[4*DIM];
87     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
88     real *           vdwioffsetptr1;
89     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90     real *           vdwioffsetptr2;
91     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92     real *           vdwioffsetptr3;
93     __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
94     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
95     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
96     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
97     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
98     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
99     __m256           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
100     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102     __m256           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
103     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105     __m256           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
106     __m256           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
107     __m256           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
108     __m256           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
109     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
110     real             *charge;
111     __m256           dummy_mask,cutoff_mask;
112     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
113     __m256           one     = _mm256_set1_ps(1.0);
114     __m256           two     = _mm256_set1_ps(2.0);
115     x                = xx[0];
116     f                = ff[0];
117
118     nri              = nlist->nri;
119     iinr             = nlist->iinr;
120     jindex           = nlist->jindex;
121     jjnr             = nlist->jjnr;
122     shiftidx         = nlist->shift;
123     gid              = nlist->gid;
124     shiftvec         = fr->shift_vec[0];
125     fshift           = fr->fshift[0];
126     facel            = _mm256_set1_ps(fr->epsfac);
127     charge           = mdatoms->chargeA;
128     krf              = _mm256_set1_ps(fr->ic->k_rf);
129     krf2             = _mm256_set1_ps(fr->ic->k_rf*2.0);
130     crf              = _mm256_set1_ps(fr->ic->c_rf);
131
132     /* Setup water-specific parameters */
133     inr              = nlist->iinr[0];
134     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
135     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
136     iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
137
138     jq1              = _mm256_set1_ps(charge[inr+1]);
139     jq2              = _mm256_set1_ps(charge[inr+2]);
140     jq3              = _mm256_set1_ps(charge[inr+3]);
141     qq11             = _mm256_mul_ps(iq1,jq1);
142     qq12             = _mm256_mul_ps(iq1,jq2);
143     qq13             = _mm256_mul_ps(iq1,jq3);
144     qq21             = _mm256_mul_ps(iq2,jq1);
145     qq22             = _mm256_mul_ps(iq2,jq2);
146     qq23             = _mm256_mul_ps(iq2,jq3);
147     qq31             = _mm256_mul_ps(iq3,jq1);
148     qq32             = _mm256_mul_ps(iq3,jq2);
149     qq33             = _mm256_mul_ps(iq3,jq3);
150
151     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
152     rcutoff_scalar   = fr->rcoulomb;
153     rcutoff          = _mm256_set1_ps(rcutoff_scalar);
154     rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
155
156     /* Avoid stupid compiler warnings */
157     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
158     j_coord_offsetA = 0;
159     j_coord_offsetB = 0;
160     j_coord_offsetC = 0;
161     j_coord_offsetD = 0;
162     j_coord_offsetE = 0;
163     j_coord_offsetF = 0;
164     j_coord_offsetG = 0;
165     j_coord_offsetH = 0;
166
167     outeriter        = 0;
168     inneriter        = 0;
169
170     for(iidx=0;iidx<4*DIM;iidx++)
171     {
172         scratch[iidx] = 0.0;
173     }
174
175     /* Start outer loop over neighborlists */
176     for(iidx=0; iidx<nri; iidx++)
177     {
178         /* Load shift vector for this list */
179         i_shift_offset   = DIM*shiftidx[iidx];
180
181         /* Load limits for loop over neighbors */
182         j_index_start    = jindex[iidx];
183         j_index_end      = jindex[iidx+1];
184
185         /* Get outer coordinate index */
186         inr              = iinr[iidx];
187         i_coord_offset   = DIM*inr;
188
189         /* Load i particle coords and add shift vector */
190         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
191                                                     &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
192
193         fix1             = _mm256_setzero_ps();
194         fiy1             = _mm256_setzero_ps();
195         fiz1             = _mm256_setzero_ps();
196         fix2             = _mm256_setzero_ps();
197         fiy2             = _mm256_setzero_ps();
198         fiz2             = _mm256_setzero_ps();
199         fix3             = _mm256_setzero_ps();
200         fiy3             = _mm256_setzero_ps();
201         fiz3             = _mm256_setzero_ps();
202
203         /* Reset potential sums */
204         velecsum         = _mm256_setzero_ps();
205
206         /* Start inner kernel loop */
207         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
208         {
209
210             /* Get j neighbor index, and coordinate index */
211             jnrA             = jjnr[jidx];
212             jnrB             = jjnr[jidx+1];
213             jnrC             = jjnr[jidx+2];
214             jnrD             = jjnr[jidx+3];
215             jnrE             = jjnr[jidx+4];
216             jnrF             = jjnr[jidx+5];
217             jnrG             = jjnr[jidx+6];
218             jnrH             = jjnr[jidx+7];
219             j_coord_offsetA  = DIM*jnrA;
220             j_coord_offsetB  = DIM*jnrB;
221             j_coord_offsetC  = DIM*jnrC;
222             j_coord_offsetD  = DIM*jnrD;
223             j_coord_offsetE  = DIM*jnrE;
224             j_coord_offsetF  = DIM*jnrF;
225             j_coord_offsetG  = DIM*jnrG;
226             j_coord_offsetH  = DIM*jnrH;
227
228             /* load j atom coordinates */
229             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
230                                                  x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
231                                                  x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
232                                                  x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
233                                                  &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
234
235             /* Calculate displacement vector */
236             dx11             = _mm256_sub_ps(ix1,jx1);
237             dy11             = _mm256_sub_ps(iy1,jy1);
238             dz11             = _mm256_sub_ps(iz1,jz1);
239             dx12             = _mm256_sub_ps(ix1,jx2);
240             dy12             = _mm256_sub_ps(iy1,jy2);
241             dz12             = _mm256_sub_ps(iz1,jz2);
242             dx13             = _mm256_sub_ps(ix1,jx3);
243             dy13             = _mm256_sub_ps(iy1,jy3);
244             dz13             = _mm256_sub_ps(iz1,jz3);
245             dx21             = _mm256_sub_ps(ix2,jx1);
246             dy21             = _mm256_sub_ps(iy2,jy1);
247             dz21             = _mm256_sub_ps(iz2,jz1);
248             dx22             = _mm256_sub_ps(ix2,jx2);
249             dy22             = _mm256_sub_ps(iy2,jy2);
250             dz22             = _mm256_sub_ps(iz2,jz2);
251             dx23             = _mm256_sub_ps(ix2,jx3);
252             dy23             = _mm256_sub_ps(iy2,jy3);
253             dz23             = _mm256_sub_ps(iz2,jz3);
254             dx31             = _mm256_sub_ps(ix3,jx1);
255             dy31             = _mm256_sub_ps(iy3,jy1);
256             dz31             = _mm256_sub_ps(iz3,jz1);
257             dx32             = _mm256_sub_ps(ix3,jx2);
258             dy32             = _mm256_sub_ps(iy3,jy2);
259             dz32             = _mm256_sub_ps(iz3,jz2);
260             dx33             = _mm256_sub_ps(ix3,jx3);
261             dy33             = _mm256_sub_ps(iy3,jy3);
262             dz33             = _mm256_sub_ps(iz3,jz3);
263
264             /* Calculate squared distance and things based on it */
265             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
266             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
267             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
268             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
269             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
270             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
271             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
272             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
273             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
274
275             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
276             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
277             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
278             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
279             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
280             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
281             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
282             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
283             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
284
285             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
286             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
287             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
288             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
289             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
290             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
291             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
292             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
293             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
294
295             fjx1             = _mm256_setzero_ps();
296             fjy1             = _mm256_setzero_ps();
297             fjz1             = _mm256_setzero_ps();
298             fjx2             = _mm256_setzero_ps();
299             fjy2             = _mm256_setzero_ps();
300             fjz2             = _mm256_setzero_ps();
301             fjx3             = _mm256_setzero_ps();
302             fjy3             = _mm256_setzero_ps();
303             fjz3             = _mm256_setzero_ps();
304
305             /**************************
306              * CALCULATE INTERACTIONS *
307              **************************/
308
309             if (gmx_mm256_any_lt(rsq11,rcutoff2))
310             {
311
312             /* REACTION-FIELD ELECTROSTATICS */
313             velec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
314             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
315
316             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
317
318             /* Update potential sum for this i atom from the interaction with this j atom. */
319             velec            = _mm256_and_ps(velec,cutoff_mask);
320             velecsum         = _mm256_add_ps(velecsum,velec);
321
322             fscal            = felec;
323
324             fscal            = _mm256_and_ps(fscal,cutoff_mask);
325
326             /* Calculate temporary vectorial force */
327             tx               = _mm256_mul_ps(fscal,dx11);
328             ty               = _mm256_mul_ps(fscal,dy11);
329             tz               = _mm256_mul_ps(fscal,dz11);
330
331             /* Update vectorial force */
332             fix1             = _mm256_add_ps(fix1,tx);
333             fiy1             = _mm256_add_ps(fiy1,ty);
334             fiz1             = _mm256_add_ps(fiz1,tz);
335
336             fjx1             = _mm256_add_ps(fjx1,tx);
337             fjy1             = _mm256_add_ps(fjy1,ty);
338             fjz1             = _mm256_add_ps(fjz1,tz);
339
340             }
341
342             /**************************
343              * CALCULATE INTERACTIONS *
344              **************************/
345
346             if (gmx_mm256_any_lt(rsq12,rcutoff2))
347             {
348
349             /* REACTION-FIELD ELECTROSTATICS */
350             velec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
351             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
352
353             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
354
355             /* Update potential sum for this i atom from the interaction with this j atom. */
356             velec            = _mm256_and_ps(velec,cutoff_mask);
357             velecsum         = _mm256_add_ps(velecsum,velec);
358
359             fscal            = felec;
360
361             fscal            = _mm256_and_ps(fscal,cutoff_mask);
362
363             /* Calculate temporary vectorial force */
364             tx               = _mm256_mul_ps(fscal,dx12);
365             ty               = _mm256_mul_ps(fscal,dy12);
366             tz               = _mm256_mul_ps(fscal,dz12);
367
368             /* Update vectorial force */
369             fix1             = _mm256_add_ps(fix1,tx);
370             fiy1             = _mm256_add_ps(fiy1,ty);
371             fiz1             = _mm256_add_ps(fiz1,tz);
372
373             fjx2             = _mm256_add_ps(fjx2,tx);
374             fjy2             = _mm256_add_ps(fjy2,ty);
375             fjz2             = _mm256_add_ps(fjz2,tz);
376
377             }
378
379             /**************************
380              * CALCULATE INTERACTIONS *
381              **************************/
382
383             if (gmx_mm256_any_lt(rsq13,rcutoff2))
384             {
385
386             /* REACTION-FIELD ELECTROSTATICS */
387             velec            = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_add_ps(rinv13,_mm256_mul_ps(krf,rsq13)),crf));
388             felec            = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
389
390             cutoff_mask      = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
391
392             /* Update potential sum for this i atom from the interaction with this j atom. */
393             velec            = _mm256_and_ps(velec,cutoff_mask);
394             velecsum         = _mm256_add_ps(velecsum,velec);
395
396             fscal            = felec;
397
398             fscal            = _mm256_and_ps(fscal,cutoff_mask);
399
400             /* Calculate temporary vectorial force */
401             tx               = _mm256_mul_ps(fscal,dx13);
402             ty               = _mm256_mul_ps(fscal,dy13);
403             tz               = _mm256_mul_ps(fscal,dz13);
404
405             /* Update vectorial force */
406             fix1             = _mm256_add_ps(fix1,tx);
407             fiy1             = _mm256_add_ps(fiy1,ty);
408             fiz1             = _mm256_add_ps(fiz1,tz);
409
410             fjx3             = _mm256_add_ps(fjx3,tx);
411             fjy3             = _mm256_add_ps(fjy3,ty);
412             fjz3             = _mm256_add_ps(fjz3,tz);
413
414             }
415
416             /**************************
417              * CALCULATE INTERACTIONS *
418              **************************/
419
420             if (gmx_mm256_any_lt(rsq21,rcutoff2))
421             {
422
423             /* REACTION-FIELD ELECTROSTATICS */
424             velec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
425             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
426
427             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
428
429             /* Update potential sum for this i atom from the interaction with this j atom. */
430             velec            = _mm256_and_ps(velec,cutoff_mask);
431             velecsum         = _mm256_add_ps(velecsum,velec);
432
433             fscal            = felec;
434
435             fscal            = _mm256_and_ps(fscal,cutoff_mask);
436
437             /* Calculate temporary vectorial force */
438             tx               = _mm256_mul_ps(fscal,dx21);
439             ty               = _mm256_mul_ps(fscal,dy21);
440             tz               = _mm256_mul_ps(fscal,dz21);
441
442             /* Update vectorial force */
443             fix2             = _mm256_add_ps(fix2,tx);
444             fiy2             = _mm256_add_ps(fiy2,ty);
445             fiz2             = _mm256_add_ps(fiz2,tz);
446
447             fjx1             = _mm256_add_ps(fjx1,tx);
448             fjy1             = _mm256_add_ps(fjy1,ty);
449             fjz1             = _mm256_add_ps(fjz1,tz);
450
451             }
452
453             /**************************
454              * CALCULATE INTERACTIONS *
455              **************************/
456
457             if (gmx_mm256_any_lt(rsq22,rcutoff2))
458             {
459
460             /* REACTION-FIELD ELECTROSTATICS */
461             velec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
462             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
463
464             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
465
466             /* Update potential sum for this i atom from the interaction with this j atom. */
467             velec            = _mm256_and_ps(velec,cutoff_mask);
468             velecsum         = _mm256_add_ps(velecsum,velec);
469
470             fscal            = felec;
471
472             fscal            = _mm256_and_ps(fscal,cutoff_mask);
473
474             /* Calculate temporary vectorial force */
475             tx               = _mm256_mul_ps(fscal,dx22);
476             ty               = _mm256_mul_ps(fscal,dy22);
477             tz               = _mm256_mul_ps(fscal,dz22);
478
479             /* Update vectorial force */
480             fix2             = _mm256_add_ps(fix2,tx);
481             fiy2             = _mm256_add_ps(fiy2,ty);
482             fiz2             = _mm256_add_ps(fiz2,tz);
483
484             fjx2             = _mm256_add_ps(fjx2,tx);
485             fjy2             = _mm256_add_ps(fjy2,ty);
486             fjz2             = _mm256_add_ps(fjz2,tz);
487
488             }
489
490             /**************************
491              * CALCULATE INTERACTIONS *
492              **************************/
493
494             if (gmx_mm256_any_lt(rsq23,rcutoff2))
495             {
496
497             /* REACTION-FIELD ELECTROSTATICS */
498             velec            = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_add_ps(rinv23,_mm256_mul_ps(krf,rsq23)),crf));
499             felec            = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
500
501             cutoff_mask      = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
502
503             /* Update potential sum for this i atom from the interaction with this j atom. */
504             velec            = _mm256_and_ps(velec,cutoff_mask);
505             velecsum         = _mm256_add_ps(velecsum,velec);
506
507             fscal            = felec;
508
509             fscal            = _mm256_and_ps(fscal,cutoff_mask);
510
511             /* Calculate temporary vectorial force */
512             tx               = _mm256_mul_ps(fscal,dx23);
513             ty               = _mm256_mul_ps(fscal,dy23);
514             tz               = _mm256_mul_ps(fscal,dz23);
515
516             /* Update vectorial force */
517             fix2             = _mm256_add_ps(fix2,tx);
518             fiy2             = _mm256_add_ps(fiy2,ty);
519             fiz2             = _mm256_add_ps(fiz2,tz);
520
521             fjx3             = _mm256_add_ps(fjx3,tx);
522             fjy3             = _mm256_add_ps(fjy3,ty);
523             fjz3             = _mm256_add_ps(fjz3,tz);
524
525             }
526
527             /**************************
528              * CALCULATE INTERACTIONS *
529              **************************/
530
531             if (gmx_mm256_any_lt(rsq31,rcutoff2))
532             {
533
534             /* REACTION-FIELD ELECTROSTATICS */
535             velec            = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_add_ps(rinv31,_mm256_mul_ps(krf,rsq31)),crf));
536             felec            = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
537
538             cutoff_mask      = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
539
540             /* Update potential sum for this i atom from the interaction with this j atom. */
541             velec            = _mm256_and_ps(velec,cutoff_mask);
542             velecsum         = _mm256_add_ps(velecsum,velec);
543
544             fscal            = felec;
545
546             fscal            = _mm256_and_ps(fscal,cutoff_mask);
547
548             /* Calculate temporary vectorial force */
549             tx               = _mm256_mul_ps(fscal,dx31);
550             ty               = _mm256_mul_ps(fscal,dy31);
551             tz               = _mm256_mul_ps(fscal,dz31);
552
553             /* Update vectorial force */
554             fix3             = _mm256_add_ps(fix3,tx);
555             fiy3             = _mm256_add_ps(fiy3,ty);
556             fiz3             = _mm256_add_ps(fiz3,tz);
557
558             fjx1             = _mm256_add_ps(fjx1,tx);
559             fjy1             = _mm256_add_ps(fjy1,ty);
560             fjz1             = _mm256_add_ps(fjz1,tz);
561
562             }
563
564             /**************************
565              * CALCULATE INTERACTIONS *
566              **************************/
567
568             if (gmx_mm256_any_lt(rsq32,rcutoff2))
569             {
570
571             /* REACTION-FIELD ELECTROSTATICS */
572             velec            = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_add_ps(rinv32,_mm256_mul_ps(krf,rsq32)),crf));
573             felec            = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
574
575             cutoff_mask      = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
576
577             /* Update potential sum for this i atom from the interaction with this j atom. */
578             velec            = _mm256_and_ps(velec,cutoff_mask);
579             velecsum         = _mm256_add_ps(velecsum,velec);
580
581             fscal            = felec;
582
583             fscal            = _mm256_and_ps(fscal,cutoff_mask);
584
585             /* Calculate temporary vectorial force */
586             tx               = _mm256_mul_ps(fscal,dx32);
587             ty               = _mm256_mul_ps(fscal,dy32);
588             tz               = _mm256_mul_ps(fscal,dz32);
589
590             /* Update vectorial force */
591             fix3             = _mm256_add_ps(fix3,tx);
592             fiy3             = _mm256_add_ps(fiy3,ty);
593             fiz3             = _mm256_add_ps(fiz3,tz);
594
595             fjx2             = _mm256_add_ps(fjx2,tx);
596             fjy2             = _mm256_add_ps(fjy2,ty);
597             fjz2             = _mm256_add_ps(fjz2,tz);
598
599             }
600
601             /**************************
602              * CALCULATE INTERACTIONS *
603              **************************/
604
605             if (gmx_mm256_any_lt(rsq33,rcutoff2))
606             {
607
608             /* REACTION-FIELD ELECTROSTATICS */
609             velec            = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_add_ps(rinv33,_mm256_mul_ps(krf,rsq33)),crf));
610             felec            = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
611
612             cutoff_mask      = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
613
614             /* Update potential sum for this i atom from the interaction with this j atom. */
615             velec            = _mm256_and_ps(velec,cutoff_mask);
616             velecsum         = _mm256_add_ps(velecsum,velec);
617
618             fscal            = felec;
619
620             fscal            = _mm256_and_ps(fscal,cutoff_mask);
621
622             /* Calculate temporary vectorial force */
623             tx               = _mm256_mul_ps(fscal,dx33);
624             ty               = _mm256_mul_ps(fscal,dy33);
625             tz               = _mm256_mul_ps(fscal,dz33);
626
627             /* Update vectorial force */
628             fix3             = _mm256_add_ps(fix3,tx);
629             fiy3             = _mm256_add_ps(fiy3,ty);
630             fiz3             = _mm256_add_ps(fiz3,tz);
631
632             fjx3             = _mm256_add_ps(fjx3,tx);
633             fjy3             = _mm256_add_ps(fjy3,ty);
634             fjz3             = _mm256_add_ps(fjz3,tz);
635
636             }
637
638             fjptrA             = f+j_coord_offsetA;
639             fjptrB             = f+j_coord_offsetB;
640             fjptrC             = f+j_coord_offsetC;
641             fjptrD             = f+j_coord_offsetD;
642             fjptrE             = f+j_coord_offsetE;
643             fjptrF             = f+j_coord_offsetF;
644             fjptrG             = f+j_coord_offsetG;
645             fjptrH             = f+j_coord_offsetH;
646
647             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
648                                                       fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
649                                                       fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
650
651             /* Inner loop uses 324 flops */
652         }
653
654         if(jidx<j_index_end)
655         {
656
657             /* Get j neighbor index, and coordinate index */
658             jnrlistA         = jjnr[jidx];
659             jnrlistB         = jjnr[jidx+1];
660             jnrlistC         = jjnr[jidx+2];
661             jnrlistD         = jjnr[jidx+3];
662             jnrlistE         = jjnr[jidx+4];
663             jnrlistF         = jjnr[jidx+5];
664             jnrlistG         = jjnr[jidx+6];
665             jnrlistH         = jjnr[jidx+7];
666             /* Sign of each element will be negative for non-real atoms.
667              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
668              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
669              */
670             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
671                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
672                                             
673             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
674             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
675             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
676             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
677             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
678             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
679             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
680             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
681             j_coord_offsetA  = DIM*jnrA;
682             j_coord_offsetB  = DIM*jnrB;
683             j_coord_offsetC  = DIM*jnrC;
684             j_coord_offsetD  = DIM*jnrD;
685             j_coord_offsetE  = DIM*jnrE;
686             j_coord_offsetF  = DIM*jnrF;
687             j_coord_offsetG  = DIM*jnrG;
688             j_coord_offsetH  = DIM*jnrH;
689
690             /* load j atom coordinates */
691             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
692                                                  x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
693                                                  x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
694                                                  x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
695                                                  &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
696
697             /* Calculate displacement vector */
698             dx11             = _mm256_sub_ps(ix1,jx1);
699             dy11             = _mm256_sub_ps(iy1,jy1);
700             dz11             = _mm256_sub_ps(iz1,jz1);
701             dx12             = _mm256_sub_ps(ix1,jx2);
702             dy12             = _mm256_sub_ps(iy1,jy2);
703             dz12             = _mm256_sub_ps(iz1,jz2);
704             dx13             = _mm256_sub_ps(ix1,jx3);
705             dy13             = _mm256_sub_ps(iy1,jy3);
706             dz13             = _mm256_sub_ps(iz1,jz3);
707             dx21             = _mm256_sub_ps(ix2,jx1);
708             dy21             = _mm256_sub_ps(iy2,jy1);
709             dz21             = _mm256_sub_ps(iz2,jz1);
710             dx22             = _mm256_sub_ps(ix2,jx2);
711             dy22             = _mm256_sub_ps(iy2,jy2);
712             dz22             = _mm256_sub_ps(iz2,jz2);
713             dx23             = _mm256_sub_ps(ix2,jx3);
714             dy23             = _mm256_sub_ps(iy2,jy3);
715             dz23             = _mm256_sub_ps(iz2,jz3);
716             dx31             = _mm256_sub_ps(ix3,jx1);
717             dy31             = _mm256_sub_ps(iy3,jy1);
718             dz31             = _mm256_sub_ps(iz3,jz1);
719             dx32             = _mm256_sub_ps(ix3,jx2);
720             dy32             = _mm256_sub_ps(iy3,jy2);
721             dz32             = _mm256_sub_ps(iz3,jz2);
722             dx33             = _mm256_sub_ps(ix3,jx3);
723             dy33             = _mm256_sub_ps(iy3,jy3);
724             dz33             = _mm256_sub_ps(iz3,jz3);
725
726             /* Calculate squared distance and things based on it */
727             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
728             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
729             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
730             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
731             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
732             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
733             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
734             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
735             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
736
737             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
738             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
739             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
740             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
741             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
742             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
743             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
744             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
745             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
746
747             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
748             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
749             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
750             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
751             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
752             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
753             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
754             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
755             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
756
757             fjx1             = _mm256_setzero_ps();
758             fjy1             = _mm256_setzero_ps();
759             fjz1             = _mm256_setzero_ps();
760             fjx2             = _mm256_setzero_ps();
761             fjy2             = _mm256_setzero_ps();
762             fjz2             = _mm256_setzero_ps();
763             fjx3             = _mm256_setzero_ps();
764             fjy3             = _mm256_setzero_ps();
765             fjz3             = _mm256_setzero_ps();
766
767             /**************************
768              * CALCULATE INTERACTIONS *
769              **************************/
770
771             if (gmx_mm256_any_lt(rsq11,rcutoff2))
772             {
773
774             /* REACTION-FIELD ELECTROSTATICS */
775             velec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
776             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
777
778             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
779
780             /* Update potential sum for this i atom from the interaction with this j atom. */
781             velec            = _mm256_and_ps(velec,cutoff_mask);
782             velec            = _mm256_andnot_ps(dummy_mask,velec);
783             velecsum         = _mm256_add_ps(velecsum,velec);
784
785             fscal            = felec;
786
787             fscal            = _mm256_and_ps(fscal,cutoff_mask);
788
789             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
790
791             /* Calculate temporary vectorial force */
792             tx               = _mm256_mul_ps(fscal,dx11);
793             ty               = _mm256_mul_ps(fscal,dy11);
794             tz               = _mm256_mul_ps(fscal,dz11);
795
796             /* Update vectorial force */
797             fix1             = _mm256_add_ps(fix1,tx);
798             fiy1             = _mm256_add_ps(fiy1,ty);
799             fiz1             = _mm256_add_ps(fiz1,tz);
800
801             fjx1             = _mm256_add_ps(fjx1,tx);
802             fjy1             = _mm256_add_ps(fjy1,ty);
803             fjz1             = _mm256_add_ps(fjz1,tz);
804
805             }
806
807             /**************************
808              * CALCULATE INTERACTIONS *
809              **************************/
810
811             if (gmx_mm256_any_lt(rsq12,rcutoff2))
812             {
813
814             /* REACTION-FIELD ELECTROSTATICS */
815             velec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
816             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
817
818             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
819
820             /* Update potential sum for this i atom from the interaction with this j atom. */
821             velec            = _mm256_and_ps(velec,cutoff_mask);
822             velec            = _mm256_andnot_ps(dummy_mask,velec);
823             velecsum         = _mm256_add_ps(velecsum,velec);
824
825             fscal            = felec;
826
827             fscal            = _mm256_and_ps(fscal,cutoff_mask);
828
829             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
830
831             /* Calculate temporary vectorial force */
832             tx               = _mm256_mul_ps(fscal,dx12);
833             ty               = _mm256_mul_ps(fscal,dy12);
834             tz               = _mm256_mul_ps(fscal,dz12);
835
836             /* Update vectorial force */
837             fix1             = _mm256_add_ps(fix1,tx);
838             fiy1             = _mm256_add_ps(fiy1,ty);
839             fiz1             = _mm256_add_ps(fiz1,tz);
840
841             fjx2             = _mm256_add_ps(fjx2,tx);
842             fjy2             = _mm256_add_ps(fjy2,ty);
843             fjz2             = _mm256_add_ps(fjz2,tz);
844
845             }
846
847             /**************************
848              * CALCULATE INTERACTIONS *
849              **************************/
850
851             if (gmx_mm256_any_lt(rsq13,rcutoff2))
852             {
853
854             /* REACTION-FIELD ELECTROSTATICS */
855             velec            = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_add_ps(rinv13,_mm256_mul_ps(krf,rsq13)),crf));
856             felec            = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
857
858             cutoff_mask      = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
859
860             /* Update potential sum for this i atom from the interaction with this j atom. */
861             velec            = _mm256_and_ps(velec,cutoff_mask);
862             velec            = _mm256_andnot_ps(dummy_mask,velec);
863             velecsum         = _mm256_add_ps(velecsum,velec);
864
865             fscal            = felec;
866
867             fscal            = _mm256_and_ps(fscal,cutoff_mask);
868
869             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
870
871             /* Calculate temporary vectorial force */
872             tx               = _mm256_mul_ps(fscal,dx13);
873             ty               = _mm256_mul_ps(fscal,dy13);
874             tz               = _mm256_mul_ps(fscal,dz13);
875
876             /* Update vectorial force */
877             fix1             = _mm256_add_ps(fix1,tx);
878             fiy1             = _mm256_add_ps(fiy1,ty);
879             fiz1             = _mm256_add_ps(fiz1,tz);
880
881             fjx3             = _mm256_add_ps(fjx3,tx);
882             fjy3             = _mm256_add_ps(fjy3,ty);
883             fjz3             = _mm256_add_ps(fjz3,tz);
884
885             }
886
887             /**************************
888              * CALCULATE INTERACTIONS *
889              **************************/
890
891             if (gmx_mm256_any_lt(rsq21,rcutoff2))
892             {
893
894             /* REACTION-FIELD ELECTROSTATICS */
895             velec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
896             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
897
898             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
899
900             /* Update potential sum for this i atom from the interaction with this j atom. */
901             velec            = _mm256_and_ps(velec,cutoff_mask);
902             velec            = _mm256_andnot_ps(dummy_mask,velec);
903             velecsum         = _mm256_add_ps(velecsum,velec);
904
905             fscal            = felec;
906
907             fscal            = _mm256_and_ps(fscal,cutoff_mask);
908
909             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
910
911             /* Calculate temporary vectorial force */
912             tx               = _mm256_mul_ps(fscal,dx21);
913             ty               = _mm256_mul_ps(fscal,dy21);
914             tz               = _mm256_mul_ps(fscal,dz21);
915
916             /* Update vectorial force */
917             fix2             = _mm256_add_ps(fix2,tx);
918             fiy2             = _mm256_add_ps(fiy2,ty);
919             fiz2             = _mm256_add_ps(fiz2,tz);
920
921             fjx1             = _mm256_add_ps(fjx1,tx);
922             fjy1             = _mm256_add_ps(fjy1,ty);
923             fjz1             = _mm256_add_ps(fjz1,tz);
924
925             }
926
927             /**************************
928              * CALCULATE INTERACTIONS *
929              **************************/
930
931             if (gmx_mm256_any_lt(rsq22,rcutoff2))
932             {
933
934             /* REACTION-FIELD ELECTROSTATICS */
935             velec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
936             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
937
938             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
939
940             /* Update potential sum for this i atom from the interaction with this j atom. */
941             velec            = _mm256_and_ps(velec,cutoff_mask);
942             velec            = _mm256_andnot_ps(dummy_mask,velec);
943             velecsum         = _mm256_add_ps(velecsum,velec);
944
945             fscal            = felec;
946
947             fscal            = _mm256_and_ps(fscal,cutoff_mask);
948
949             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
950
951             /* Calculate temporary vectorial force */
952             tx               = _mm256_mul_ps(fscal,dx22);
953             ty               = _mm256_mul_ps(fscal,dy22);
954             tz               = _mm256_mul_ps(fscal,dz22);
955
956             /* Update vectorial force */
957             fix2             = _mm256_add_ps(fix2,tx);
958             fiy2             = _mm256_add_ps(fiy2,ty);
959             fiz2             = _mm256_add_ps(fiz2,tz);
960
961             fjx2             = _mm256_add_ps(fjx2,tx);
962             fjy2             = _mm256_add_ps(fjy2,ty);
963             fjz2             = _mm256_add_ps(fjz2,tz);
964
965             }
966
967             /**************************
968              * CALCULATE INTERACTIONS *
969              **************************/
970
971             if (gmx_mm256_any_lt(rsq23,rcutoff2))
972             {
973
974             /* REACTION-FIELD ELECTROSTATICS */
975             velec            = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_add_ps(rinv23,_mm256_mul_ps(krf,rsq23)),crf));
976             felec            = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
977
978             cutoff_mask      = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
979
980             /* Update potential sum for this i atom from the interaction with this j atom. */
981             velec            = _mm256_and_ps(velec,cutoff_mask);
982             velec            = _mm256_andnot_ps(dummy_mask,velec);
983             velecsum         = _mm256_add_ps(velecsum,velec);
984
985             fscal            = felec;
986
987             fscal            = _mm256_and_ps(fscal,cutoff_mask);
988
989             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
990
991             /* Calculate temporary vectorial force */
992             tx               = _mm256_mul_ps(fscal,dx23);
993             ty               = _mm256_mul_ps(fscal,dy23);
994             tz               = _mm256_mul_ps(fscal,dz23);
995
996             /* Update vectorial force */
997             fix2             = _mm256_add_ps(fix2,tx);
998             fiy2             = _mm256_add_ps(fiy2,ty);
999             fiz2             = _mm256_add_ps(fiz2,tz);
1000
1001             fjx3             = _mm256_add_ps(fjx3,tx);
1002             fjy3             = _mm256_add_ps(fjy3,ty);
1003             fjz3             = _mm256_add_ps(fjz3,tz);
1004
1005             }
1006
1007             /**************************
1008              * CALCULATE INTERACTIONS *
1009              **************************/
1010
1011             if (gmx_mm256_any_lt(rsq31,rcutoff2))
1012             {
1013
1014             /* REACTION-FIELD ELECTROSTATICS */
1015             velec            = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_add_ps(rinv31,_mm256_mul_ps(krf,rsq31)),crf));
1016             felec            = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
1017
1018             cutoff_mask      = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
1019
1020             /* Update potential sum for this i atom from the interaction with this j atom. */
1021             velec            = _mm256_and_ps(velec,cutoff_mask);
1022             velec            = _mm256_andnot_ps(dummy_mask,velec);
1023             velecsum         = _mm256_add_ps(velecsum,velec);
1024
1025             fscal            = felec;
1026
1027             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1028
1029             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1030
1031             /* Calculate temporary vectorial force */
1032             tx               = _mm256_mul_ps(fscal,dx31);
1033             ty               = _mm256_mul_ps(fscal,dy31);
1034             tz               = _mm256_mul_ps(fscal,dz31);
1035
1036             /* Update vectorial force */
1037             fix3             = _mm256_add_ps(fix3,tx);
1038             fiy3             = _mm256_add_ps(fiy3,ty);
1039             fiz3             = _mm256_add_ps(fiz3,tz);
1040
1041             fjx1             = _mm256_add_ps(fjx1,tx);
1042             fjy1             = _mm256_add_ps(fjy1,ty);
1043             fjz1             = _mm256_add_ps(fjz1,tz);
1044
1045             }
1046
1047             /**************************
1048              * CALCULATE INTERACTIONS *
1049              **************************/
1050
1051             if (gmx_mm256_any_lt(rsq32,rcutoff2))
1052             {
1053
1054             /* REACTION-FIELD ELECTROSTATICS */
1055             velec            = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_add_ps(rinv32,_mm256_mul_ps(krf,rsq32)),crf));
1056             felec            = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
1057
1058             cutoff_mask      = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
1059
1060             /* Update potential sum for this i atom from the interaction with this j atom. */
1061             velec            = _mm256_and_ps(velec,cutoff_mask);
1062             velec            = _mm256_andnot_ps(dummy_mask,velec);
1063             velecsum         = _mm256_add_ps(velecsum,velec);
1064
1065             fscal            = felec;
1066
1067             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1068
1069             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1070
1071             /* Calculate temporary vectorial force */
1072             tx               = _mm256_mul_ps(fscal,dx32);
1073             ty               = _mm256_mul_ps(fscal,dy32);
1074             tz               = _mm256_mul_ps(fscal,dz32);
1075
1076             /* Update vectorial force */
1077             fix3             = _mm256_add_ps(fix3,tx);
1078             fiy3             = _mm256_add_ps(fiy3,ty);
1079             fiz3             = _mm256_add_ps(fiz3,tz);
1080
1081             fjx2             = _mm256_add_ps(fjx2,tx);
1082             fjy2             = _mm256_add_ps(fjy2,ty);
1083             fjz2             = _mm256_add_ps(fjz2,tz);
1084
1085             }
1086
1087             /**************************
1088              * CALCULATE INTERACTIONS *
1089              **************************/
1090
1091             if (gmx_mm256_any_lt(rsq33,rcutoff2))
1092             {
1093
1094             /* REACTION-FIELD ELECTROSTATICS */
1095             velec            = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_add_ps(rinv33,_mm256_mul_ps(krf,rsq33)),crf));
1096             felec            = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
1097
1098             cutoff_mask      = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
1099
1100             /* Update potential sum for this i atom from the interaction with this j atom. */
1101             velec            = _mm256_and_ps(velec,cutoff_mask);
1102             velec            = _mm256_andnot_ps(dummy_mask,velec);
1103             velecsum         = _mm256_add_ps(velecsum,velec);
1104
1105             fscal            = felec;
1106
1107             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1108
1109             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1110
1111             /* Calculate temporary vectorial force */
1112             tx               = _mm256_mul_ps(fscal,dx33);
1113             ty               = _mm256_mul_ps(fscal,dy33);
1114             tz               = _mm256_mul_ps(fscal,dz33);
1115
1116             /* Update vectorial force */
1117             fix3             = _mm256_add_ps(fix3,tx);
1118             fiy3             = _mm256_add_ps(fiy3,ty);
1119             fiz3             = _mm256_add_ps(fiz3,tz);
1120
1121             fjx3             = _mm256_add_ps(fjx3,tx);
1122             fjy3             = _mm256_add_ps(fjy3,ty);
1123             fjz3             = _mm256_add_ps(fjz3,tz);
1124
1125             }
1126
1127             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1128             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1129             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1130             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1131             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1132             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1133             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1134             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1135
1136             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1137                                                       fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
1138                                                       fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1139
1140             /* Inner loop uses 324 flops */
1141         }
1142
1143         /* End of innermost loop */
1144
1145         gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1146                                                  f+i_coord_offset+DIM,fshift+i_shift_offset);
1147
1148         ggid                        = gid[iidx];
1149         /* Update potential energies */
1150         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1151
1152         /* Increment number of inner iterations */
1153         inneriter                  += j_index_end - j_index_start;
1154
1155         /* Outer loop uses 19 flops */
1156     }
1157
1158     /* Increment number of outer iterations */
1159     outeriter        += nri;
1160
1161     /* Update outer/inner flops */
1162
1163     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*324);
1164 }
1165 /*
1166  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_avx_256_single
1167  * Electrostatics interaction: ReactionField
1168  * VdW interaction:            None
1169  * Geometry:                   Water4-Water4
1170  * Calculate force/pot:        Force
1171  */
1172 void
1173 nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_avx_256_single
1174                     (t_nblist                    * gmx_restrict       nlist,
1175                      rvec                        * gmx_restrict          xx,
1176                      rvec                        * gmx_restrict          ff,
1177                      t_forcerec                  * gmx_restrict          fr,
1178                      t_mdatoms                   * gmx_restrict     mdatoms,
1179                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1180                      t_nrnb                      * gmx_restrict        nrnb)
1181 {
1182     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1183      * just 0 for non-waters.
1184      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1185      * jnr indices corresponding to data put in the four positions in the SIMD register.
1186      */
1187     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1188     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1189     int              jnrA,jnrB,jnrC,jnrD;
1190     int              jnrE,jnrF,jnrG,jnrH;
1191     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1192     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1193     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1194     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1195     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1196     real             rcutoff_scalar;
1197     real             *shiftvec,*fshift,*x,*f;
1198     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1199     real             scratch[4*DIM];
1200     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1201     real *           vdwioffsetptr1;
1202     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1203     real *           vdwioffsetptr2;
1204     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1205     real *           vdwioffsetptr3;
1206     __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1207     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1208     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1209     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1210     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1211     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
1212     __m256           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1213     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1214     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1215     __m256           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1216     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1217     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1218     __m256           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1219     __m256           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1220     __m256           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1221     __m256           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1222     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
1223     real             *charge;
1224     __m256           dummy_mask,cutoff_mask;
1225     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1226     __m256           one     = _mm256_set1_ps(1.0);
1227     __m256           two     = _mm256_set1_ps(2.0);
1228     x                = xx[0];
1229     f                = ff[0];
1230
1231     nri              = nlist->nri;
1232     iinr             = nlist->iinr;
1233     jindex           = nlist->jindex;
1234     jjnr             = nlist->jjnr;
1235     shiftidx         = nlist->shift;
1236     gid              = nlist->gid;
1237     shiftvec         = fr->shift_vec[0];
1238     fshift           = fr->fshift[0];
1239     facel            = _mm256_set1_ps(fr->epsfac);
1240     charge           = mdatoms->chargeA;
1241     krf              = _mm256_set1_ps(fr->ic->k_rf);
1242     krf2             = _mm256_set1_ps(fr->ic->k_rf*2.0);
1243     crf              = _mm256_set1_ps(fr->ic->c_rf);
1244
1245     /* Setup water-specific parameters */
1246     inr              = nlist->iinr[0];
1247     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1248     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1249     iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
1250
1251     jq1              = _mm256_set1_ps(charge[inr+1]);
1252     jq2              = _mm256_set1_ps(charge[inr+2]);
1253     jq3              = _mm256_set1_ps(charge[inr+3]);
1254     qq11             = _mm256_mul_ps(iq1,jq1);
1255     qq12             = _mm256_mul_ps(iq1,jq2);
1256     qq13             = _mm256_mul_ps(iq1,jq3);
1257     qq21             = _mm256_mul_ps(iq2,jq1);
1258     qq22             = _mm256_mul_ps(iq2,jq2);
1259     qq23             = _mm256_mul_ps(iq2,jq3);
1260     qq31             = _mm256_mul_ps(iq3,jq1);
1261     qq32             = _mm256_mul_ps(iq3,jq2);
1262     qq33             = _mm256_mul_ps(iq3,jq3);
1263
1264     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1265     rcutoff_scalar   = fr->rcoulomb;
1266     rcutoff          = _mm256_set1_ps(rcutoff_scalar);
1267     rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
1268
1269     /* Avoid stupid compiler warnings */
1270     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1271     j_coord_offsetA = 0;
1272     j_coord_offsetB = 0;
1273     j_coord_offsetC = 0;
1274     j_coord_offsetD = 0;
1275     j_coord_offsetE = 0;
1276     j_coord_offsetF = 0;
1277     j_coord_offsetG = 0;
1278     j_coord_offsetH = 0;
1279
1280     outeriter        = 0;
1281     inneriter        = 0;
1282
1283     for(iidx=0;iidx<4*DIM;iidx++)
1284     {
1285         scratch[iidx] = 0.0;
1286     }
1287
1288     /* Start outer loop over neighborlists */
1289     for(iidx=0; iidx<nri; iidx++)
1290     {
1291         /* Load shift vector for this list */
1292         i_shift_offset   = DIM*shiftidx[iidx];
1293
1294         /* Load limits for loop over neighbors */
1295         j_index_start    = jindex[iidx];
1296         j_index_end      = jindex[iidx+1];
1297
1298         /* Get outer coordinate index */
1299         inr              = iinr[iidx];
1300         i_coord_offset   = DIM*inr;
1301
1302         /* Load i particle coords and add shift vector */
1303         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1304                                                     &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1305
1306         fix1             = _mm256_setzero_ps();
1307         fiy1             = _mm256_setzero_ps();
1308         fiz1             = _mm256_setzero_ps();
1309         fix2             = _mm256_setzero_ps();
1310         fiy2             = _mm256_setzero_ps();
1311         fiz2             = _mm256_setzero_ps();
1312         fix3             = _mm256_setzero_ps();
1313         fiy3             = _mm256_setzero_ps();
1314         fiz3             = _mm256_setzero_ps();
1315
1316         /* Start inner kernel loop */
1317         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1318         {
1319
1320             /* Get j neighbor index, and coordinate index */
1321             jnrA             = jjnr[jidx];
1322             jnrB             = jjnr[jidx+1];
1323             jnrC             = jjnr[jidx+2];
1324             jnrD             = jjnr[jidx+3];
1325             jnrE             = jjnr[jidx+4];
1326             jnrF             = jjnr[jidx+5];
1327             jnrG             = jjnr[jidx+6];
1328             jnrH             = jjnr[jidx+7];
1329             j_coord_offsetA  = DIM*jnrA;
1330             j_coord_offsetB  = DIM*jnrB;
1331             j_coord_offsetC  = DIM*jnrC;
1332             j_coord_offsetD  = DIM*jnrD;
1333             j_coord_offsetE  = DIM*jnrE;
1334             j_coord_offsetF  = DIM*jnrF;
1335             j_coord_offsetG  = DIM*jnrG;
1336             j_coord_offsetH  = DIM*jnrH;
1337
1338             /* load j atom coordinates */
1339             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1340                                                  x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1341                                                  x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
1342                                                  x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
1343                                                  &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1344
1345             /* Calculate displacement vector */
1346             dx11             = _mm256_sub_ps(ix1,jx1);
1347             dy11             = _mm256_sub_ps(iy1,jy1);
1348             dz11             = _mm256_sub_ps(iz1,jz1);
1349             dx12             = _mm256_sub_ps(ix1,jx2);
1350             dy12             = _mm256_sub_ps(iy1,jy2);
1351             dz12             = _mm256_sub_ps(iz1,jz2);
1352             dx13             = _mm256_sub_ps(ix1,jx3);
1353             dy13             = _mm256_sub_ps(iy1,jy3);
1354             dz13             = _mm256_sub_ps(iz1,jz3);
1355             dx21             = _mm256_sub_ps(ix2,jx1);
1356             dy21             = _mm256_sub_ps(iy2,jy1);
1357             dz21             = _mm256_sub_ps(iz2,jz1);
1358             dx22             = _mm256_sub_ps(ix2,jx2);
1359             dy22             = _mm256_sub_ps(iy2,jy2);
1360             dz22             = _mm256_sub_ps(iz2,jz2);
1361             dx23             = _mm256_sub_ps(ix2,jx3);
1362             dy23             = _mm256_sub_ps(iy2,jy3);
1363             dz23             = _mm256_sub_ps(iz2,jz3);
1364             dx31             = _mm256_sub_ps(ix3,jx1);
1365             dy31             = _mm256_sub_ps(iy3,jy1);
1366             dz31             = _mm256_sub_ps(iz3,jz1);
1367             dx32             = _mm256_sub_ps(ix3,jx2);
1368             dy32             = _mm256_sub_ps(iy3,jy2);
1369             dz32             = _mm256_sub_ps(iz3,jz2);
1370             dx33             = _mm256_sub_ps(ix3,jx3);
1371             dy33             = _mm256_sub_ps(iy3,jy3);
1372             dz33             = _mm256_sub_ps(iz3,jz3);
1373
1374             /* Calculate squared distance and things based on it */
1375             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1376             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1377             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1378             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1379             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1380             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1381             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1382             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1383             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1384
1385             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1386             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1387             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
1388             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1389             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1390             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
1391             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
1392             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
1393             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
1394
1395             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1396             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1397             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
1398             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1399             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1400             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
1401             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
1402             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
1403             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
1404
1405             fjx1             = _mm256_setzero_ps();
1406             fjy1             = _mm256_setzero_ps();
1407             fjz1             = _mm256_setzero_ps();
1408             fjx2             = _mm256_setzero_ps();
1409             fjy2             = _mm256_setzero_ps();
1410             fjz2             = _mm256_setzero_ps();
1411             fjx3             = _mm256_setzero_ps();
1412             fjy3             = _mm256_setzero_ps();
1413             fjz3             = _mm256_setzero_ps();
1414
1415             /**************************
1416              * CALCULATE INTERACTIONS *
1417              **************************/
1418
1419             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1420             {
1421
1422             /* REACTION-FIELD ELECTROSTATICS */
1423             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
1424
1425             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1426
1427             fscal            = felec;
1428
1429             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1430
1431             /* Calculate temporary vectorial force */
1432             tx               = _mm256_mul_ps(fscal,dx11);
1433             ty               = _mm256_mul_ps(fscal,dy11);
1434             tz               = _mm256_mul_ps(fscal,dz11);
1435
1436             /* Update vectorial force */
1437             fix1             = _mm256_add_ps(fix1,tx);
1438             fiy1             = _mm256_add_ps(fiy1,ty);
1439             fiz1             = _mm256_add_ps(fiz1,tz);
1440
1441             fjx1             = _mm256_add_ps(fjx1,tx);
1442             fjy1             = _mm256_add_ps(fjy1,ty);
1443             fjz1             = _mm256_add_ps(fjz1,tz);
1444
1445             }
1446
1447             /**************************
1448              * CALCULATE INTERACTIONS *
1449              **************************/
1450
1451             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1452             {
1453
1454             /* REACTION-FIELD ELECTROSTATICS */
1455             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1456
1457             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1458
1459             fscal            = felec;
1460
1461             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1462
1463             /* Calculate temporary vectorial force */
1464             tx               = _mm256_mul_ps(fscal,dx12);
1465             ty               = _mm256_mul_ps(fscal,dy12);
1466             tz               = _mm256_mul_ps(fscal,dz12);
1467
1468             /* Update vectorial force */
1469             fix1             = _mm256_add_ps(fix1,tx);
1470             fiy1             = _mm256_add_ps(fiy1,ty);
1471             fiz1             = _mm256_add_ps(fiz1,tz);
1472
1473             fjx2             = _mm256_add_ps(fjx2,tx);
1474             fjy2             = _mm256_add_ps(fjy2,ty);
1475             fjz2             = _mm256_add_ps(fjz2,tz);
1476
1477             }
1478
1479             /**************************
1480              * CALCULATE INTERACTIONS *
1481              **************************/
1482
1483             if (gmx_mm256_any_lt(rsq13,rcutoff2))
1484             {
1485
1486             /* REACTION-FIELD ELECTROSTATICS */
1487             felec            = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
1488
1489             cutoff_mask      = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
1490
1491             fscal            = felec;
1492
1493             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1494
1495             /* Calculate temporary vectorial force */
1496             tx               = _mm256_mul_ps(fscal,dx13);
1497             ty               = _mm256_mul_ps(fscal,dy13);
1498             tz               = _mm256_mul_ps(fscal,dz13);
1499
1500             /* Update vectorial force */
1501             fix1             = _mm256_add_ps(fix1,tx);
1502             fiy1             = _mm256_add_ps(fiy1,ty);
1503             fiz1             = _mm256_add_ps(fiz1,tz);
1504
1505             fjx3             = _mm256_add_ps(fjx3,tx);
1506             fjy3             = _mm256_add_ps(fjy3,ty);
1507             fjz3             = _mm256_add_ps(fjz3,tz);
1508
1509             }
1510
1511             /**************************
1512              * CALCULATE INTERACTIONS *
1513              **************************/
1514
1515             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1516             {
1517
1518             /* REACTION-FIELD ELECTROSTATICS */
1519             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1520
1521             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1522
1523             fscal            = felec;
1524
1525             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1526
1527             /* Calculate temporary vectorial force */
1528             tx               = _mm256_mul_ps(fscal,dx21);
1529             ty               = _mm256_mul_ps(fscal,dy21);
1530             tz               = _mm256_mul_ps(fscal,dz21);
1531
1532             /* Update vectorial force */
1533             fix2             = _mm256_add_ps(fix2,tx);
1534             fiy2             = _mm256_add_ps(fiy2,ty);
1535             fiz2             = _mm256_add_ps(fiz2,tz);
1536
1537             fjx1             = _mm256_add_ps(fjx1,tx);
1538             fjy1             = _mm256_add_ps(fjy1,ty);
1539             fjz1             = _mm256_add_ps(fjz1,tz);
1540
1541             }
1542
1543             /**************************
1544              * CALCULATE INTERACTIONS *
1545              **************************/
1546
1547             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1548             {
1549
1550             /* REACTION-FIELD ELECTROSTATICS */
1551             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1552
1553             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1554
1555             fscal            = felec;
1556
1557             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1558
1559             /* Calculate temporary vectorial force */
1560             tx               = _mm256_mul_ps(fscal,dx22);
1561             ty               = _mm256_mul_ps(fscal,dy22);
1562             tz               = _mm256_mul_ps(fscal,dz22);
1563
1564             /* Update vectorial force */
1565             fix2             = _mm256_add_ps(fix2,tx);
1566             fiy2             = _mm256_add_ps(fiy2,ty);
1567             fiz2             = _mm256_add_ps(fiz2,tz);
1568
1569             fjx2             = _mm256_add_ps(fjx2,tx);
1570             fjy2             = _mm256_add_ps(fjy2,ty);
1571             fjz2             = _mm256_add_ps(fjz2,tz);
1572
1573             }
1574
1575             /**************************
1576              * CALCULATE INTERACTIONS *
1577              **************************/
1578
1579             if (gmx_mm256_any_lt(rsq23,rcutoff2))
1580             {
1581
1582             /* REACTION-FIELD ELECTROSTATICS */
1583             felec            = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
1584
1585             cutoff_mask      = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
1586
1587             fscal            = felec;
1588
1589             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1590
1591             /* Calculate temporary vectorial force */
1592             tx               = _mm256_mul_ps(fscal,dx23);
1593             ty               = _mm256_mul_ps(fscal,dy23);
1594             tz               = _mm256_mul_ps(fscal,dz23);
1595
1596             /* Update vectorial force */
1597             fix2             = _mm256_add_ps(fix2,tx);
1598             fiy2             = _mm256_add_ps(fiy2,ty);
1599             fiz2             = _mm256_add_ps(fiz2,tz);
1600
1601             fjx3             = _mm256_add_ps(fjx3,tx);
1602             fjy3             = _mm256_add_ps(fjy3,ty);
1603             fjz3             = _mm256_add_ps(fjz3,tz);
1604
1605             }
1606
1607             /**************************
1608              * CALCULATE INTERACTIONS *
1609              **************************/
1610
1611             if (gmx_mm256_any_lt(rsq31,rcutoff2))
1612             {
1613
1614             /* REACTION-FIELD ELECTROSTATICS */
1615             felec            = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
1616
1617             cutoff_mask      = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
1618
1619             fscal            = felec;
1620
1621             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1622
1623             /* Calculate temporary vectorial force */
1624             tx               = _mm256_mul_ps(fscal,dx31);
1625             ty               = _mm256_mul_ps(fscal,dy31);
1626             tz               = _mm256_mul_ps(fscal,dz31);
1627
1628             /* Update vectorial force */
1629             fix3             = _mm256_add_ps(fix3,tx);
1630             fiy3             = _mm256_add_ps(fiy3,ty);
1631             fiz3             = _mm256_add_ps(fiz3,tz);
1632
1633             fjx1             = _mm256_add_ps(fjx1,tx);
1634             fjy1             = _mm256_add_ps(fjy1,ty);
1635             fjz1             = _mm256_add_ps(fjz1,tz);
1636
1637             }
1638
1639             /**************************
1640              * CALCULATE INTERACTIONS *
1641              **************************/
1642
1643             if (gmx_mm256_any_lt(rsq32,rcutoff2))
1644             {
1645
1646             /* REACTION-FIELD ELECTROSTATICS */
1647             felec            = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
1648
1649             cutoff_mask      = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
1650
1651             fscal            = felec;
1652
1653             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1654
1655             /* Calculate temporary vectorial force */
1656             tx               = _mm256_mul_ps(fscal,dx32);
1657             ty               = _mm256_mul_ps(fscal,dy32);
1658             tz               = _mm256_mul_ps(fscal,dz32);
1659
1660             /* Update vectorial force */
1661             fix3             = _mm256_add_ps(fix3,tx);
1662             fiy3             = _mm256_add_ps(fiy3,ty);
1663             fiz3             = _mm256_add_ps(fiz3,tz);
1664
1665             fjx2             = _mm256_add_ps(fjx2,tx);
1666             fjy2             = _mm256_add_ps(fjy2,ty);
1667             fjz2             = _mm256_add_ps(fjz2,tz);
1668
1669             }
1670
1671             /**************************
1672              * CALCULATE INTERACTIONS *
1673              **************************/
1674
1675             if (gmx_mm256_any_lt(rsq33,rcutoff2))
1676             {
1677
1678             /* REACTION-FIELD ELECTROSTATICS */
1679             felec            = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
1680
1681             cutoff_mask      = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
1682
1683             fscal            = felec;
1684
1685             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1686
1687             /* Calculate temporary vectorial force */
1688             tx               = _mm256_mul_ps(fscal,dx33);
1689             ty               = _mm256_mul_ps(fscal,dy33);
1690             tz               = _mm256_mul_ps(fscal,dz33);
1691
1692             /* Update vectorial force */
1693             fix3             = _mm256_add_ps(fix3,tx);
1694             fiy3             = _mm256_add_ps(fiy3,ty);
1695             fiz3             = _mm256_add_ps(fiz3,tz);
1696
1697             fjx3             = _mm256_add_ps(fjx3,tx);
1698             fjy3             = _mm256_add_ps(fjy3,ty);
1699             fjz3             = _mm256_add_ps(fjz3,tz);
1700
1701             }
1702
1703             fjptrA             = f+j_coord_offsetA;
1704             fjptrB             = f+j_coord_offsetB;
1705             fjptrC             = f+j_coord_offsetC;
1706             fjptrD             = f+j_coord_offsetD;
1707             fjptrE             = f+j_coord_offsetE;
1708             fjptrF             = f+j_coord_offsetF;
1709             fjptrG             = f+j_coord_offsetG;
1710             fjptrH             = f+j_coord_offsetH;
1711
1712             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1713                                                       fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
1714                                                       fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1715
1716             /* Inner loop uses 270 flops */
1717         }
1718
1719         if(jidx<j_index_end)
1720         {
1721
1722             /* Get j neighbor index, and coordinate index */
1723             jnrlistA         = jjnr[jidx];
1724             jnrlistB         = jjnr[jidx+1];
1725             jnrlistC         = jjnr[jidx+2];
1726             jnrlistD         = jjnr[jidx+3];
1727             jnrlistE         = jjnr[jidx+4];
1728             jnrlistF         = jjnr[jidx+5];
1729             jnrlistG         = jjnr[jidx+6];
1730             jnrlistH         = jjnr[jidx+7];
1731             /* Sign of each element will be negative for non-real atoms.
1732              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1733              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1734              */
1735             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1736                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1737                                             
1738             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1739             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1740             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1741             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1742             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
1743             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
1744             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
1745             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
1746             j_coord_offsetA  = DIM*jnrA;
1747             j_coord_offsetB  = DIM*jnrB;
1748             j_coord_offsetC  = DIM*jnrC;
1749             j_coord_offsetD  = DIM*jnrD;
1750             j_coord_offsetE  = DIM*jnrE;
1751             j_coord_offsetF  = DIM*jnrF;
1752             j_coord_offsetG  = DIM*jnrG;
1753             j_coord_offsetH  = DIM*jnrH;
1754
1755             /* load j atom coordinates */
1756             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1757                                                  x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1758                                                  x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
1759                                                  x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
1760                                                  &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1761
1762             /* Calculate displacement vector */
1763             dx11             = _mm256_sub_ps(ix1,jx1);
1764             dy11             = _mm256_sub_ps(iy1,jy1);
1765             dz11             = _mm256_sub_ps(iz1,jz1);
1766             dx12             = _mm256_sub_ps(ix1,jx2);
1767             dy12             = _mm256_sub_ps(iy1,jy2);
1768             dz12             = _mm256_sub_ps(iz1,jz2);
1769             dx13             = _mm256_sub_ps(ix1,jx3);
1770             dy13             = _mm256_sub_ps(iy1,jy3);
1771             dz13             = _mm256_sub_ps(iz1,jz3);
1772             dx21             = _mm256_sub_ps(ix2,jx1);
1773             dy21             = _mm256_sub_ps(iy2,jy1);
1774             dz21             = _mm256_sub_ps(iz2,jz1);
1775             dx22             = _mm256_sub_ps(ix2,jx2);
1776             dy22             = _mm256_sub_ps(iy2,jy2);
1777             dz22             = _mm256_sub_ps(iz2,jz2);
1778             dx23             = _mm256_sub_ps(ix2,jx3);
1779             dy23             = _mm256_sub_ps(iy2,jy3);
1780             dz23             = _mm256_sub_ps(iz2,jz3);
1781             dx31             = _mm256_sub_ps(ix3,jx1);
1782             dy31             = _mm256_sub_ps(iy3,jy1);
1783             dz31             = _mm256_sub_ps(iz3,jz1);
1784             dx32             = _mm256_sub_ps(ix3,jx2);
1785             dy32             = _mm256_sub_ps(iy3,jy2);
1786             dz32             = _mm256_sub_ps(iz3,jz2);
1787             dx33             = _mm256_sub_ps(ix3,jx3);
1788             dy33             = _mm256_sub_ps(iy3,jy3);
1789             dz33             = _mm256_sub_ps(iz3,jz3);
1790
1791             /* Calculate squared distance and things based on it */
1792             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1793             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1794             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1795             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1796             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1797             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1798             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1799             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1800             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1801
1802             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1803             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1804             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
1805             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1806             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1807             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
1808             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
1809             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
1810             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
1811
1812             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1813             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1814             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
1815             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1816             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1817             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
1818             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
1819             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
1820             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
1821
1822             fjx1             = _mm256_setzero_ps();
1823             fjy1             = _mm256_setzero_ps();
1824             fjz1             = _mm256_setzero_ps();
1825             fjx2             = _mm256_setzero_ps();
1826             fjy2             = _mm256_setzero_ps();
1827             fjz2             = _mm256_setzero_ps();
1828             fjx3             = _mm256_setzero_ps();
1829             fjy3             = _mm256_setzero_ps();
1830             fjz3             = _mm256_setzero_ps();
1831
1832             /**************************
1833              * CALCULATE INTERACTIONS *
1834              **************************/
1835
1836             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1837             {
1838
1839             /* REACTION-FIELD ELECTROSTATICS */
1840             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
1841
1842             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1843
1844             fscal            = felec;
1845
1846             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1847
1848             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1849
1850             /* Calculate temporary vectorial force */
1851             tx               = _mm256_mul_ps(fscal,dx11);
1852             ty               = _mm256_mul_ps(fscal,dy11);
1853             tz               = _mm256_mul_ps(fscal,dz11);
1854
1855             /* Update vectorial force */
1856             fix1             = _mm256_add_ps(fix1,tx);
1857             fiy1             = _mm256_add_ps(fiy1,ty);
1858             fiz1             = _mm256_add_ps(fiz1,tz);
1859
1860             fjx1             = _mm256_add_ps(fjx1,tx);
1861             fjy1             = _mm256_add_ps(fjy1,ty);
1862             fjz1             = _mm256_add_ps(fjz1,tz);
1863
1864             }
1865
1866             /**************************
1867              * CALCULATE INTERACTIONS *
1868              **************************/
1869
1870             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1871             {
1872
1873             /* REACTION-FIELD ELECTROSTATICS */
1874             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1875
1876             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1877
1878             fscal            = felec;
1879
1880             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1881
1882             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1883
1884             /* Calculate temporary vectorial force */
1885             tx               = _mm256_mul_ps(fscal,dx12);
1886             ty               = _mm256_mul_ps(fscal,dy12);
1887             tz               = _mm256_mul_ps(fscal,dz12);
1888
1889             /* Update vectorial force */
1890             fix1             = _mm256_add_ps(fix1,tx);
1891             fiy1             = _mm256_add_ps(fiy1,ty);
1892             fiz1             = _mm256_add_ps(fiz1,tz);
1893
1894             fjx2             = _mm256_add_ps(fjx2,tx);
1895             fjy2             = _mm256_add_ps(fjy2,ty);
1896             fjz2             = _mm256_add_ps(fjz2,tz);
1897
1898             }
1899
1900             /**************************
1901              * CALCULATE INTERACTIONS *
1902              **************************/
1903
1904             if (gmx_mm256_any_lt(rsq13,rcutoff2))
1905             {
1906
1907             /* REACTION-FIELD ELECTROSTATICS */
1908             felec            = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
1909
1910             cutoff_mask      = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
1911
1912             fscal            = felec;
1913
1914             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1915
1916             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1917
1918             /* Calculate temporary vectorial force */
1919             tx               = _mm256_mul_ps(fscal,dx13);
1920             ty               = _mm256_mul_ps(fscal,dy13);
1921             tz               = _mm256_mul_ps(fscal,dz13);
1922
1923             /* Update vectorial force */
1924             fix1             = _mm256_add_ps(fix1,tx);
1925             fiy1             = _mm256_add_ps(fiy1,ty);
1926             fiz1             = _mm256_add_ps(fiz1,tz);
1927
1928             fjx3             = _mm256_add_ps(fjx3,tx);
1929             fjy3             = _mm256_add_ps(fjy3,ty);
1930             fjz3             = _mm256_add_ps(fjz3,tz);
1931
1932             }
1933
1934             /**************************
1935              * CALCULATE INTERACTIONS *
1936              **************************/
1937
1938             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1939             {
1940
1941             /* REACTION-FIELD ELECTROSTATICS */
1942             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1943
1944             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1945
1946             fscal            = felec;
1947
1948             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1949
1950             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1951
1952             /* Calculate temporary vectorial force */
1953             tx               = _mm256_mul_ps(fscal,dx21);
1954             ty               = _mm256_mul_ps(fscal,dy21);
1955             tz               = _mm256_mul_ps(fscal,dz21);
1956
1957             /* Update vectorial force */
1958             fix2             = _mm256_add_ps(fix2,tx);
1959             fiy2             = _mm256_add_ps(fiy2,ty);
1960             fiz2             = _mm256_add_ps(fiz2,tz);
1961
1962             fjx1             = _mm256_add_ps(fjx1,tx);
1963             fjy1             = _mm256_add_ps(fjy1,ty);
1964             fjz1             = _mm256_add_ps(fjz1,tz);
1965
1966             }
1967
1968             /**************************
1969              * CALCULATE INTERACTIONS *
1970              **************************/
1971
1972             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1973             {
1974
1975             /* REACTION-FIELD ELECTROSTATICS */
1976             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1977
1978             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1979
1980             fscal            = felec;
1981
1982             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1983
1984             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1985
1986             /* Calculate temporary vectorial force */
1987             tx               = _mm256_mul_ps(fscal,dx22);
1988             ty               = _mm256_mul_ps(fscal,dy22);
1989             tz               = _mm256_mul_ps(fscal,dz22);
1990
1991             /* Update vectorial force */
1992             fix2             = _mm256_add_ps(fix2,tx);
1993             fiy2             = _mm256_add_ps(fiy2,ty);
1994             fiz2             = _mm256_add_ps(fiz2,tz);
1995
1996             fjx2             = _mm256_add_ps(fjx2,tx);
1997             fjy2             = _mm256_add_ps(fjy2,ty);
1998             fjz2             = _mm256_add_ps(fjz2,tz);
1999
2000             }
2001
2002             /**************************
2003              * CALCULATE INTERACTIONS *
2004              **************************/
2005
2006             if (gmx_mm256_any_lt(rsq23,rcutoff2))
2007             {
2008
2009             /* REACTION-FIELD ELECTROSTATICS */
2010             felec            = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
2011
2012             cutoff_mask      = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
2013
2014             fscal            = felec;
2015
2016             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2017
2018             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2019
2020             /* Calculate temporary vectorial force */
2021             tx               = _mm256_mul_ps(fscal,dx23);
2022             ty               = _mm256_mul_ps(fscal,dy23);
2023             tz               = _mm256_mul_ps(fscal,dz23);
2024
2025             /* Update vectorial force */
2026             fix2             = _mm256_add_ps(fix2,tx);
2027             fiy2             = _mm256_add_ps(fiy2,ty);
2028             fiz2             = _mm256_add_ps(fiz2,tz);
2029
2030             fjx3             = _mm256_add_ps(fjx3,tx);
2031             fjy3             = _mm256_add_ps(fjy3,ty);
2032             fjz3             = _mm256_add_ps(fjz3,tz);
2033
2034             }
2035
2036             /**************************
2037              * CALCULATE INTERACTIONS *
2038              **************************/
2039
2040             if (gmx_mm256_any_lt(rsq31,rcutoff2))
2041             {
2042
2043             /* REACTION-FIELD ELECTROSTATICS */
2044             felec            = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
2045
2046             cutoff_mask      = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
2047
2048             fscal            = felec;
2049
2050             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2051
2052             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2053
2054             /* Calculate temporary vectorial force */
2055             tx               = _mm256_mul_ps(fscal,dx31);
2056             ty               = _mm256_mul_ps(fscal,dy31);
2057             tz               = _mm256_mul_ps(fscal,dz31);
2058
2059             /* Update vectorial force */
2060             fix3             = _mm256_add_ps(fix3,tx);
2061             fiy3             = _mm256_add_ps(fiy3,ty);
2062             fiz3             = _mm256_add_ps(fiz3,tz);
2063
2064             fjx1             = _mm256_add_ps(fjx1,tx);
2065             fjy1             = _mm256_add_ps(fjy1,ty);
2066             fjz1             = _mm256_add_ps(fjz1,tz);
2067
2068             }
2069
2070             /**************************
2071              * CALCULATE INTERACTIONS *
2072              **************************/
2073
2074             if (gmx_mm256_any_lt(rsq32,rcutoff2))
2075             {
2076
2077             /* REACTION-FIELD ELECTROSTATICS */
2078             felec            = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
2079
2080             cutoff_mask      = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
2081
2082             fscal            = felec;
2083
2084             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2085
2086             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2087
2088             /* Calculate temporary vectorial force */
2089             tx               = _mm256_mul_ps(fscal,dx32);
2090             ty               = _mm256_mul_ps(fscal,dy32);
2091             tz               = _mm256_mul_ps(fscal,dz32);
2092
2093             /* Update vectorial force */
2094             fix3             = _mm256_add_ps(fix3,tx);
2095             fiy3             = _mm256_add_ps(fiy3,ty);
2096             fiz3             = _mm256_add_ps(fiz3,tz);
2097
2098             fjx2             = _mm256_add_ps(fjx2,tx);
2099             fjy2             = _mm256_add_ps(fjy2,ty);
2100             fjz2             = _mm256_add_ps(fjz2,tz);
2101
2102             }
2103
2104             /**************************
2105              * CALCULATE INTERACTIONS *
2106              **************************/
2107
2108             if (gmx_mm256_any_lt(rsq33,rcutoff2))
2109             {
2110
2111             /* REACTION-FIELD ELECTROSTATICS */
2112             felec            = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
2113
2114             cutoff_mask      = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
2115
2116             fscal            = felec;
2117
2118             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2119
2120             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2121
2122             /* Calculate temporary vectorial force */
2123             tx               = _mm256_mul_ps(fscal,dx33);
2124             ty               = _mm256_mul_ps(fscal,dy33);
2125             tz               = _mm256_mul_ps(fscal,dz33);
2126
2127             /* Update vectorial force */
2128             fix3             = _mm256_add_ps(fix3,tx);
2129             fiy3             = _mm256_add_ps(fiy3,ty);
2130             fiz3             = _mm256_add_ps(fiz3,tz);
2131
2132             fjx3             = _mm256_add_ps(fjx3,tx);
2133             fjy3             = _mm256_add_ps(fjy3,ty);
2134             fjz3             = _mm256_add_ps(fjz3,tz);
2135
2136             }
2137
2138             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2139             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2140             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2141             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2142             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2143             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2144             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2145             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2146
2147             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2148                                                       fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
2149                                                       fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2150
2151             /* Inner loop uses 270 flops */
2152         }
2153
2154         /* End of innermost loop */
2155
2156         gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2157                                                  f+i_coord_offset+DIM,fshift+i_shift_offset);
2158
2159         /* Increment number of inner iterations */
2160         inneriter                  += j_index_end - j_index_start;
2161
2162         /* Outer loop uses 18 flops */
2163     }
2164
2165     /* Increment number of outer iterations */
2166     outeriter        += nri;
2167
2168     /* Update outer/inner flops */
2169
2170     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*270);
2171 }