Compile nonbonded kernels as C++
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_double / nb_kernel_ElecRFCut_VdwNone_GeomW3W3_avx_256_double.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_double kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_avx_256_double.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_avx_256_double
51  * Electrostatics interaction: ReactionField
52  * VdW interaction:            None
53  * Geometry:                   Water3-Water3
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_avx_256_double
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
67      * just 0 for non-waters.
68      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB,jnrC,jnrD;
74     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
76     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81     real             scratch[4*DIM];
82     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83     real *           vdwioffsetptr0;
84     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85     real *           vdwioffsetptr1;
86     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87     real *           vdwioffsetptr2;
88     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
90     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
92     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
94     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96     __m256d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
97     __m256d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
98     __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
99     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101     __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
102     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
105     real             *charge;
106     __m256d          dummy_mask,cutoff_mask;
107     __m128           tmpmask0,tmpmask1;
108     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
109     __m256d          one     = _mm256_set1_pd(1.0);
110     __m256d          two     = _mm256_set1_pd(2.0);
111     x                = xx[0];
112     f                = ff[0];
113
114     nri              = nlist->nri;
115     iinr             = nlist->iinr;
116     jindex           = nlist->jindex;
117     jjnr             = nlist->jjnr;
118     shiftidx         = nlist->shift;
119     gid              = nlist->gid;
120     shiftvec         = fr->shift_vec[0];
121     fshift           = fr->fshift[0];
122     facel            = _mm256_set1_pd(fr->ic->epsfac);
123     charge           = mdatoms->chargeA;
124     krf              = _mm256_set1_pd(fr->ic->k_rf);
125     krf2             = _mm256_set1_pd(fr->ic->k_rf*2.0);
126     crf              = _mm256_set1_pd(fr->ic->c_rf);
127
128     /* Setup water-specific parameters */
129     inr              = nlist->iinr[0];
130     iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
131     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
132     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
133
134     jq0              = _mm256_set1_pd(charge[inr+0]);
135     jq1              = _mm256_set1_pd(charge[inr+1]);
136     jq2              = _mm256_set1_pd(charge[inr+2]);
137     qq00             = _mm256_mul_pd(iq0,jq0);
138     qq01             = _mm256_mul_pd(iq0,jq1);
139     qq02             = _mm256_mul_pd(iq0,jq2);
140     qq10             = _mm256_mul_pd(iq1,jq0);
141     qq11             = _mm256_mul_pd(iq1,jq1);
142     qq12             = _mm256_mul_pd(iq1,jq2);
143     qq20             = _mm256_mul_pd(iq2,jq0);
144     qq21             = _mm256_mul_pd(iq2,jq1);
145     qq22             = _mm256_mul_pd(iq2,jq2);
146
147     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
148     rcutoff_scalar   = fr->ic->rcoulomb;
149     rcutoff          = _mm256_set1_pd(rcutoff_scalar);
150     rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
151
152     /* Avoid stupid compiler warnings */
153     jnrA = jnrB = jnrC = jnrD = 0;
154     j_coord_offsetA = 0;
155     j_coord_offsetB = 0;
156     j_coord_offsetC = 0;
157     j_coord_offsetD = 0;
158
159     outeriter        = 0;
160     inneriter        = 0;
161
162     for(iidx=0;iidx<4*DIM;iidx++)
163     {
164         scratch[iidx] = 0.0;
165     }
166
167     /* Start outer loop over neighborlists */
168     for(iidx=0; iidx<nri; iidx++)
169     {
170         /* Load shift vector for this list */
171         i_shift_offset   = DIM*shiftidx[iidx];
172
173         /* Load limits for loop over neighbors */
174         j_index_start    = jindex[iidx];
175         j_index_end      = jindex[iidx+1];
176
177         /* Get outer coordinate index */
178         inr              = iinr[iidx];
179         i_coord_offset   = DIM*inr;
180
181         /* Load i particle coords and add shift vector */
182         gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
183                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
184
185         fix0             = _mm256_setzero_pd();
186         fiy0             = _mm256_setzero_pd();
187         fiz0             = _mm256_setzero_pd();
188         fix1             = _mm256_setzero_pd();
189         fiy1             = _mm256_setzero_pd();
190         fiz1             = _mm256_setzero_pd();
191         fix2             = _mm256_setzero_pd();
192         fiy2             = _mm256_setzero_pd();
193         fiz2             = _mm256_setzero_pd();
194
195         /* Reset potential sums */
196         velecsum         = _mm256_setzero_pd();
197
198         /* Start inner kernel loop */
199         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
200         {
201
202             /* Get j neighbor index, and coordinate index */
203             jnrA             = jjnr[jidx];
204             jnrB             = jjnr[jidx+1];
205             jnrC             = jjnr[jidx+2];
206             jnrD             = jjnr[jidx+3];
207             j_coord_offsetA  = DIM*jnrA;
208             j_coord_offsetB  = DIM*jnrB;
209             j_coord_offsetC  = DIM*jnrC;
210             j_coord_offsetD  = DIM*jnrD;
211
212             /* load j atom coordinates */
213             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
214                                                  x+j_coord_offsetC,x+j_coord_offsetD,
215                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
216
217             /* Calculate displacement vector */
218             dx00             = _mm256_sub_pd(ix0,jx0);
219             dy00             = _mm256_sub_pd(iy0,jy0);
220             dz00             = _mm256_sub_pd(iz0,jz0);
221             dx01             = _mm256_sub_pd(ix0,jx1);
222             dy01             = _mm256_sub_pd(iy0,jy1);
223             dz01             = _mm256_sub_pd(iz0,jz1);
224             dx02             = _mm256_sub_pd(ix0,jx2);
225             dy02             = _mm256_sub_pd(iy0,jy2);
226             dz02             = _mm256_sub_pd(iz0,jz2);
227             dx10             = _mm256_sub_pd(ix1,jx0);
228             dy10             = _mm256_sub_pd(iy1,jy0);
229             dz10             = _mm256_sub_pd(iz1,jz0);
230             dx11             = _mm256_sub_pd(ix1,jx1);
231             dy11             = _mm256_sub_pd(iy1,jy1);
232             dz11             = _mm256_sub_pd(iz1,jz1);
233             dx12             = _mm256_sub_pd(ix1,jx2);
234             dy12             = _mm256_sub_pd(iy1,jy2);
235             dz12             = _mm256_sub_pd(iz1,jz2);
236             dx20             = _mm256_sub_pd(ix2,jx0);
237             dy20             = _mm256_sub_pd(iy2,jy0);
238             dz20             = _mm256_sub_pd(iz2,jz0);
239             dx21             = _mm256_sub_pd(ix2,jx1);
240             dy21             = _mm256_sub_pd(iy2,jy1);
241             dz21             = _mm256_sub_pd(iz2,jz1);
242             dx22             = _mm256_sub_pd(ix2,jx2);
243             dy22             = _mm256_sub_pd(iy2,jy2);
244             dz22             = _mm256_sub_pd(iz2,jz2);
245
246             /* Calculate squared distance and things based on it */
247             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
248             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
249             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
250             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
251             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
252             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
253             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
254             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
255             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
256
257             rinv00           = avx256_invsqrt_d(rsq00);
258             rinv01           = avx256_invsqrt_d(rsq01);
259             rinv02           = avx256_invsqrt_d(rsq02);
260             rinv10           = avx256_invsqrt_d(rsq10);
261             rinv11           = avx256_invsqrt_d(rsq11);
262             rinv12           = avx256_invsqrt_d(rsq12);
263             rinv20           = avx256_invsqrt_d(rsq20);
264             rinv21           = avx256_invsqrt_d(rsq21);
265             rinv22           = avx256_invsqrt_d(rsq22);
266
267             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
268             rinvsq01         = _mm256_mul_pd(rinv01,rinv01);
269             rinvsq02         = _mm256_mul_pd(rinv02,rinv02);
270             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
271             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
272             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
273             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
274             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
275             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
276
277             fjx0             = _mm256_setzero_pd();
278             fjy0             = _mm256_setzero_pd();
279             fjz0             = _mm256_setzero_pd();
280             fjx1             = _mm256_setzero_pd();
281             fjy1             = _mm256_setzero_pd();
282             fjz1             = _mm256_setzero_pd();
283             fjx2             = _mm256_setzero_pd();
284             fjy2             = _mm256_setzero_pd();
285             fjz2             = _mm256_setzero_pd();
286
287             /**************************
288              * CALCULATE INTERACTIONS *
289              **************************/
290
291             if (gmx_mm256_any_lt(rsq00,rcutoff2))
292             {
293
294             /* REACTION-FIELD ELECTROSTATICS */
295             velec            = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_add_pd(rinv00,_mm256_mul_pd(krf,rsq00)),crf));
296             felec            = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_mul_pd(rinv00,rinvsq00),krf2));
297
298             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
299
300             /* Update potential sum for this i atom from the interaction with this j atom. */
301             velec            = _mm256_and_pd(velec,cutoff_mask);
302             velecsum         = _mm256_add_pd(velecsum,velec);
303
304             fscal            = felec;
305
306             fscal            = _mm256_and_pd(fscal,cutoff_mask);
307
308             /* Calculate temporary vectorial force */
309             tx               = _mm256_mul_pd(fscal,dx00);
310             ty               = _mm256_mul_pd(fscal,dy00);
311             tz               = _mm256_mul_pd(fscal,dz00);
312
313             /* Update vectorial force */
314             fix0             = _mm256_add_pd(fix0,tx);
315             fiy0             = _mm256_add_pd(fiy0,ty);
316             fiz0             = _mm256_add_pd(fiz0,tz);
317
318             fjx0             = _mm256_add_pd(fjx0,tx);
319             fjy0             = _mm256_add_pd(fjy0,ty);
320             fjz0             = _mm256_add_pd(fjz0,tz);
321
322             }
323
324             /**************************
325              * CALCULATE INTERACTIONS *
326              **************************/
327
328             if (gmx_mm256_any_lt(rsq01,rcutoff2))
329             {
330
331             /* REACTION-FIELD ELECTROSTATICS */
332             velec            = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_add_pd(rinv01,_mm256_mul_pd(krf,rsq01)),crf));
333             felec            = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_mul_pd(rinv01,rinvsq01),krf2));
334
335             cutoff_mask      = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
336
337             /* Update potential sum for this i atom from the interaction with this j atom. */
338             velec            = _mm256_and_pd(velec,cutoff_mask);
339             velecsum         = _mm256_add_pd(velecsum,velec);
340
341             fscal            = felec;
342
343             fscal            = _mm256_and_pd(fscal,cutoff_mask);
344
345             /* Calculate temporary vectorial force */
346             tx               = _mm256_mul_pd(fscal,dx01);
347             ty               = _mm256_mul_pd(fscal,dy01);
348             tz               = _mm256_mul_pd(fscal,dz01);
349
350             /* Update vectorial force */
351             fix0             = _mm256_add_pd(fix0,tx);
352             fiy0             = _mm256_add_pd(fiy0,ty);
353             fiz0             = _mm256_add_pd(fiz0,tz);
354
355             fjx1             = _mm256_add_pd(fjx1,tx);
356             fjy1             = _mm256_add_pd(fjy1,ty);
357             fjz1             = _mm256_add_pd(fjz1,tz);
358
359             }
360
361             /**************************
362              * CALCULATE INTERACTIONS *
363              **************************/
364
365             if (gmx_mm256_any_lt(rsq02,rcutoff2))
366             {
367
368             /* REACTION-FIELD ELECTROSTATICS */
369             velec            = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_add_pd(rinv02,_mm256_mul_pd(krf,rsq02)),crf));
370             felec            = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_mul_pd(rinv02,rinvsq02),krf2));
371
372             cutoff_mask      = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
373
374             /* Update potential sum for this i atom from the interaction with this j atom. */
375             velec            = _mm256_and_pd(velec,cutoff_mask);
376             velecsum         = _mm256_add_pd(velecsum,velec);
377
378             fscal            = felec;
379
380             fscal            = _mm256_and_pd(fscal,cutoff_mask);
381
382             /* Calculate temporary vectorial force */
383             tx               = _mm256_mul_pd(fscal,dx02);
384             ty               = _mm256_mul_pd(fscal,dy02);
385             tz               = _mm256_mul_pd(fscal,dz02);
386
387             /* Update vectorial force */
388             fix0             = _mm256_add_pd(fix0,tx);
389             fiy0             = _mm256_add_pd(fiy0,ty);
390             fiz0             = _mm256_add_pd(fiz0,tz);
391
392             fjx2             = _mm256_add_pd(fjx2,tx);
393             fjy2             = _mm256_add_pd(fjy2,ty);
394             fjz2             = _mm256_add_pd(fjz2,tz);
395
396             }
397
398             /**************************
399              * CALCULATE INTERACTIONS *
400              **************************/
401
402             if (gmx_mm256_any_lt(rsq10,rcutoff2))
403             {
404
405             /* REACTION-FIELD ELECTROSTATICS */
406             velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_add_pd(rinv10,_mm256_mul_pd(krf,rsq10)),crf));
407             felec            = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
408
409             cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
410
411             /* Update potential sum for this i atom from the interaction with this j atom. */
412             velec            = _mm256_and_pd(velec,cutoff_mask);
413             velecsum         = _mm256_add_pd(velecsum,velec);
414
415             fscal            = felec;
416
417             fscal            = _mm256_and_pd(fscal,cutoff_mask);
418
419             /* Calculate temporary vectorial force */
420             tx               = _mm256_mul_pd(fscal,dx10);
421             ty               = _mm256_mul_pd(fscal,dy10);
422             tz               = _mm256_mul_pd(fscal,dz10);
423
424             /* Update vectorial force */
425             fix1             = _mm256_add_pd(fix1,tx);
426             fiy1             = _mm256_add_pd(fiy1,ty);
427             fiz1             = _mm256_add_pd(fiz1,tz);
428
429             fjx0             = _mm256_add_pd(fjx0,tx);
430             fjy0             = _mm256_add_pd(fjy0,ty);
431             fjz0             = _mm256_add_pd(fjz0,tz);
432
433             }
434
435             /**************************
436              * CALCULATE INTERACTIONS *
437              **************************/
438
439             if (gmx_mm256_any_lt(rsq11,rcutoff2))
440             {
441
442             /* REACTION-FIELD ELECTROSTATICS */
443             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
444             felec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
445
446             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
447
448             /* Update potential sum for this i atom from the interaction with this j atom. */
449             velec            = _mm256_and_pd(velec,cutoff_mask);
450             velecsum         = _mm256_add_pd(velecsum,velec);
451
452             fscal            = felec;
453
454             fscal            = _mm256_and_pd(fscal,cutoff_mask);
455
456             /* Calculate temporary vectorial force */
457             tx               = _mm256_mul_pd(fscal,dx11);
458             ty               = _mm256_mul_pd(fscal,dy11);
459             tz               = _mm256_mul_pd(fscal,dz11);
460
461             /* Update vectorial force */
462             fix1             = _mm256_add_pd(fix1,tx);
463             fiy1             = _mm256_add_pd(fiy1,ty);
464             fiz1             = _mm256_add_pd(fiz1,tz);
465
466             fjx1             = _mm256_add_pd(fjx1,tx);
467             fjy1             = _mm256_add_pd(fjy1,ty);
468             fjz1             = _mm256_add_pd(fjz1,tz);
469
470             }
471
472             /**************************
473              * CALCULATE INTERACTIONS *
474              **************************/
475
476             if (gmx_mm256_any_lt(rsq12,rcutoff2))
477             {
478
479             /* REACTION-FIELD ELECTROSTATICS */
480             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
481             felec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
482
483             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
484
485             /* Update potential sum for this i atom from the interaction with this j atom. */
486             velec            = _mm256_and_pd(velec,cutoff_mask);
487             velecsum         = _mm256_add_pd(velecsum,velec);
488
489             fscal            = felec;
490
491             fscal            = _mm256_and_pd(fscal,cutoff_mask);
492
493             /* Calculate temporary vectorial force */
494             tx               = _mm256_mul_pd(fscal,dx12);
495             ty               = _mm256_mul_pd(fscal,dy12);
496             tz               = _mm256_mul_pd(fscal,dz12);
497
498             /* Update vectorial force */
499             fix1             = _mm256_add_pd(fix1,tx);
500             fiy1             = _mm256_add_pd(fiy1,ty);
501             fiz1             = _mm256_add_pd(fiz1,tz);
502
503             fjx2             = _mm256_add_pd(fjx2,tx);
504             fjy2             = _mm256_add_pd(fjy2,ty);
505             fjz2             = _mm256_add_pd(fjz2,tz);
506
507             }
508
509             /**************************
510              * CALCULATE INTERACTIONS *
511              **************************/
512
513             if (gmx_mm256_any_lt(rsq20,rcutoff2))
514             {
515
516             /* REACTION-FIELD ELECTROSTATICS */
517             velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_add_pd(rinv20,_mm256_mul_pd(krf,rsq20)),crf));
518             felec            = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
519
520             cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
521
522             /* Update potential sum for this i atom from the interaction with this j atom. */
523             velec            = _mm256_and_pd(velec,cutoff_mask);
524             velecsum         = _mm256_add_pd(velecsum,velec);
525
526             fscal            = felec;
527
528             fscal            = _mm256_and_pd(fscal,cutoff_mask);
529
530             /* Calculate temporary vectorial force */
531             tx               = _mm256_mul_pd(fscal,dx20);
532             ty               = _mm256_mul_pd(fscal,dy20);
533             tz               = _mm256_mul_pd(fscal,dz20);
534
535             /* Update vectorial force */
536             fix2             = _mm256_add_pd(fix2,tx);
537             fiy2             = _mm256_add_pd(fiy2,ty);
538             fiz2             = _mm256_add_pd(fiz2,tz);
539
540             fjx0             = _mm256_add_pd(fjx0,tx);
541             fjy0             = _mm256_add_pd(fjy0,ty);
542             fjz0             = _mm256_add_pd(fjz0,tz);
543
544             }
545
546             /**************************
547              * CALCULATE INTERACTIONS *
548              **************************/
549
550             if (gmx_mm256_any_lt(rsq21,rcutoff2))
551             {
552
553             /* REACTION-FIELD ELECTROSTATICS */
554             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
555             felec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
556
557             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
558
559             /* Update potential sum for this i atom from the interaction with this j atom. */
560             velec            = _mm256_and_pd(velec,cutoff_mask);
561             velecsum         = _mm256_add_pd(velecsum,velec);
562
563             fscal            = felec;
564
565             fscal            = _mm256_and_pd(fscal,cutoff_mask);
566
567             /* Calculate temporary vectorial force */
568             tx               = _mm256_mul_pd(fscal,dx21);
569             ty               = _mm256_mul_pd(fscal,dy21);
570             tz               = _mm256_mul_pd(fscal,dz21);
571
572             /* Update vectorial force */
573             fix2             = _mm256_add_pd(fix2,tx);
574             fiy2             = _mm256_add_pd(fiy2,ty);
575             fiz2             = _mm256_add_pd(fiz2,tz);
576
577             fjx1             = _mm256_add_pd(fjx1,tx);
578             fjy1             = _mm256_add_pd(fjy1,ty);
579             fjz1             = _mm256_add_pd(fjz1,tz);
580
581             }
582
583             /**************************
584              * CALCULATE INTERACTIONS *
585              **************************/
586
587             if (gmx_mm256_any_lt(rsq22,rcutoff2))
588             {
589
590             /* REACTION-FIELD ELECTROSTATICS */
591             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
592             felec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
593
594             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
595
596             /* Update potential sum for this i atom from the interaction with this j atom. */
597             velec            = _mm256_and_pd(velec,cutoff_mask);
598             velecsum         = _mm256_add_pd(velecsum,velec);
599
600             fscal            = felec;
601
602             fscal            = _mm256_and_pd(fscal,cutoff_mask);
603
604             /* Calculate temporary vectorial force */
605             tx               = _mm256_mul_pd(fscal,dx22);
606             ty               = _mm256_mul_pd(fscal,dy22);
607             tz               = _mm256_mul_pd(fscal,dz22);
608
609             /* Update vectorial force */
610             fix2             = _mm256_add_pd(fix2,tx);
611             fiy2             = _mm256_add_pd(fiy2,ty);
612             fiz2             = _mm256_add_pd(fiz2,tz);
613
614             fjx2             = _mm256_add_pd(fjx2,tx);
615             fjy2             = _mm256_add_pd(fjy2,ty);
616             fjz2             = _mm256_add_pd(fjz2,tz);
617
618             }
619
620             fjptrA             = f+j_coord_offsetA;
621             fjptrB             = f+j_coord_offsetB;
622             fjptrC             = f+j_coord_offsetC;
623             fjptrD             = f+j_coord_offsetD;
624
625             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
626                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
627
628             /* Inner loop uses 324 flops */
629         }
630
631         if(jidx<j_index_end)
632         {
633
634             /* Get j neighbor index, and coordinate index */
635             jnrlistA         = jjnr[jidx];
636             jnrlistB         = jjnr[jidx+1];
637             jnrlistC         = jjnr[jidx+2];
638             jnrlistD         = jjnr[jidx+3];
639             /* Sign of each element will be negative for non-real atoms.
640              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
641              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
642              */
643             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
644
645             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
646             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
647             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
648
649             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
650             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
651             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
652             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
653             j_coord_offsetA  = DIM*jnrA;
654             j_coord_offsetB  = DIM*jnrB;
655             j_coord_offsetC  = DIM*jnrC;
656             j_coord_offsetD  = DIM*jnrD;
657
658             /* load j atom coordinates */
659             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
660                                                  x+j_coord_offsetC,x+j_coord_offsetD,
661                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
662
663             /* Calculate displacement vector */
664             dx00             = _mm256_sub_pd(ix0,jx0);
665             dy00             = _mm256_sub_pd(iy0,jy0);
666             dz00             = _mm256_sub_pd(iz0,jz0);
667             dx01             = _mm256_sub_pd(ix0,jx1);
668             dy01             = _mm256_sub_pd(iy0,jy1);
669             dz01             = _mm256_sub_pd(iz0,jz1);
670             dx02             = _mm256_sub_pd(ix0,jx2);
671             dy02             = _mm256_sub_pd(iy0,jy2);
672             dz02             = _mm256_sub_pd(iz0,jz2);
673             dx10             = _mm256_sub_pd(ix1,jx0);
674             dy10             = _mm256_sub_pd(iy1,jy0);
675             dz10             = _mm256_sub_pd(iz1,jz0);
676             dx11             = _mm256_sub_pd(ix1,jx1);
677             dy11             = _mm256_sub_pd(iy1,jy1);
678             dz11             = _mm256_sub_pd(iz1,jz1);
679             dx12             = _mm256_sub_pd(ix1,jx2);
680             dy12             = _mm256_sub_pd(iy1,jy2);
681             dz12             = _mm256_sub_pd(iz1,jz2);
682             dx20             = _mm256_sub_pd(ix2,jx0);
683             dy20             = _mm256_sub_pd(iy2,jy0);
684             dz20             = _mm256_sub_pd(iz2,jz0);
685             dx21             = _mm256_sub_pd(ix2,jx1);
686             dy21             = _mm256_sub_pd(iy2,jy1);
687             dz21             = _mm256_sub_pd(iz2,jz1);
688             dx22             = _mm256_sub_pd(ix2,jx2);
689             dy22             = _mm256_sub_pd(iy2,jy2);
690             dz22             = _mm256_sub_pd(iz2,jz2);
691
692             /* Calculate squared distance and things based on it */
693             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
694             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
695             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
696             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
697             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
698             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
699             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
700             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
701             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
702
703             rinv00           = avx256_invsqrt_d(rsq00);
704             rinv01           = avx256_invsqrt_d(rsq01);
705             rinv02           = avx256_invsqrt_d(rsq02);
706             rinv10           = avx256_invsqrt_d(rsq10);
707             rinv11           = avx256_invsqrt_d(rsq11);
708             rinv12           = avx256_invsqrt_d(rsq12);
709             rinv20           = avx256_invsqrt_d(rsq20);
710             rinv21           = avx256_invsqrt_d(rsq21);
711             rinv22           = avx256_invsqrt_d(rsq22);
712
713             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
714             rinvsq01         = _mm256_mul_pd(rinv01,rinv01);
715             rinvsq02         = _mm256_mul_pd(rinv02,rinv02);
716             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
717             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
718             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
719             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
720             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
721             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
722
723             fjx0             = _mm256_setzero_pd();
724             fjy0             = _mm256_setzero_pd();
725             fjz0             = _mm256_setzero_pd();
726             fjx1             = _mm256_setzero_pd();
727             fjy1             = _mm256_setzero_pd();
728             fjz1             = _mm256_setzero_pd();
729             fjx2             = _mm256_setzero_pd();
730             fjy2             = _mm256_setzero_pd();
731             fjz2             = _mm256_setzero_pd();
732
733             /**************************
734              * CALCULATE INTERACTIONS *
735              **************************/
736
737             if (gmx_mm256_any_lt(rsq00,rcutoff2))
738             {
739
740             /* REACTION-FIELD ELECTROSTATICS */
741             velec            = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_add_pd(rinv00,_mm256_mul_pd(krf,rsq00)),crf));
742             felec            = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_mul_pd(rinv00,rinvsq00),krf2));
743
744             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
745
746             /* Update potential sum for this i atom from the interaction with this j atom. */
747             velec            = _mm256_and_pd(velec,cutoff_mask);
748             velec            = _mm256_andnot_pd(dummy_mask,velec);
749             velecsum         = _mm256_add_pd(velecsum,velec);
750
751             fscal            = felec;
752
753             fscal            = _mm256_and_pd(fscal,cutoff_mask);
754
755             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
756
757             /* Calculate temporary vectorial force */
758             tx               = _mm256_mul_pd(fscal,dx00);
759             ty               = _mm256_mul_pd(fscal,dy00);
760             tz               = _mm256_mul_pd(fscal,dz00);
761
762             /* Update vectorial force */
763             fix0             = _mm256_add_pd(fix0,tx);
764             fiy0             = _mm256_add_pd(fiy0,ty);
765             fiz0             = _mm256_add_pd(fiz0,tz);
766
767             fjx0             = _mm256_add_pd(fjx0,tx);
768             fjy0             = _mm256_add_pd(fjy0,ty);
769             fjz0             = _mm256_add_pd(fjz0,tz);
770
771             }
772
773             /**************************
774              * CALCULATE INTERACTIONS *
775              **************************/
776
777             if (gmx_mm256_any_lt(rsq01,rcutoff2))
778             {
779
780             /* REACTION-FIELD ELECTROSTATICS */
781             velec            = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_add_pd(rinv01,_mm256_mul_pd(krf,rsq01)),crf));
782             felec            = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_mul_pd(rinv01,rinvsq01),krf2));
783
784             cutoff_mask      = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
785
786             /* Update potential sum for this i atom from the interaction with this j atom. */
787             velec            = _mm256_and_pd(velec,cutoff_mask);
788             velec            = _mm256_andnot_pd(dummy_mask,velec);
789             velecsum         = _mm256_add_pd(velecsum,velec);
790
791             fscal            = felec;
792
793             fscal            = _mm256_and_pd(fscal,cutoff_mask);
794
795             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
796
797             /* Calculate temporary vectorial force */
798             tx               = _mm256_mul_pd(fscal,dx01);
799             ty               = _mm256_mul_pd(fscal,dy01);
800             tz               = _mm256_mul_pd(fscal,dz01);
801
802             /* Update vectorial force */
803             fix0             = _mm256_add_pd(fix0,tx);
804             fiy0             = _mm256_add_pd(fiy0,ty);
805             fiz0             = _mm256_add_pd(fiz0,tz);
806
807             fjx1             = _mm256_add_pd(fjx1,tx);
808             fjy1             = _mm256_add_pd(fjy1,ty);
809             fjz1             = _mm256_add_pd(fjz1,tz);
810
811             }
812
813             /**************************
814              * CALCULATE INTERACTIONS *
815              **************************/
816
817             if (gmx_mm256_any_lt(rsq02,rcutoff2))
818             {
819
820             /* REACTION-FIELD ELECTROSTATICS */
821             velec            = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_add_pd(rinv02,_mm256_mul_pd(krf,rsq02)),crf));
822             felec            = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_mul_pd(rinv02,rinvsq02),krf2));
823
824             cutoff_mask      = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
825
826             /* Update potential sum for this i atom from the interaction with this j atom. */
827             velec            = _mm256_and_pd(velec,cutoff_mask);
828             velec            = _mm256_andnot_pd(dummy_mask,velec);
829             velecsum         = _mm256_add_pd(velecsum,velec);
830
831             fscal            = felec;
832
833             fscal            = _mm256_and_pd(fscal,cutoff_mask);
834
835             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
836
837             /* Calculate temporary vectorial force */
838             tx               = _mm256_mul_pd(fscal,dx02);
839             ty               = _mm256_mul_pd(fscal,dy02);
840             tz               = _mm256_mul_pd(fscal,dz02);
841
842             /* Update vectorial force */
843             fix0             = _mm256_add_pd(fix0,tx);
844             fiy0             = _mm256_add_pd(fiy0,ty);
845             fiz0             = _mm256_add_pd(fiz0,tz);
846
847             fjx2             = _mm256_add_pd(fjx2,tx);
848             fjy2             = _mm256_add_pd(fjy2,ty);
849             fjz2             = _mm256_add_pd(fjz2,tz);
850
851             }
852
853             /**************************
854              * CALCULATE INTERACTIONS *
855              **************************/
856
857             if (gmx_mm256_any_lt(rsq10,rcutoff2))
858             {
859
860             /* REACTION-FIELD ELECTROSTATICS */
861             velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_add_pd(rinv10,_mm256_mul_pd(krf,rsq10)),crf));
862             felec            = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
863
864             cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
865
866             /* Update potential sum for this i atom from the interaction with this j atom. */
867             velec            = _mm256_and_pd(velec,cutoff_mask);
868             velec            = _mm256_andnot_pd(dummy_mask,velec);
869             velecsum         = _mm256_add_pd(velecsum,velec);
870
871             fscal            = felec;
872
873             fscal            = _mm256_and_pd(fscal,cutoff_mask);
874
875             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
876
877             /* Calculate temporary vectorial force */
878             tx               = _mm256_mul_pd(fscal,dx10);
879             ty               = _mm256_mul_pd(fscal,dy10);
880             tz               = _mm256_mul_pd(fscal,dz10);
881
882             /* Update vectorial force */
883             fix1             = _mm256_add_pd(fix1,tx);
884             fiy1             = _mm256_add_pd(fiy1,ty);
885             fiz1             = _mm256_add_pd(fiz1,tz);
886
887             fjx0             = _mm256_add_pd(fjx0,tx);
888             fjy0             = _mm256_add_pd(fjy0,ty);
889             fjz0             = _mm256_add_pd(fjz0,tz);
890
891             }
892
893             /**************************
894              * CALCULATE INTERACTIONS *
895              **************************/
896
897             if (gmx_mm256_any_lt(rsq11,rcutoff2))
898             {
899
900             /* REACTION-FIELD ELECTROSTATICS */
901             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
902             felec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
903
904             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
905
906             /* Update potential sum for this i atom from the interaction with this j atom. */
907             velec            = _mm256_and_pd(velec,cutoff_mask);
908             velec            = _mm256_andnot_pd(dummy_mask,velec);
909             velecsum         = _mm256_add_pd(velecsum,velec);
910
911             fscal            = felec;
912
913             fscal            = _mm256_and_pd(fscal,cutoff_mask);
914
915             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
916
917             /* Calculate temporary vectorial force */
918             tx               = _mm256_mul_pd(fscal,dx11);
919             ty               = _mm256_mul_pd(fscal,dy11);
920             tz               = _mm256_mul_pd(fscal,dz11);
921
922             /* Update vectorial force */
923             fix1             = _mm256_add_pd(fix1,tx);
924             fiy1             = _mm256_add_pd(fiy1,ty);
925             fiz1             = _mm256_add_pd(fiz1,tz);
926
927             fjx1             = _mm256_add_pd(fjx1,tx);
928             fjy1             = _mm256_add_pd(fjy1,ty);
929             fjz1             = _mm256_add_pd(fjz1,tz);
930
931             }
932
933             /**************************
934              * CALCULATE INTERACTIONS *
935              **************************/
936
937             if (gmx_mm256_any_lt(rsq12,rcutoff2))
938             {
939
940             /* REACTION-FIELD ELECTROSTATICS */
941             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
942             felec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
943
944             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
945
946             /* Update potential sum for this i atom from the interaction with this j atom. */
947             velec            = _mm256_and_pd(velec,cutoff_mask);
948             velec            = _mm256_andnot_pd(dummy_mask,velec);
949             velecsum         = _mm256_add_pd(velecsum,velec);
950
951             fscal            = felec;
952
953             fscal            = _mm256_and_pd(fscal,cutoff_mask);
954
955             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
956
957             /* Calculate temporary vectorial force */
958             tx               = _mm256_mul_pd(fscal,dx12);
959             ty               = _mm256_mul_pd(fscal,dy12);
960             tz               = _mm256_mul_pd(fscal,dz12);
961
962             /* Update vectorial force */
963             fix1             = _mm256_add_pd(fix1,tx);
964             fiy1             = _mm256_add_pd(fiy1,ty);
965             fiz1             = _mm256_add_pd(fiz1,tz);
966
967             fjx2             = _mm256_add_pd(fjx2,tx);
968             fjy2             = _mm256_add_pd(fjy2,ty);
969             fjz2             = _mm256_add_pd(fjz2,tz);
970
971             }
972
973             /**************************
974              * CALCULATE INTERACTIONS *
975              **************************/
976
977             if (gmx_mm256_any_lt(rsq20,rcutoff2))
978             {
979
980             /* REACTION-FIELD ELECTROSTATICS */
981             velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_add_pd(rinv20,_mm256_mul_pd(krf,rsq20)),crf));
982             felec            = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
983
984             cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
985
986             /* Update potential sum for this i atom from the interaction with this j atom. */
987             velec            = _mm256_and_pd(velec,cutoff_mask);
988             velec            = _mm256_andnot_pd(dummy_mask,velec);
989             velecsum         = _mm256_add_pd(velecsum,velec);
990
991             fscal            = felec;
992
993             fscal            = _mm256_and_pd(fscal,cutoff_mask);
994
995             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
996
997             /* Calculate temporary vectorial force */
998             tx               = _mm256_mul_pd(fscal,dx20);
999             ty               = _mm256_mul_pd(fscal,dy20);
1000             tz               = _mm256_mul_pd(fscal,dz20);
1001
1002             /* Update vectorial force */
1003             fix2             = _mm256_add_pd(fix2,tx);
1004             fiy2             = _mm256_add_pd(fiy2,ty);
1005             fiz2             = _mm256_add_pd(fiz2,tz);
1006
1007             fjx0             = _mm256_add_pd(fjx0,tx);
1008             fjy0             = _mm256_add_pd(fjy0,ty);
1009             fjz0             = _mm256_add_pd(fjz0,tz);
1010
1011             }
1012
1013             /**************************
1014              * CALCULATE INTERACTIONS *
1015              **************************/
1016
1017             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1018             {
1019
1020             /* REACTION-FIELD ELECTROSTATICS */
1021             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
1022             felec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
1023
1024             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1025
1026             /* Update potential sum for this i atom from the interaction with this j atom. */
1027             velec            = _mm256_and_pd(velec,cutoff_mask);
1028             velec            = _mm256_andnot_pd(dummy_mask,velec);
1029             velecsum         = _mm256_add_pd(velecsum,velec);
1030
1031             fscal            = felec;
1032
1033             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1034
1035             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1036
1037             /* Calculate temporary vectorial force */
1038             tx               = _mm256_mul_pd(fscal,dx21);
1039             ty               = _mm256_mul_pd(fscal,dy21);
1040             tz               = _mm256_mul_pd(fscal,dz21);
1041
1042             /* Update vectorial force */
1043             fix2             = _mm256_add_pd(fix2,tx);
1044             fiy2             = _mm256_add_pd(fiy2,ty);
1045             fiz2             = _mm256_add_pd(fiz2,tz);
1046
1047             fjx1             = _mm256_add_pd(fjx1,tx);
1048             fjy1             = _mm256_add_pd(fjy1,ty);
1049             fjz1             = _mm256_add_pd(fjz1,tz);
1050
1051             }
1052
1053             /**************************
1054              * CALCULATE INTERACTIONS *
1055              **************************/
1056
1057             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1058             {
1059
1060             /* REACTION-FIELD ELECTROSTATICS */
1061             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
1062             felec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1063
1064             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1065
1066             /* Update potential sum for this i atom from the interaction with this j atom. */
1067             velec            = _mm256_and_pd(velec,cutoff_mask);
1068             velec            = _mm256_andnot_pd(dummy_mask,velec);
1069             velecsum         = _mm256_add_pd(velecsum,velec);
1070
1071             fscal            = felec;
1072
1073             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1074
1075             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1076
1077             /* Calculate temporary vectorial force */
1078             tx               = _mm256_mul_pd(fscal,dx22);
1079             ty               = _mm256_mul_pd(fscal,dy22);
1080             tz               = _mm256_mul_pd(fscal,dz22);
1081
1082             /* Update vectorial force */
1083             fix2             = _mm256_add_pd(fix2,tx);
1084             fiy2             = _mm256_add_pd(fiy2,ty);
1085             fiz2             = _mm256_add_pd(fiz2,tz);
1086
1087             fjx2             = _mm256_add_pd(fjx2,tx);
1088             fjy2             = _mm256_add_pd(fjy2,ty);
1089             fjz2             = _mm256_add_pd(fjz2,tz);
1090
1091             }
1092
1093             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1094             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1095             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1096             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1097
1098             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1099                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1100
1101             /* Inner loop uses 324 flops */
1102         }
1103
1104         /* End of innermost loop */
1105
1106         gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1107                                                  f+i_coord_offset,fshift+i_shift_offset);
1108
1109         ggid                        = gid[iidx];
1110         /* Update potential energies */
1111         gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1112
1113         /* Increment number of inner iterations */
1114         inneriter                  += j_index_end - j_index_start;
1115
1116         /* Outer loop uses 19 flops */
1117     }
1118
1119     /* Increment number of outer iterations */
1120     outeriter        += nri;
1121
1122     /* Update outer/inner flops */
1123
1124     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*324);
1125 }
1126 /*
1127  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_avx_256_double
1128  * Electrostatics interaction: ReactionField
1129  * VdW interaction:            None
1130  * Geometry:                   Water3-Water3
1131  * Calculate force/pot:        Force
1132  */
1133 void
1134 nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_avx_256_double
1135                     (t_nblist                    * gmx_restrict       nlist,
1136                      rvec                        * gmx_restrict          xx,
1137                      rvec                        * gmx_restrict          ff,
1138                      struct t_forcerec           * gmx_restrict          fr,
1139                      t_mdatoms                   * gmx_restrict     mdatoms,
1140                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1141                      t_nrnb                      * gmx_restrict        nrnb)
1142 {
1143     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1144      * just 0 for non-waters.
1145      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1146      * jnr indices corresponding to data put in the four positions in the SIMD register.
1147      */
1148     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1149     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1150     int              jnrA,jnrB,jnrC,jnrD;
1151     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1152     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1153     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1154     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1155     real             rcutoff_scalar;
1156     real             *shiftvec,*fshift,*x,*f;
1157     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1158     real             scratch[4*DIM];
1159     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1160     real *           vdwioffsetptr0;
1161     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1162     real *           vdwioffsetptr1;
1163     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1164     real *           vdwioffsetptr2;
1165     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1166     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1167     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1168     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1169     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1170     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1171     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1172     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1173     __m256d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1174     __m256d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1175     __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1176     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1177     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1178     __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1179     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1180     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1181     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
1182     real             *charge;
1183     __m256d          dummy_mask,cutoff_mask;
1184     __m128           tmpmask0,tmpmask1;
1185     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1186     __m256d          one     = _mm256_set1_pd(1.0);
1187     __m256d          two     = _mm256_set1_pd(2.0);
1188     x                = xx[0];
1189     f                = ff[0];
1190
1191     nri              = nlist->nri;
1192     iinr             = nlist->iinr;
1193     jindex           = nlist->jindex;
1194     jjnr             = nlist->jjnr;
1195     shiftidx         = nlist->shift;
1196     gid              = nlist->gid;
1197     shiftvec         = fr->shift_vec[0];
1198     fshift           = fr->fshift[0];
1199     facel            = _mm256_set1_pd(fr->ic->epsfac);
1200     charge           = mdatoms->chargeA;
1201     krf              = _mm256_set1_pd(fr->ic->k_rf);
1202     krf2             = _mm256_set1_pd(fr->ic->k_rf*2.0);
1203     crf              = _mm256_set1_pd(fr->ic->c_rf);
1204
1205     /* Setup water-specific parameters */
1206     inr              = nlist->iinr[0];
1207     iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1208     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1209     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1210
1211     jq0              = _mm256_set1_pd(charge[inr+0]);
1212     jq1              = _mm256_set1_pd(charge[inr+1]);
1213     jq2              = _mm256_set1_pd(charge[inr+2]);
1214     qq00             = _mm256_mul_pd(iq0,jq0);
1215     qq01             = _mm256_mul_pd(iq0,jq1);
1216     qq02             = _mm256_mul_pd(iq0,jq2);
1217     qq10             = _mm256_mul_pd(iq1,jq0);
1218     qq11             = _mm256_mul_pd(iq1,jq1);
1219     qq12             = _mm256_mul_pd(iq1,jq2);
1220     qq20             = _mm256_mul_pd(iq2,jq0);
1221     qq21             = _mm256_mul_pd(iq2,jq1);
1222     qq22             = _mm256_mul_pd(iq2,jq2);
1223
1224     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1225     rcutoff_scalar   = fr->ic->rcoulomb;
1226     rcutoff          = _mm256_set1_pd(rcutoff_scalar);
1227     rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
1228
1229     /* Avoid stupid compiler warnings */
1230     jnrA = jnrB = jnrC = jnrD = 0;
1231     j_coord_offsetA = 0;
1232     j_coord_offsetB = 0;
1233     j_coord_offsetC = 0;
1234     j_coord_offsetD = 0;
1235
1236     outeriter        = 0;
1237     inneriter        = 0;
1238
1239     for(iidx=0;iidx<4*DIM;iidx++)
1240     {
1241         scratch[iidx] = 0.0;
1242     }
1243
1244     /* Start outer loop over neighborlists */
1245     for(iidx=0; iidx<nri; iidx++)
1246     {
1247         /* Load shift vector for this list */
1248         i_shift_offset   = DIM*shiftidx[iidx];
1249
1250         /* Load limits for loop over neighbors */
1251         j_index_start    = jindex[iidx];
1252         j_index_end      = jindex[iidx+1];
1253
1254         /* Get outer coordinate index */
1255         inr              = iinr[iidx];
1256         i_coord_offset   = DIM*inr;
1257
1258         /* Load i particle coords and add shift vector */
1259         gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1260                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1261
1262         fix0             = _mm256_setzero_pd();
1263         fiy0             = _mm256_setzero_pd();
1264         fiz0             = _mm256_setzero_pd();
1265         fix1             = _mm256_setzero_pd();
1266         fiy1             = _mm256_setzero_pd();
1267         fiz1             = _mm256_setzero_pd();
1268         fix2             = _mm256_setzero_pd();
1269         fiy2             = _mm256_setzero_pd();
1270         fiz2             = _mm256_setzero_pd();
1271
1272         /* Start inner kernel loop */
1273         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1274         {
1275
1276             /* Get j neighbor index, and coordinate index */
1277             jnrA             = jjnr[jidx];
1278             jnrB             = jjnr[jidx+1];
1279             jnrC             = jjnr[jidx+2];
1280             jnrD             = jjnr[jidx+3];
1281             j_coord_offsetA  = DIM*jnrA;
1282             j_coord_offsetB  = DIM*jnrB;
1283             j_coord_offsetC  = DIM*jnrC;
1284             j_coord_offsetD  = DIM*jnrD;
1285
1286             /* load j atom coordinates */
1287             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1288                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1289                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1290
1291             /* Calculate displacement vector */
1292             dx00             = _mm256_sub_pd(ix0,jx0);
1293             dy00             = _mm256_sub_pd(iy0,jy0);
1294             dz00             = _mm256_sub_pd(iz0,jz0);
1295             dx01             = _mm256_sub_pd(ix0,jx1);
1296             dy01             = _mm256_sub_pd(iy0,jy1);
1297             dz01             = _mm256_sub_pd(iz0,jz1);
1298             dx02             = _mm256_sub_pd(ix0,jx2);
1299             dy02             = _mm256_sub_pd(iy0,jy2);
1300             dz02             = _mm256_sub_pd(iz0,jz2);
1301             dx10             = _mm256_sub_pd(ix1,jx0);
1302             dy10             = _mm256_sub_pd(iy1,jy0);
1303             dz10             = _mm256_sub_pd(iz1,jz0);
1304             dx11             = _mm256_sub_pd(ix1,jx1);
1305             dy11             = _mm256_sub_pd(iy1,jy1);
1306             dz11             = _mm256_sub_pd(iz1,jz1);
1307             dx12             = _mm256_sub_pd(ix1,jx2);
1308             dy12             = _mm256_sub_pd(iy1,jy2);
1309             dz12             = _mm256_sub_pd(iz1,jz2);
1310             dx20             = _mm256_sub_pd(ix2,jx0);
1311             dy20             = _mm256_sub_pd(iy2,jy0);
1312             dz20             = _mm256_sub_pd(iz2,jz0);
1313             dx21             = _mm256_sub_pd(ix2,jx1);
1314             dy21             = _mm256_sub_pd(iy2,jy1);
1315             dz21             = _mm256_sub_pd(iz2,jz1);
1316             dx22             = _mm256_sub_pd(ix2,jx2);
1317             dy22             = _mm256_sub_pd(iy2,jy2);
1318             dz22             = _mm256_sub_pd(iz2,jz2);
1319
1320             /* Calculate squared distance and things based on it */
1321             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1322             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1323             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1324             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1325             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1326             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1327             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1328             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1329             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1330
1331             rinv00           = avx256_invsqrt_d(rsq00);
1332             rinv01           = avx256_invsqrt_d(rsq01);
1333             rinv02           = avx256_invsqrt_d(rsq02);
1334             rinv10           = avx256_invsqrt_d(rsq10);
1335             rinv11           = avx256_invsqrt_d(rsq11);
1336             rinv12           = avx256_invsqrt_d(rsq12);
1337             rinv20           = avx256_invsqrt_d(rsq20);
1338             rinv21           = avx256_invsqrt_d(rsq21);
1339             rinv22           = avx256_invsqrt_d(rsq22);
1340
1341             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
1342             rinvsq01         = _mm256_mul_pd(rinv01,rinv01);
1343             rinvsq02         = _mm256_mul_pd(rinv02,rinv02);
1344             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
1345             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
1346             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
1347             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
1348             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
1349             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
1350
1351             fjx0             = _mm256_setzero_pd();
1352             fjy0             = _mm256_setzero_pd();
1353             fjz0             = _mm256_setzero_pd();
1354             fjx1             = _mm256_setzero_pd();
1355             fjy1             = _mm256_setzero_pd();
1356             fjz1             = _mm256_setzero_pd();
1357             fjx2             = _mm256_setzero_pd();
1358             fjy2             = _mm256_setzero_pd();
1359             fjz2             = _mm256_setzero_pd();
1360
1361             /**************************
1362              * CALCULATE INTERACTIONS *
1363              **************************/
1364
1365             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1366             {
1367
1368             /* REACTION-FIELD ELECTROSTATICS */
1369             felec            = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_mul_pd(rinv00,rinvsq00),krf2));
1370
1371             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1372
1373             fscal            = felec;
1374
1375             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1376
1377             /* Calculate temporary vectorial force */
1378             tx               = _mm256_mul_pd(fscal,dx00);
1379             ty               = _mm256_mul_pd(fscal,dy00);
1380             tz               = _mm256_mul_pd(fscal,dz00);
1381
1382             /* Update vectorial force */
1383             fix0             = _mm256_add_pd(fix0,tx);
1384             fiy0             = _mm256_add_pd(fiy0,ty);
1385             fiz0             = _mm256_add_pd(fiz0,tz);
1386
1387             fjx0             = _mm256_add_pd(fjx0,tx);
1388             fjy0             = _mm256_add_pd(fjy0,ty);
1389             fjz0             = _mm256_add_pd(fjz0,tz);
1390
1391             }
1392
1393             /**************************
1394              * CALCULATE INTERACTIONS *
1395              **************************/
1396
1397             if (gmx_mm256_any_lt(rsq01,rcutoff2))
1398             {
1399
1400             /* REACTION-FIELD ELECTROSTATICS */
1401             felec            = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_mul_pd(rinv01,rinvsq01),krf2));
1402
1403             cutoff_mask      = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
1404
1405             fscal            = felec;
1406
1407             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1408
1409             /* Calculate temporary vectorial force */
1410             tx               = _mm256_mul_pd(fscal,dx01);
1411             ty               = _mm256_mul_pd(fscal,dy01);
1412             tz               = _mm256_mul_pd(fscal,dz01);
1413
1414             /* Update vectorial force */
1415             fix0             = _mm256_add_pd(fix0,tx);
1416             fiy0             = _mm256_add_pd(fiy0,ty);
1417             fiz0             = _mm256_add_pd(fiz0,tz);
1418
1419             fjx1             = _mm256_add_pd(fjx1,tx);
1420             fjy1             = _mm256_add_pd(fjy1,ty);
1421             fjz1             = _mm256_add_pd(fjz1,tz);
1422
1423             }
1424
1425             /**************************
1426              * CALCULATE INTERACTIONS *
1427              **************************/
1428
1429             if (gmx_mm256_any_lt(rsq02,rcutoff2))
1430             {
1431
1432             /* REACTION-FIELD ELECTROSTATICS */
1433             felec            = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_mul_pd(rinv02,rinvsq02),krf2));
1434
1435             cutoff_mask      = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
1436
1437             fscal            = felec;
1438
1439             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1440
1441             /* Calculate temporary vectorial force */
1442             tx               = _mm256_mul_pd(fscal,dx02);
1443             ty               = _mm256_mul_pd(fscal,dy02);
1444             tz               = _mm256_mul_pd(fscal,dz02);
1445
1446             /* Update vectorial force */
1447             fix0             = _mm256_add_pd(fix0,tx);
1448             fiy0             = _mm256_add_pd(fiy0,ty);
1449             fiz0             = _mm256_add_pd(fiz0,tz);
1450
1451             fjx2             = _mm256_add_pd(fjx2,tx);
1452             fjy2             = _mm256_add_pd(fjy2,ty);
1453             fjz2             = _mm256_add_pd(fjz2,tz);
1454
1455             }
1456
1457             /**************************
1458              * CALCULATE INTERACTIONS *
1459              **************************/
1460
1461             if (gmx_mm256_any_lt(rsq10,rcutoff2))
1462             {
1463
1464             /* REACTION-FIELD ELECTROSTATICS */
1465             felec            = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
1466
1467             cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1468
1469             fscal            = felec;
1470
1471             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1472
1473             /* Calculate temporary vectorial force */
1474             tx               = _mm256_mul_pd(fscal,dx10);
1475             ty               = _mm256_mul_pd(fscal,dy10);
1476             tz               = _mm256_mul_pd(fscal,dz10);
1477
1478             /* Update vectorial force */
1479             fix1             = _mm256_add_pd(fix1,tx);
1480             fiy1             = _mm256_add_pd(fiy1,ty);
1481             fiz1             = _mm256_add_pd(fiz1,tz);
1482
1483             fjx0             = _mm256_add_pd(fjx0,tx);
1484             fjy0             = _mm256_add_pd(fjy0,ty);
1485             fjz0             = _mm256_add_pd(fjz0,tz);
1486
1487             }
1488
1489             /**************************
1490              * CALCULATE INTERACTIONS *
1491              **************************/
1492
1493             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1494             {
1495
1496             /* REACTION-FIELD ELECTROSTATICS */
1497             felec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
1498
1499             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1500
1501             fscal            = felec;
1502
1503             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1504
1505             /* Calculate temporary vectorial force */
1506             tx               = _mm256_mul_pd(fscal,dx11);
1507             ty               = _mm256_mul_pd(fscal,dy11);
1508             tz               = _mm256_mul_pd(fscal,dz11);
1509
1510             /* Update vectorial force */
1511             fix1             = _mm256_add_pd(fix1,tx);
1512             fiy1             = _mm256_add_pd(fiy1,ty);
1513             fiz1             = _mm256_add_pd(fiz1,tz);
1514
1515             fjx1             = _mm256_add_pd(fjx1,tx);
1516             fjy1             = _mm256_add_pd(fjy1,ty);
1517             fjz1             = _mm256_add_pd(fjz1,tz);
1518
1519             }
1520
1521             /**************************
1522              * CALCULATE INTERACTIONS *
1523              **************************/
1524
1525             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1526             {
1527
1528             /* REACTION-FIELD ELECTROSTATICS */
1529             felec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
1530
1531             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1532
1533             fscal            = felec;
1534
1535             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1536
1537             /* Calculate temporary vectorial force */
1538             tx               = _mm256_mul_pd(fscal,dx12);
1539             ty               = _mm256_mul_pd(fscal,dy12);
1540             tz               = _mm256_mul_pd(fscal,dz12);
1541
1542             /* Update vectorial force */
1543             fix1             = _mm256_add_pd(fix1,tx);
1544             fiy1             = _mm256_add_pd(fiy1,ty);
1545             fiz1             = _mm256_add_pd(fiz1,tz);
1546
1547             fjx2             = _mm256_add_pd(fjx2,tx);
1548             fjy2             = _mm256_add_pd(fjy2,ty);
1549             fjz2             = _mm256_add_pd(fjz2,tz);
1550
1551             }
1552
1553             /**************************
1554              * CALCULATE INTERACTIONS *
1555              **************************/
1556
1557             if (gmx_mm256_any_lt(rsq20,rcutoff2))
1558             {
1559
1560             /* REACTION-FIELD ELECTROSTATICS */
1561             felec            = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
1562
1563             cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
1564
1565             fscal            = felec;
1566
1567             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1568
1569             /* Calculate temporary vectorial force */
1570             tx               = _mm256_mul_pd(fscal,dx20);
1571             ty               = _mm256_mul_pd(fscal,dy20);
1572             tz               = _mm256_mul_pd(fscal,dz20);
1573
1574             /* Update vectorial force */
1575             fix2             = _mm256_add_pd(fix2,tx);
1576             fiy2             = _mm256_add_pd(fiy2,ty);
1577             fiz2             = _mm256_add_pd(fiz2,tz);
1578
1579             fjx0             = _mm256_add_pd(fjx0,tx);
1580             fjy0             = _mm256_add_pd(fjy0,ty);
1581             fjz0             = _mm256_add_pd(fjz0,tz);
1582
1583             }
1584
1585             /**************************
1586              * CALCULATE INTERACTIONS *
1587              **************************/
1588
1589             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1590             {
1591
1592             /* REACTION-FIELD ELECTROSTATICS */
1593             felec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
1594
1595             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1596
1597             fscal            = felec;
1598
1599             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1600
1601             /* Calculate temporary vectorial force */
1602             tx               = _mm256_mul_pd(fscal,dx21);
1603             ty               = _mm256_mul_pd(fscal,dy21);
1604             tz               = _mm256_mul_pd(fscal,dz21);
1605
1606             /* Update vectorial force */
1607             fix2             = _mm256_add_pd(fix2,tx);
1608             fiy2             = _mm256_add_pd(fiy2,ty);
1609             fiz2             = _mm256_add_pd(fiz2,tz);
1610
1611             fjx1             = _mm256_add_pd(fjx1,tx);
1612             fjy1             = _mm256_add_pd(fjy1,ty);
1613             fjz1             = _mm256_add_pd(fjz1,tz);
1614
1615             }
1616
1617             /**************************
1618              * CALCULATE INTERACTIONS *
1619              **************************/
1620
1621             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1622             {
1623
1624             /* REACTION-FIELD ELECTROSTATICS */
1625             felec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1626
1627             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1628
1629             fscal            = felec;
1630
1631             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1632
1633             /* Calculate temporary vectorial force */
1634             tx               = _mm256_mul_pd(fscal,dx22);
1635             ty               = _mm256_mul_pd(fscal,dy22);
1636             tz               = _mm256_mul_pd(fscal,dz22);
1637
1638             /* Update vectorial force */
1639             fix2             = _mm256_add_pd(fix2,tx);
1640             fiy2             = _mm256_add_pd(fiy2,ty);
1641             fiz2             = _mm256_add_pd(fiz2,tz);
1642
1643             fjx2             = _mm256_add_pd(fjx2,tx);
1644             fjy2             = _mm256_add_pd(fjy2,ty);
1645             fjz2             = _mm256_add_pd(fjz2,tz);
1646
1647             }
1648
1649             fjptrA             = f+j_coord_offsetA;
1650             fjptrB             = f+j_coord_offsetB;
1651             fjptrC             = f+j_coord_offsetC;
1652             fjptrD             = f+j_coord_offsetD;
1653
1654             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1655                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1656
1657             /* Inner loop uses 270 flops */
1658         }
1659
1660         if(jidx<j_index_end)
1661         {
1662
1663             /* Get j neighbor index, and coordinate index */
1664             jnrlistA         = jjnr[jidx];
1665             jnrlistB         = jjnr[jidx+1];
1666             jnrlistC         = jjnr[jidx+2];
1667             jnrlistD         = jjnr[jidx+3];
1668             /* Sign of each element will be negative for non-real atoms.
1669              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1670              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1671              */
1672             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1673
1674             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1675             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1676             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1677
1678             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1679             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1680             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1681             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1682             j_coord_offsetA  = DIM*jnrA;
1683             j_coord_offsetB  = DIM*jnrB;
1684             j_coord_offsetC  = DIM*jnrC;
1685             j_coord_offsetD  = DIM*jnrD;
1686
1687             /* load j atom coordinates */
1688             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1689                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1690                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1691
1692             /* Calculate displacement vector */
1693             dx00             = _mm256_sub_pd(ix0,jx0);
1694             dy00             = _mm256_sub_pd(iy0,jy0);
1695             dz00             = _mm256_sub_pd(iz0,jz0);
1696             dx01             = _mm256_sub_pd(ix0,jx1);
1697             dy01             = _mm256_sub_pd(iy0,jy1);
1698             dz01             = _mm256_sub_pd(iz0,jz1);
1699             dx02             = _mm256_sub_pd(ix0,jx2);
1700             dy02             = _mm256_sub_pd(iy0,jy2);
1701             dz02             = _mm256_sub_pd(iz0,jz2);
1702             dx10             = _mm256_sub_pd(ix1,jx0);
1703             dy10             = _mm256_sub_pd(iy1,jy0);
1704             dz10             = _mm256_sub_pd(iz1,jz0);
1705             dx11             = _mm256_sub_pd(ix1,jx1);
1706             dy11             = _mm256_sub_pd(iy1,jy1);
1707             dz11             = _mm256_sub_pd(iz1,jz1);
1708             dx12             = _mm256_sub_pd(ix1,jx2);
1709             dy12             = _mm256_sub_pd(iy1,jy2);
1710             dz12             = _mm256_sub_pd(iz1,jz2);
1711             dx20             = _mm256_sub_pd(ix2,jx0);
1712             dy20             = _mm256_sub_pd(iy2,jy0);
1713             dz20             = _mm256_sub_pd(iz2,jz0);
1714             dx21             = _mm256_sub_pd(ix2,jx1);
1715             dy21             = _mm256_sub_pd(iy2,jy1);
1716             dz21             = _mm256_sub_pd(iz2,jz1);
1717             dx22             = _mm256_sub_pd(ix2,jx2);
1718             dy22             = _mm256_sub_pd(iy2,jy2);
1719             dz22             = _mm256_sub_pd(iz2,jz2);
1720
1721             /* Calculate squared distance and things based on it */
1722             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1723             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1724             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1725             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1726             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1727             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1728             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1729             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1730             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1731
1732             rinv00           = avx256_invsqrt_d(rsq00);
1733             rinv01           = avx256_invsqrt_d(rsq01);
1734             rinv02           = avx256_invsqrt_d(rsq02);
1735             rinv10           = avx256_invsqrt_d(rsq10);
1736             rinv11           = avx256_invsqrt_d(rsq11);
1737             rinv12           = avx256_invsqrt_d(rsq12);
1738             rinv20           = avx256_invsqrt_d(rsq20);
1739             rinv21           = avx256_invsqrt_d(rsq21);
1740             rinv22           = avx256_invsqrt_d(rsq22);
1741
1742             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
1743             rinvsq01         = _mm256_mul_pd(rinv01,rinv01);
1744             rinvsq02         = _mm256_mul_pd(rinv02,rinv02);
1745             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
1746             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
1747             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
1748             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
1749             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
1750             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
1751
1752             fjx0             = _mm256_setzero_pd();
1753             fjy0             = _mm256_setzero_pd();
1754             fjz0             = _mm256_setzero_pd();
1755             fjx1             = _mm256_setzero_pd();
1756             fjy1             = _mm256_setzero_pd();
1757             fjz1             = _mm256_setzero_pd();
1758             fjx2             = _mm256_setzero_pd();
1759             fjy2             = _mm256_setzero_pd();
1760             fjz2             = _mm256_setzero_pd();
1761
1762             /**************************
1763              * CALCULATE INTERACTIONS *
1764              **************************/
1765
1766             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1767             {
1768
1769             /* REACTION-FIELD ELECTROSTATICS */
1770             felec            = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_mul_pd(rinv00,rinvsq00),krf2));
1771
1772             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1773
1774             fscal            = felec;
1775
1776             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1777
1778             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1779
1780             /* Calculate temporary vectorial force */
1781             tx               = _mm256_mul_pd(fscal,dx00);
1782             ty               = _mm256_mul_pd(fscal,dy00);
1783             tz               = _mm256_mul_pd(fscal,dz00);
1784
1785             /* Update vectorial force */
1786             fix0             = _mm256_add_pd(fix0,tx);
1787             fiy0             = _mm256_add_pd(fiy0,ty);
1788             fiz0             = _mm256_add_pd(fiz0,tz);
1789
1790             fjx0             = _mm256_add_pd(fjx0,tx);
1791             fjy0             = _mm256_add_pd(fjy0,ty);
1792             fjz0             = _mm256_add_pd(fjz0,tz);
1793
1794             }
1795
1796             /**************************
1797              * CALCULATE INTERACTIONS *
1798              **************************/
1799
1800             if (gmx_mm256_any_lt(rsq01,rcutoff2))
1801             {
1802
1803             /* REACTION-FIELD ELECTROSTATICS */
1804             felec            = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_mul_pd(rinv01,rinvsq01),krf2));
1805
1806             cutoff_mask      = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
1807
1808             fscal            = felec;
1809
1810             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1811
1812             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1813
1814             /* Calculate temporary vectorial force */
1815             tx               = _mm256_mul_pd(fscal,dx01);
1816             ty               = _mm256_mul_pd(fscal,dy01);
1817             tz               = _mm256_mul_pd(fscal,dz01);
1818
1819             /* Update vectorial force */
1820             fix0             = _mm256_add_pd(fix0,tx);
1821             fiy0             = _mm256_add_pd(fiy0,ty);
1822             fiz0             = _mm256_add_pd(fiz0,tz);
1823
1824             fjx1             = _mm256_add_pd(fjx1,tx);
1825             fjy1             = _mm256_add_pd(fjy1,ty);
1826             fjz1             = _mm256_add_pd(fjz1,tz);
1827
1828             }
1829
1830             /**************************
1831              * CALCULATE INTERACTIONS *
1832              **************************/
1833
1834             if (gmx_mm256_any_lt(rsq02,rcutoff2))
1835             {
1836
1837             /* REACTION-FIELD ELECTROSTATICS */
1838             felec            = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_mul_pd(rinv02,rinvsq02),krf2));
1839
1840             cutoff_mask      = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
1841
1842             fscal            = felec;
1843
1844             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1845
1846             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1847
1848             /* Calculate temporary vectorial force */
1849             tx               = _mm256_mul_pd(fscal,dx02);
1850             ty               = _mm256_mul_pd(fscal,dy02);
1851             tz               = _mm256_mul_pd(fscal,dz02);
1852
1853             /* Update vectorial force */
1854             fix0             = _mm256_add_pd(fix0,tx);
1855             fiy0             = _mm256_add_pd(fiy0,ty);
1856             fiz0             = _mm256_add_pd(fiz0,tz);
1857
1858             fjx2             = _mm256_add_pd(fjx2,tx);
1859             fjy2             = _mm256_add_pd(fjy2,ty);
1860             fjz2             = _mm256_add_pd(fjz2,tz);
1861
1862             }
1863
1864             /**************************
1865              * CALCULATE INTERACTIONS *
1866              **************************/
1867
1868             if (gmx_mm256_any_lt(rsq10,rcutoff2))
1869             {
1870
1871             /* REACTION-FIELD ELECTROSTATICS */
1872             felec            = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
1873
1874             cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1875
1876             fscal            = felec;
1877
1878             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1879
1880             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1881
1882             /* Calculate temporary vectorial force */
1883             tx               = _mm256_mul_pd(fscal,dx10);
1884             ty               = _mm256_mul_pd(fscal,dy10);
1885             tz               = _mm256_mul_pd(fscal,dz10);
1886
1887             /* Update vectorial force */
1888             fix1             = _mm256_add_pd(fix1,tx);
1889             fiy1             = _mm256_add_pd(fiy1,ty);
1890             fiz1             = _mm256_add_pd(fiz1,tz);
1891
1892             fjx0             = _mm256_add_pd(fjx0,tx);
1893             fjy0             = _mm256_add_pd(fjy0,ty);
1894             fjz0             = _mm256_add_pd(fjz0,tz);
1895
1896             }
1897
1898             /**************************
1899              * CALCULATE INTERACTIONS *
1900              **************************/
1901
1902             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1903             {
1904
1905             /* REACTION-FIELD ELECTROSTATICS */
1906             felec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
1907
1908             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1909
1910             fscal            = felec;
1911
1912             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1913
1914             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1915
1916             /* Calculate temporary vectorial force */
1917             tx               = _mm256_mul_pd(fscal,dx11);
1918             ty               = _mm256_mul_pd(fscal,dy11);
1919             tz               = _mm256_mul_pd(fscal,dz11);
1920
1921             /* Update vectorial force */
1922             fix1             = _mm256_add_pd(fix1,tx);
1923             fiy1             = _mm256_add_pd(fiy1,ty);
1924             fiz1             = _mm256_add_pd(fiz1,tz);
1925
1926             fjx1             = _mm256_add_pd(fjx1,tx);
1927             fjy1             = _mm256_add_pd(fjy1,ty);
1928             fjz1             = _mm256_add_pd(fjz1,tz);
1929
1930             }
1931
1932             /**************************
1933              * CALCULATE INTERACTIONS *
1934              **************************/
1935
1936             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1937             {
1938
1939             /* REACTION-FIELD ELECTROSTATICS */
1940             felec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
1941
1942             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1943
1944             fscal            = felec;
1945
1946             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1947
1948             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1949
1950             /* Calculate temporary vectorial force */
1951             tx               = _mm256_mul_pd(fscal,dx12);
1952             ty               = _mm256_mul_pd(fscal,dy12);
1953             tz               = _mm256_mul_pd(fscal,dz12);
1954
1955             /* Update vectorial force */
1956             fix1             = _mm256_add_pd(fix1,tx);
1957             fiy1             = _mm256_add_pd(fiy1,ty);
1958             fiz1             = _mm256_add_pd(fiz1,tz);
1959
1960             fjx2             = _mm256_add_pd(fjx2,tx);
1961             fjy2             = _mm256_add_pd(fjy2,ty);
1962             fjz2             = _mm256_add_pd(fjz2,tz);
1963
1964             }
1965
1966             /**************************
1967              * CALCULATE INTERACTIONS *
1968              **************************/
1969
1970             if (gmx_mm256_any_lt(rsq20,rcutoff2))
1971             {
1972
1973             /* REACTION-FIELD ELECTROSTATICS */
1974             felec            = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
1975
1976             cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
1977
1978             fscal            = felec;
1979
1980             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1981
1982             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1983
1984             /* Calculate temporary vectorial force */
1985             tx               = _mm256_mul_pd(fscal,dx20);
1986             ty               = _mm256_mul_pd(fscal,dy20);
1987             tz               = _mm256_mul_pd(fscal,dz20);
1988
1989             /* Update vectorial force */
1990             fix2             = _mm256_add_pd(fix2,tx);
1991             fiy2             = _mm256_add_pd(fiy2,ty);
1992             fiz2             = _mm256_add_pd(fiz2,tz);
1993
1994             fjx0             = _mm256_add_pd(fjx0,tx);
1995             fjy0             = _mm256_add_pd(fjy0,ty);
1996             fjz0             = _mm256_add_pd(fjz0,tz);
1997
1998             }
1999
2000             /**************************
2001              * CALCULATE INTERACTIONS *
2002              **************************/
2003
2004             if (gmx_mm256_any_lt(rsq21,rcutoff2))
2005             {
2006
2007             /* REACTION-FIELD ELECTROSTATICS */
2008             felec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
2009
2010             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2011
2012             fscal            = felec;
2013
2014             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2015
2016             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2017
2018             /* Calculate temporary vectorial force */
2019             tx               = _mm256_mul_pd(fscal,dx21);
2020             ty               = _mm256_mul_pd(fscal,dy21);
2021             tz               = _mm256_mul_pd(fscal,dz21);
2022
2023             /* Update vectorial force */
2024             fix2             = _mm256_add_pd(fix2,tx);
2025             fiy2             = _mm256_add_pd(fiy2,ty);
2026             fiz2             = _mm256_add_pd(fiz2,tz);
2027
2028             fjx1             = _mm256_add_pd(fjx1,tx);
2029             fjy1             = _mm256_add_pd(fjy1,ty);
2030             fjz1             = _mm256_add_pd(fjz1,tz);
2031
2032             }
2033
2034             /**************************
2035              * CALCULATE INTERACTIONS *
2036              **************************/
2037
2038             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2039             {
2040
2041             /* REACTION-FIELD ELECTROSTATICS */
2042             felec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
2043
2044             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2045
2046             fscal            = felec;
2047
2048             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2049
2050             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2051
2052             /* Calculate temporary vectorial force */
2053             tx               = _mm256_mul_pd(fscal,dx22);
2054             ty               = _mm256_mul_pd(fscal,dy22);
2055             tz               = _mm256_mul_pd(fscal,dz22);
2056
2057             /* Update vectorial force */
2058             fix2             = _mm256_add_pd(fix2,tx);
2059             fiy2             = _mm256_add_pd(fiy2,ty);
2060             fiz2             = _mm256_add_pd(fiz2,tz);
2061
2062             fjx2             = _mm256_add_pd(fjx2,tx);
2063             fjy2             = _mm256_add_pd(fjy2,ty);
2064             fjz2             = _mm256_add_pd(fjz2,tz);
2065
2066             }
2067
2068             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2069             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2070             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2071             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2072
2073             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2074                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2075
2076             /* Inner loop uses 270 flops */
2077         }
2078
2079         /* End of innermost loop */
2080
2081         gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2082                                                  f+i_coord_offset,fshift+i_shift_offset);
2083
2084         /* Increment number of inner iterations */
2085         inneriter                  += j_index_end - j_index_start;
2086
2087         /* Outer loop uses 18 flops */
2088     }
2089
2090     /* Increment number of outer iterations */
2091     outeriter        += nri;
2092
2093     /* Update outer/inner flops */
2094
2095     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*270);
2096 }