8e39f6e9f37c0a330ca09da13e1ff64566944a9e
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_avx_128_fma_double / nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_avx_128_fma_double.c
1 /*
2  * Note: this file was generated by the Gromacs avx_128_fma_double kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_avx_128_fma_double.h"
34 #include "kernelutil_x86_avx_128_fma_double.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_avx_128_fma_double
38  * Electrostatics interaction: ReactionField
39  * VdW interaction:            LennardJones
40  * Geometry:                   Water3-Water3
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_avx_128_fma_double
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54      * just 0 for non-waters.
55      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB;
61     int              j_coord_offsetA,j_coord_offsetB;
62     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
63     real             rcutoff_scalar;
64     real             *shiftvec,*fshift,*x,*f;
65     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
66     int              vdwioffset0;
67     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
68     int              vdwioffset1;
69     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
70     int              vdwioffset2;
71     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72     int              vdwjidx0A,vdwjidx0B;
73     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74     int              vdwjidx1A,vdwjidx1B;
75     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
76     int              vdwjidx2A,vdwjidx2B;
77     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
78     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
79     __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
80     __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
81     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
83     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
84     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
85     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
86     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
87     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
88     real             *charge;
89     int              nvdwtype;
90     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
91     int              *vdwtype;
92     real             *vdwparam;
93     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
94     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
95     __m128d          dummy_mask,cutoff_mask;
96     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
97     __m128d          one     = _mm_set1_pd(1.0);
98     __m128d          two     = _mm_set1_pd(2.0);
99     x                = xx[0];
100     f                = ff[0];
101
102     nri              = nlist->nri;
103     iinr             = nlist->iinr;
104     jindex           = nlist->jindex;
105     jjnr             = nlist->jjnr;
106     shiftidx         = nlist->shift;
107     gid              = nlist->gid;
108     shiftvec         = fr->shift_vec[0];
109     fshift           = fr->fshift[0];
110     facel            = _mm_set1_pd(fr->epsfac);
111     charge           = mdatoms->chargeA;
112     krf              = _mm_set1_pd(fr->ic->k_rf);
113     krf2             = _mm_set1_pd(fr->ic->k_rf*2.0);
114     crf              = _mm_set1_pd(fr->ic->c_rf);
115     nvdwtype         = fr->ntype;
116     vdwparam         = fr->nbfp;
117     vdwtype          = mdatoms->typeA;
118
119     /* Setup water-specific parameters */
120     inr              = nlist->iinr[0];
121     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
122     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
123     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
124     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
125
126     jq0              = _mm_set1_pd(charge[inr+0]);
127     jq1              = _mm_set1_pd(charge[inr+1]);
128     jq2              = _mm_set1_pd(charge[inr+2]);
129     vdwjidx0A        = 2*vdwtype[inr+0];
130     qq00             = _mm_mul_pd(iq0,jq0);
131     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
132     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
133     qq01             = _mm_mul_pd(iq0,jq1);
134     qq02             = _mm_mul_pd(iq0,jq2);
135     qq10             = _mm_mul_pd(iq1,jq0);
136     qq11             = _mm_mul_pd(iq1,jq1);
137     qq12             = _mm_mul_pd(iq1,jq2);
138     qq20             = _mm_mul_pd(iq2,jq0);
139     qq21             = _mm_mul_pd(iq2,jq1);
140     qq22             = _mm_mul_pd(iq2,jq2);
141
142     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
143     rcutoff_scalar   = fr->rcoulomb;
144     rcutoff          = _mm_set1_pd(rcutoff_scalar);
145     rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
146
147     sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
148     rvdw             = _mm_set1_pd(fr->rvdw);
149
150     /* Avoid stupid compiler warnings */
151     jnrA = jnrB = 0;
152     j_coord_offsetA = 0;
153     j_coord_offsetB = 0;
154
155     outeriter        = 0;
156     inneriter        = 0;
157
158     /* Start outer loop over neighborlists */
159     for(iidx=0; iidx<nri; iidx++)
160     {
161         /* Load shift vector for this list */
162         i_shift_offset   = DIM*shiftidx[iidx];
163
164         /* Load limits for loop over neighbors */
165         j_index_start    = jindex[iidx];
166         j_index_end      = jindex[iidx+1];
167
168         /* Get outer coordinate index */
169         inr              = iinr[iidx];
170         i_coord_offset   = DIM*inr;
171
172         /* Load i particle coords and add shift vector */
173         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
174                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
175
176         fix0             = _mm_setzero_pd();
177         fiy0             = _mm_setzero_pd();
178         fiz0             = _mm_setzero_pd();
179         fix1             = _mm_setzero_pd();
180         fiy1             = _mm_setzero_pd();
181         fiz1             = _mm_setzero_pd();
182         fix2             = _mm_setzero_pd();
183         fiy2             = _mm_setzero_pd();
184         fiz2             = _mm_setzero_pd();
185
186         /* Reset potential sums */
187         velecsum         = _mm_setzero_pd();
188         vvdwsum          = _mm_setzero_pd();
189
190         /* Start inner kernel loop */
191         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
192         {
193
194             /* Get j neighbor index, and coordinate index */
195             jnrA             = jjnr[jidx];
196             jnrB             = jjnr[jidx+1];
197             j_coord_offsetA  = DIM*jnrA;
198             j_coord_offsetB  = DIM*jnrB;
199
200             /* load j atom coordinates */
201             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
202                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
203
204             /* Calculate displacement vector */
205             dx00             = _mm_sub_pd(ix0,jx0);
206             dy00             = _mm_sub_pd(iy0,jy0);
207             dz00             = _mm_sub_pd(iz0,jz0);
208             dx01             = _mm_sub_pd(ix0,jx1);
209             dy01             = _mm_sub_pd(iy0,jy1);
210             dz01             = _mm_sub_pd(iz0,jz1);
211             dx02             = _mm_sub_pd(ix0,jx2);
212             dy02             = _mm_sub_pd(iy0,jy2);
213             dz02             = _mm_sub_pd(iz0,jz2);
214             dx10             = _mm_sub_pd(ix1,jx0);
215             dy10             = _mm_sub_pd(iy1,jy0);
216             dz10             = _mm_sub_pd(iz1,jz0);
217             dx11             = _mm_sub_pd(ix1,jx1);
218             dy11             = _mm_sub_pd(iy1,jy1);
219             dz11             = _mm_sub_pd(iz1,jz1);
220             dx12             = _mm_sub_pd(ix1,jx2);
221             dy12             = _mm_sub_pd(iy1,jy2);
222             dz12             = _mm_sub_pd(iz1,jz2);
223             dx20             = _mm_sub_pd(ix2,jx0);
224             dy20             = _mm_sub_pd(iy2,jy0);
225             dz20             = _mm_sub_pd(iz2,jz0);
226             dx21             = _mm_sub_pd(ix2,jx1);
227             dy21             = _mm_sub_pd(iy2,jy1);
228             dz21             = _mm_sub_pd(iz2,jz1);
229             dx22             = _mm_sub_pd(ix2,jx2);
230             dy22             = _mm_sub_pd(iy2,jy2);
231             dz22             = _mm_sub_pd(iz2,jz2);
232
233             /* Calculate squared distance and things based on it */
234             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
235             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
236             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
237             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
238             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
239             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
240             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
241             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
242             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
243
244             rinv00           = gmx_mm_invsqrt_pd(rsq00);
245             rinv01           = gmx_mm_invsqrt_pd(rsq01);
246             rinv02           = gmx_mm_invsqrt_pd(rsq02);
247             rinv10           = gmx_mm_invsqrt_pd(rsq10);
248             rinv11           = gmx_mm_invsqrt_pd(rsq11);
249             rinv12           = gmx_mm_invsqrt_pd(rsq12);
250             rinv20           = gmx_mm_invsqrt_pd(rsq20);
251             rinv21           = gmx_mm_invsqrt_pd(rsq21);
252             rinv22           = gmx_mm_invsqrt_pd(rsq22);
253
254             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
255             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
256             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
257             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
258             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
259             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
260             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
261             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
262             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
263
264             fjx0             = _mm_setzero_pd();
265             fjy0             = _mm_setzero_pd();
266             fjz0             = _mm_setzero_pd();
267             fjx1             = _mm_setzero_pd();
268             fjy1             = _mm_setzero_pd();
269             fjz1             = _mm_setzero_pd();
270             fjx2             = _mm_setzero_pd();
271             fjy2             = _mm_setzero_pd();
272             fjz2             = _mm_setzero_pd();
273
274             /**************************
275              * CALCULATE INTERACTIONS *
276              **************************/
277
278             if (gmx_mm_any_lt(rsq00,rcutoff2))
279             {
280
281             /* REACTION-FIELD ELECTROSTATICS */
282             velec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_macc_pd(krf,rsq00,rinv00),crf));
283             felec            = _mm_mul_pd(qq00,_mm_msub_pd(rinv00,rinvsq00,krf2));
284
285             /* LENNARD-JONES DISPERSION/REPULSION */
286
287             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
288             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
289             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
290             vvdw             = _mm_msub_pd(_mm_nmacc_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
291                                            _mm_mul_pd(_mm_nmacc_pd( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
292             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
293
294             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
295
296             /* Update potential sum for this i atom from the interaction with this j atom. */
297             velec            = _mm_and_pd(velec,cutoff_mask);
298             velecsum         = _mm_add_pd(velecsum,velec);
299             vvdw             = _mm_and_pd(vvdw,cutoff_mask);
300             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
301
302             fscal            = _mm_add_pd(felec,fvdw);
303
304             fscal            = _mm_and_pd(fscal,cutoff_mask);
305
306             /* Update vectorial force */
307             fix0             = _mm_macc_pd(dx00,fscal,fix0);
308             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
309             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
310             
311             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
312             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
313             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
314
315             }
316
317             /**************************
318              * CALCULATE INTERACTIONS *
319              **************************/
320
321             if (gmx_mm_any_lt(rsq01,rcutoff2))
322             {
323
324             /* REACTION-FIELD ELECTROSTATICS */
325             velec            = _mm_mul_pd(qq01,_mm_sub_pd(_mm_macc_pd(krf,rsq01,rinv01),crf));
326             felec            = _mm_mul_pd(qq01,_mm_msub_pd(rinv01,rinvsq01,krf2));
327
328             cutoff_mask      = _mm_cmplt_pd(rsq01,rcutoff2);
329
330             /* Update potential sum for this i atom from the interaction with this j atom. */
331             velec            = _mm_and_pd(velec,cutoff_mask);
332             velecsum         = _mm_add_pd(velecsum,velec);
333
334             fscal            = felec;
335
336             fscal            = _mm_and_pd(fscal,cutoff_mask);
337
338             /* Update vectorial force */
339             fix0             = _mm_macc_pd(dx01,fscal,fix0);
340             fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
341             fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
342             
343             fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
344             fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
345             fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
346
347             }
348
349             /**************************
350              * CALCULATE INTERACTIONS *
351              **************************/
352
353             if (gmx_mm_any_lt(rsq02,rcutoff2))
354             {
355
356             /* REACTION-FIELD ELECTROSTATICS */
357             velec            = _mm_mul_pd(qq02,_mm_sub_pd(_mm_macc_pd(krf,rsq02,rinv02),crf));
358             felec            = _mm_mul_pd(qq02,_mm_msub_pd(rinv02,rinvsq02,krf2));
359
360             cutoff_mask      = _mm_cmplt_pd(rsq02,rcutoff2);
361
362             /* Update potential sum for this i atom from the interaction with this j atom. */
363             velec            = _mm_and_pd(velec,cutoff_mask);
364             velecsum         = _mm_add_pd(velecsum,velec);
365
366             fscal            = felec;
367
368             fscal            = _mm_and_pd(fscal,cutoff_mask);
369
370             /* Update vectorial force */
371             fix0             = _mm_macc_pd(dx02,fscal,fix0);
372             fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
373             fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
374             
375             fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
376             fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
377             fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
378
379             }
380
381             /**************************
382              * CALCULATE INTERACTIONS *
383              **************************/
384
385             if (gmx_mm_any_lt(rsq10,rcutoff2))
386             {
387
388             /* REACTION-FIELD ELECTROSTATICS */
389             velec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_macc_pd(krf,rsq10,rinv10),crf));
390             felec            = _mm_mul_pd(qq10,_mm_msub_pd(rinv10,rinvsq10,krf2));
391
392             cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
393
394             /* Update potential sum for this i atom from the interaction with this j atom. */
395             velec            = _mm_and_pd(velec,cutoff_mask);
396             velecsum         = _mm_add_pd(velecsum,velec);
397
398             fscal            = felec;
399
400             fscal            = _mm_and_pd(fscal,cutoff_mask);
401
402             /* Update vectorial force */
403             fix1             = _mm_macc_pd(dx10,fscal,fix1);
404             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
405             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
406             
407             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
408             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
409             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
410
411             }
412
413             /**************************
414              * CALCULATE INTERACTIONS *
415              **************************/
416
417             if (gmx_mm_any_lt(rsq11,rcutoff2))
418             {
419
420             /* REACTION-FIELD ELECTROSTATICS */
421             velec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_macc_pd(krf,rsq11,rinv11),crf));
422             felec            = _mm_mul_pd(qq11,_mm_msub_pd(rinv11,rinvsq11,krf2));
423
424             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
425
426             /* Update potential sum for this i atom from the interaction with this j atom. */
427             velec            = _mm_and_pd(velec,cutoff_mask);
428             velecsum         = _mm_add_pd(velecsum,velec);
429
430             fscal            = felec;
431
432             fscal            = _mm_and_pd(fscal,cutoff_mask);
433
434             /* Update vectorial force */
435             fix1             = _mm_macc_pd(dx11,fscal,fix1);
436             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
437             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
438             
439             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
440             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
441             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
442
443             }
444
445             /**************************
446              * CALCULATE INTERACTIONS *
447              **************************/
448
449             if (gmx_mm_any_lt(rsq12,rcutoff2))
450             {
451
452             /* REACTION-FIELD ELECTROSTATICS */
453             velec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_macc_pd(krf,rsq12,rinv12),crf));
454             felec            = _mm_mul_pd(qq12,_mm_msub_pd(rinv12,rinvsq12,krf2));
455
456             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
457
458             /* Update potential sum for this i atom from the interaction with this j atom. */
459             velec            = _mm_and_pd(velec,cutoff_mask);
460             velecsum         = _mm_add_pd(velecsum,velec);
461
462             fscal            = felec;
463
464             fscal            = _mm_and_pd(fscal,cutoff_mask);
465
466             /* Update vectorial force */
467             fix1             = _mm_macc_pd(dx12,fscal,fix1);
468             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
469             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
470             
471             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
472             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
473             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
474
475             }
476
477             /**************************
478              * CALCULATE INTERACTIONS *
479              **************************/
480
481             if (gmx_mm_any_lt(rsq20,rcutoff2))
482             {
483
484             /* REACTION-FIELD ELECTROSTATICS */
485             velec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_macc_pd(krf,rsq20,rinv20),crf));
486             felec            = _mm_mul_pd(qq20,_mm_msub_pd(rinv20,rinvsq20,krf2));
487
488             cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
489
490             /* Update potential sum for this i atom from the interaction with this j atom. */
491             velec            = _mm_and_pd(velec,cutoff_mask);
492             velecsum         = _mm_add_pd(velecsum,velec);
493
494             fscal            = felec;
495
496             fscal            = _mm_and_pd(fscal,cutoff_mask);
497
498             /* Update vectorial force */
499             fix2             = _mm_macc_pd(dx20,fscal,fix2);
500             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
501             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
502             
503             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
504             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
505             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
506
507             }
508
509             /**************************
510              * CALCULATE INTERACTIONS *
511              **************************/
512
513             if (gmx_mm_any_lt(rsq21,rcutoff2))
514             {
515
516             /* REACTION-FIELD ELECTROSTATICS */
517             velec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_macc_pd(krf,rsq21,rinv21),crf));
518             felec            = _mm_mul_pd(qq21,_mm_msub_pd(rinv21,rinvsq21,krf2));
519
520             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
521
522             /* Update potential sum for this i atom from the interaction with this j atom. */
523             velec            = _mm_and_pd(velec,cutoff_mask);
524             velecsum         = _mm_add_pd(velecsum,velec);
525
526             fscal            = felec;
527
528             fscal            = _mm_and_pd(fscal,cutoff_mask);
529
530             /* Update vectorial force */
531             fix2             = _mm_macc_pd(dx21,fscal,fix2);
532             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
533             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
534             
535             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
536             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
537             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
538
539             }
540
541             /**************************
542              * CALCULATE INTERACTIONS *
543              **************************/
544
545             if (gmx_mm_any_lt(rsq22,rcutoff2))
546             {
547
548             /* REACTION-FIELD ELECTROSTATICS */
549             velec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_macc_pd(krf,rsq22,rinv22),crf));
550             felec            = _mm_mul_pd(qq22,_mm_msub_pd(rinv22,rinvsq22,krf2));
551
552             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
553
554             /* Update potential sum for this i atom from the interaction with this j atom. */
555             velec            = _mm_and_pd(velec,cutoff_mask);
556             velecsum         = _mm_add_pd(velecsum,velec);
557
558             fscal            = felec;
559
560             fscal            = _mm_and_pd(fscal,cutoff_mask);
561
562             /* Update vectorial force */
563             fix2             = _mm_macc_pd(dx22,fscal,fix2);
564             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
565             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
566             
567             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
568             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
569             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
570
571             }
572
573             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
574
575             /* Inner loop uses 369 flops */
576         }
577
578         if(jidx<j_index_end)
579         {
580
581             jnrA             = jjnr[jidx];
582             j_coord_offsetA  = DIM*jnrA;
583
584             /* load j atom coordinates */
585             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
586                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
587
588             /* Calculate displacement vector */
589             dx00             = _mm_sub_pd(ix0,jx0);
590             dy00             = _mm_sub_pd(iy0,jy0);
591             dz00             = _mm_sub_pd(iz0,jz0);
592             dx01             = _mm_sub_pd(ix0,jx1);
593             dy01             = _mm_sub_pd(iy0,jy1);
594             dz01             = _mm_sub_pd(iz0,jz1);
595             dx02             = _mm_sub_pd(ix0,jx2);
596             dy02             = _mm_sub_pd(iy0,jy2);
597             dz02             = _mm_sub_pd(iz0,jz2);
598             dx10             = _mm_sub_pd(ix1,jx0);
599             dy10             = _mm_sub_pd(iy1,jy0);
600             dz10             = _mm_sub_pd(iz1,jz0);
601             dx11             = _mm_sub_pd(ix1,jx1);
602             dy11             = _mm_sub_pd(iy1,jy1);
603             dz11             = _mm_sub_pd(iz1,jz1);
604             dx12             = _mm_sub_pd(ix1,jx2);
605             dy12             = _mm_sub_pd(iy1,jy2);
606             dz12             = _mm_sub_pd(iz1,jz2);
607             dx20             = _mm_sub_pd(ix2,jx0);
608             dy20             = _mm_sub_pd(iy2,jy0);
609             dz20             = _mm_sub_pd(iz2,jz0);
610             dx21             = _mm_sub_pd(ix2,jx1);
611             dy21             = _mm_sub_pd(iy2,jy1);
612             dz21             = _mm_sub_pd(iz2,jz1);
613             dx22             = _mm_sub_pd(ix2,jx2);
614             dy22             = _mm_sub_pd(iy2,jy2);
615             dz22             = _mm_sub_pd(iz2,jz2);
616
617             /* Calculate squared distance and things based on it */
618             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
619             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
620             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
621             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
622             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
623             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
624             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
625             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
626             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
627
628             rinv00           = gmx_mm_invsqrt_pd(rsq00);
629             rinv01           = gmx_mm_invsqrt_pd(rsq01);
630             rinv02           = gmx_mm_invsqrt_pd(rsq02);
631             rinv10           = gmx_mm_invsqrt_pd(rsq10);
632             rinv11           = gmx_mm_invsqrt_pd(rsq11);
633             rinv12           = gmx_mm_invsqrt_pd(rsq12);
634             rinv20           = gmx_mm_invsqrt_pd(rsq20);
635             rinv21           = gmx_mm_invsqrt_pd(rsq21);
636             rinv22           = gmx_mm_invsqrt_pd(rsq22);
637
638             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
639             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
640             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
641             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
642             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
643             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
644             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
645             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
646             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
647
648             fjx0             = _mm_setzero_pd();
649             fjy0             = _mm_setzero_pd();
650             fjz0             = _mm_setzero_pd();
651             fjx1             = _mm_setzero_pd();
652             fjy1             = _mm_setzero_pd();
653             fjz1             = _mm_setzero_pd();
654             fjx2             = _mm_setzero_pd();
655             fjy2             = _mm_setzero_pd();
656             fjz2             = _mm_setzero_pd();
657
658             /**************************
659              * CALCULATE INTERACTIONS *
660              **************************/
661
662             if (gmx_mm_any_lt(rsq00,rcutoff2))
663             {
664
665             /* REACTION-FIELD ELECTROSTATICS */
666             velec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_macc_pd(krf,rsq00,rinv00),crf));
667             felec            = _mm_mul_pd(qq00,_mm_msub_pd(rinv00,rinvsq00,krf2));
668
669             /* LENNARD-JONES DISPERSION/REPULSION */
670
671             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
672             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
673             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
674             vvdw             = _mm_msub_pd(_mm_nmacc_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
675                                            _mm_mul_pd(_mm_nmacc_pd( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
676             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
677
678             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
679
680             /* Update potential sum for this i atom from the interaction with this j atom. */
681             velec            = _mm_and_pd(velec,cutoff_mask);
682             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
683             velecsum         = _mm_add_pd(velecsum,velec);
684             vvdw             = _mm_and_pd(vvdw,cutoff_mask);
685             vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
686             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
687
688             fscal            = _mm_add_pd(felec,fvdw);
689
690             fscal            = _mm_and_pd(fscal,cutoff_mask);
691
692             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
693
694             /* Update vectorial force */
695             fix0             = _mm_macc_pd(dx00,fscal,fix0);
696             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
697             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
698             
699             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
700             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
701             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
702
703             }
704
705             /**************************
706              * CALCULATE INTERACTIONS *
707              **************************/
708
709             if (gmx_mm_any_lt(rsq01,rcutoff2))
710             {
711
712             /* REACTION-FIELD ELECTROSTATICS */
713             velec            = _mm_mul_pd(qq01,_mm_sub_pd(_mm_macc_pd(krf,rsq01,rinv01),crf));
714             felec            = _mm_mul_pd(qq01,_mm_msub_pd(rinv01,rinvsq01,krf2));
715
716             cutoff_mask      = _mm_cmplt_pd(rsq01,rcutoff2);
717
718             /* Update potential sum for this i atom from the interaction with this j atom. */
719             velec            = _mm_and_pd(velec,cutoff_mask);
720             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
721             velecsum         = _mm_add_pd(velecsum,velec);
722
723             fscal            = felec;
724
725             fscal            = _mm_and_pd(fscal,cutoff_mask);
726
727             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
728
729             /* Update vectorial force */
730             fix0             = _mm_macc_pd(dx01,fscal,fix0);
731             fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
732             fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
733             
734             fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
735             fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
736             fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
737
738             }
739
740             /**************************
741              * CALCULATE INTERACTIONS *
742              **************************/
743
744             if (gmx_mm_any_lt(rsq02,rcutoff2))
745             {
746
747             /* REACTION-FIELD ELECTROSTATICS */
748             velec            = _mm_mul_pd(qq02,_mm_sub_pd(_mm_macc_pd(krf,rsq02,rinv02),crf));
749             felec            = _mm_mul_pd(qq02,_mm_msub_pd(rinv02,rinvsq02,krf2));
750
751             cutoff_mask      = _mm_cmplt_pd(rsq02,rcutoff2);
752
753             /* Update potential sum for this i atom from the interaction with this j atom. */
754             velec            = _mm_and_pd(velec,cutoff_mask);
755             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
756             velecsum         = _mm_add_pd(velecsum,velec);
757
758             fscal            = felec;
759
760             fscal            = _mm_and_pd(fscal,cutoff_mask);
761
762             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
763
764             /* Update vectorial force */
765             fix0             = _mm_macc_pd(dx02,fscal,fix0);
766             fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
767             fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
768             
769             fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
770             fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
771             fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
772
773             }
774
775             /**************************
776              * CALCULATE INTERACTIONS *
777              **************************/
778
779             if (gmx_mm_any_lt(rsq10,rcutoff2))
780             {
781
782             /* REACTION-FIELD ELECTROSTATICS */
783             velec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_macc_pd(krf,rsq10,rinv10),crf));
784             felec            = _mm_mul_pd(qq10,_mm_msub_pd(rinv10,rinvsq10,krf2));
785
786             cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
787
788             /* Update potential sum for this i atom from the interaction with this j atom. */
789             velec            = _mm_and_pd(velec,cutoff_mask);
790             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
791             velecsum         = _mm_add_pd(velecsum,velec);
792
793             fscal            = felec;
794
795             fscal            = _mm_and_pd(fscal,cutoff_mask);
796
797             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
798
799             /* Update vectorial force */
800             fix1             = _mm_macc_pd(dx10,fscal,fix1);
801             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
802             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
803             
804             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
805             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
806             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
807
808             }
809
810             /**************************
811              * CALCULATE INTERACTIONS *
812              **************************/
813
814             if (gmx_mm_any_lt(rsq11,rcutoff2))
815             {
816
817             /* REACTION-FIELD ELECTROSTATICS */
818             velec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_macc_pd(krf,rsq11,rinv11),crf));
819             felec            = _mm_mul_pd(qq11,_mm_msub_pd(rinv11,rinvsq11,krf2));
820
821             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
822
823             /* Update potential sum for this i atom from the interaction with this j atom. */
824             velec            = _mm_and_pd(velec,cutoff_mask);
825             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
826             velecsum         = _mm_add_pd(velecsum,velec);
827
828             fscal            = felec;
829
830             fscal            = _mm_and_pd(fscal,cutoff_mask);
831
832             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
833
834             /* Update vectorial force */
835             fix1             = _mm_macc_pd(dx11,fscal,fix1);
836             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
837             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
838             
839             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
840             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
841             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
842
843             }
844
845             /**************************
846              * CALCULATE INTERACTIONS *
847              **************************/
848
849             if (gmx_mm_any_lt(rsq12,rcutoff2))
850             {
851
852             /* REACTION-FIELD ELECTROSTATICS */
853             velec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_macc_pd(krf,rsq12,rinv12),crf));
854             felec            = _mm_mul_pd(qq12,_mm_msub_pd(rinv12,rinvsq12,krf2));
855
856             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
857
858             /* Update potential sum for this i atom from the interaction with this j atom. */
859             velec            = _mm_and_pd(velec,cutoff_mask);
860             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
861             velecsum         = _mm_add_pd(velecsum,velec);
862
863             fscal            = felec;
864
865             fscal            = _mm_and_pd(fscal,cutoff_mask);
866
867             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
868
869             /* Update vectorial force */
870             fix1             = _mm_macc_pd(dx12,fscal,fix1);
871             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
872             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
873             
874             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
875             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
876             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
877
878             }
879
880             /**************************
881              * CALCULATE INTERACTIONS *
882              **************************/
883
884             if (gmx_mm_any_lt(rsq20,rcutoff2))
885             {
886
887             /* REACTION-FIELD ELECTROSTATICS */
888             velec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_macc_pd(krf,rsq20,rinv20),crf));
889             felec            = _mm_mul_pd(qq20,_mm_msub_pd(rinv20,rinvsq20,krf2));
890
891             cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
892
893             /* Update potential sum for this i atom from the interaction with this j atom. */
894             velec            = _mm_and_pd(velec,cutoff_mask);
895             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
896             velecsum         = _mm_add_pd(velecsum,velec);
897
898             fscal            = felec;
899
900             fscal            = _mm_and_pd(fscal,cutoff_mask);
901
902             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
903
904             /* Update vectorial force */
905             fix2             = _mm_macc_pd(dx20,fscal,fix2);
906             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
907             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
908             
909             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
910             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
911             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
912
913             }
914
915             /**************************
916              * CALCULATE INTERACTIONS *
917              **************************/
918
919             if (gmx_mm_any_lt(rsq21,rcutoff2))
920             {
921
922             /* REACTION-FIELD ELECTROSTATICS */
923             velec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_macc_pd(krf,rsq21,rinv21),crf));
924             felec            = _mm_mul_pd(qq21,_mm_msub_pd(rinv21,rinvsq21,krf2));
925
926             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
927
928             /* Update potential sum for this i atom from the interaction with this j atom. */
929             velec            = _mm_and_pd(velec,cutoff_mask);
930             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
931             velecsum         = _mm_add_pd(velecsum,velec);
932
933             fscal            = felec;
934
935             fscal            = _mm_and_pd(fscal,cutoff_mask);
936
937             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
938
939             /* Update vectorial force */
940             fix2             = _mm_macc_pd(dx21,fscal,fix2);
941             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
942             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
943             
944             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
945             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
946             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
947
948             }
949
950             /**************************
951              * CALCULATE INTERACTIONS *
952              **************************/
953
954             if (gmx_mm_any_lt(rsq22,rcutoff2))
955             {
956
957             /* REACTION-FIELD ELECTROSTATICS */
958             velec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_macc_pd(krf,rsq22,rinv22),crf));
959             felec            = _mm_mul_pd(qq22,_mm_msub_pd(rinv22,rinvsq22,krf2));
960
961             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
962
963             /* Update potential sum for this i atom from the interaction with this j atom. */
964             velec            = _mm_and_pd(velec,cutoff_mask);
965             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
966             velecsum         = _mm_add_pd(velecsum,velec);
967
968             fscal            = felec;
969
970             fscal            = _mm_and_pd(fscal,cutoff_mask);
971
972             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
973
974             /* Update vectorial force */
975             fix2             = _mm_macc_pd(dx22,fscal,fix2);
976             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
977             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
978             
979             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
980             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
981             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
982
983             }
984
985             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
986
987             /* Inner loop uses 369 flops */
988         }
989
990         /* End of innermost loop */
991
992         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
993                                               f+i_coord_offset,fshift+i_shift_offset);
994
995         ggid                        = gid[iidx];
996         /* Update potential energies */
997         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
998         gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
999
1000         /* Increment number of inner iterations */
1001         inneriter                  += j_index_end - j_index_start;
1002
1003         /* Outer loop uses 20 flops */
1004     }
1005
1006     /* Increment number of outer iterations */
1007     outeriter        += nri;
1008
1009     /* Update outer/inner flops */
1010
1011     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*369);
1012 }
1013 /*
1014  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_avx_128_fma_double
1015  * Electrostatics interaction: ReactionField
1016  * VdW interaction:            LennardJones
1017  * Geometry:                   Water3-Water3
1018  * Calculate force/pot:        Force
1019  */
1020 void
1021 nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_avx_128_fma_double
1022                     (t_nblist * gmx_restrict                nlist,
1023                      rvec * gmx_restrict                    xx,
1024                      rvec * gmx_restrict                    ff,
1025                      t_forcerec * gmx_restrict              fr,
1026                      t_mdatoms * gmx_restrict               mdatoms,
1027                      nb_kernel_data_t * gmx_restrict        kernel_data,
1028                      t_nrnb * gmx_restrict                  nrnb)
1029 {
1030     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1031      * just 0 for non-waters.
1032      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1033      * jnr indices corresponding to data put in the four positions in the SIMD register.
1034      */
1035     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1036     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1037     int              jnrA,jnrB;
1038     int              j_coord_offsetA,j_coord_offsetB;
1039     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1040     real             rcutoff_scalar;
1041     real             *shiftvec,*fshift,*x,*f;
1042     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1043     int              vdwioffset0;
1044     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1045     int              vdwioffset1;
1046     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1047     int              vdwioffset2;
1048     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1049     int              vdwjidx0A,vdwjidx0B;
1050     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1051     int              vdwjidx1A,vdwjidx1B;
1052     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1053     int              vdwjidx2A,vdwjidx2B;
1054     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1055     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1056     __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1057     __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1058     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1059     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1060     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1061     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1062     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1063     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1064     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
1065     real             *charge;
1066     int              nvdwtype;
1067     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1068     int              *vdwtype;
1069     real             *vdwparam;
1070     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
1071     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
1072     __m128d          dummy_mask,cutoff_mask;
1073     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1074     __m128d          one     = _mm_set1_pd(1.0);
1075     __m128d          two     = _mm_set1_pd(2.0);
1076     x                = xx[0];
1077     f                = ff[0];
1078
1079     nri              = nlist->nri;
1080     iinr             = nlist->iinr;
1081     jindex           = nlist->jindex;
1082     jjnr             = nlist->jjnr;
1083     shiftidx         = nlist->shift;
1084     gid              = nlist->gid;
1085     shiftvec         = fr->shift_vec[0];
1086     fshift           = fr->fshift[0];
1087     facel            = _mm_set1_pd(fr->epsfac);
1088     charge           = mdatoms->chargeA;
1089     krf              = _mm_set1_pd(fr->ic->k_rf);
1090     krf2             = _mm_set1_pd(fr->ic->k_rf*2.0);
1091     crf              = _mm_set1_pd(fr->ic->c_rf);
1092     nvdwtype         = fr->ntype;
1093     vdwparam         = fr->nbfp;
1094     vdwtype          = mdatoms->typeA;
1095
1096     /* Setup water-specific parameters */
1097     inr              = nlist->iinr[0];
1098     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1099     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1100     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1101     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1102
1103     jq0              = _mm_set1_pd(charge[inr+0]);
1104     jq1              = _mm_set1_pd(charge[inr+1]);
1105     jq2              = _mm_set1_pd(charge[inr+2]);
1106     vdwjidx0A        = 2*vdwtype[inr+0];
1107     qq00             = _mm_mul_pd(iq0,jq0);
1108     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1109     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1110     qq01             = _mm_mul_pd(iq0,jq1);
1111     qq02             = _mm_mul_pd(iq0,jq2);
1112     qq10             = _mm_mul_pd(iq1,jq0);
1113     qq11             = _mm_mul_pd(iq1,jq1);
1114     qq12             = _mm_mul_pd(iq1,jq2);
1115     qq20             = _mm_mul_pd(iq2,jq0);
1116     qq21             = _mm_mul_pd(iq2,jq1);
1117     qq22             = _mm_mul_pd(iq2,jq2);
1118
1119     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1120     rcutoff_scalar   = fr->rcoulomb;
1121     rcutoff          = _mm_set1_pd(rcutoff_scalar);
1122     rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
1123
1124     sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
1125     rvdw             = _mm_set1_pd(fr->rvdw);
1126
1127     /* Avoid stupid compiler warnings */
1128     jnrA = jnrB = 0;
1129     j_coord_offsetA = 0;
1130     j_coord_offsetB = 0;
1131
1132     outeriter        = 0;
1133     inneriter        = 0;
1134
1135     /* Start outer loop over neighborlists */
1136     for(iidx=0; iidx<nri; iidx++)
1137     {
1138         /* Load shift vector for this list */
1139         i_shift_offset   = DIM*shiftidx[iidx];
1140
1141         /* Load limits for loop over neighbors */
1142         j_index_start    = jindex[iidx];
1143         j_index_end      = jindex[iidx+1];
1144
1145         /* Get outer coordinate index */
1146         inr              = iinr[iidx];
1147         i_coord_offset   = DIM*inr;
1148
1149         /* Load i particle coords and add shift vector */
1150         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1151                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1152
1153         fix0             = _mm_setzero_pd();
1154         fiy0             = _mm_setzero_pd();
1155         fiz0             = _mm_setzero_pd();
1156         fix1             = _mm_setzero_pd();
1157         fiy1             = _mm_setzero_pd();
1158         fiz1             = _mm_setzero_pd();
1159         fix2             = _mm_setzero_pd();
1160         fiy2             = _mm_setzero_pd();
1161         fiz2             = _mm_setzero_pd();
1162
1163         /* Start inner kernel loop */
1164         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1165         {
1166
1167             /* Get j neighbor index, and coordinate index */
1168             jnrA             = jjnr[jidx];
1169             jnrB             = jjnr[jidx+1];
1170             j_coord_offsetA  = DIM*jnrA;
1171             j_coord_offsetB  = DIM*jnrB;
1172
1173             /* load j atom coordinates */
1174             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1175                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1176
1177             /* Calculate displacement vector */
1178             dx00             = _mm_sub_pd(ix0,jx0);
1179             dy00             = _mm_sub_pd(iy0,jy0);
1180             dz00             = _mm_sub_pd(iz0,jz0);
1181             dx01             = _mm_sub_pd(ix0,jx1);
1182             dy01             = _mm_sub_pd(iy0,jy1);
1183             dz01             = _mm_sub_pd(iz0,jz1);
1184             dx02             = _mm_sub_pd(ix0,jx2);
1185             dy02             = _mm_sub_pd(iy0,jy2);
1186             dz02             = _mm_sub_pd(iz0,jz2);
1187             dx10             = _mm_sub_pd(ix1,jx0);
1188             dy10             = _mm_sub_pd(iy1,jy0);
1189             dz10             = _mm_sub_pd(iz1,jz0);
1190             dx11             = _mm_sub_pd(ix1,jx1);
1191             dy11             = _mm_sub_pd(iy1,jy1);
1192             dz11             = _mm_sub_pd(iz1,jz1);
1193             dx12             = _mm_sub_pd(ix1,jx2);
1194             dy12             = _mm_sub_pd(iy1,jy2);
1195             dz12             = _mm_sub_pd(iz1,jz2);
1196             dx20             = _mm_sub_pd(ix2,jx0);
1197             dy20             = _mm_sub_pd(iy2,jy0);
1198             dz20             = _mm_sub_pd(iz2,jz0);
1199             dx21             = _mm_sub_pd(ix2,jx1);
1200             dy21             = _mm_sub_pd(iy2,jy1);
1201             dz21             = _mm_sub_pd(iz2,jz1);
1202             dx22             = _mm_sub_pd(ix2,jx2);
1203             dy22             = _mm_sub_pd(iy2,jy2);
1204             dz22             = _mm_sub_pd(iz2,jz2);
1205
1206             /* Calculate squared distance and things based on it */
1207             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1208             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1209             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1210             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1211             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1212             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1213             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1214             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1215             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1216
1217             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1218             rinv01           = gmx_mm_invsqrt_pd(rsq01);
1219             rinv02           = gmx_mm_invsqrt_pd(rsq02);
1220             rinv10           = gmx_mm_invsqrt_pd(rsq10);
1221             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1222             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1223             rinv20           = gmx_mm_invsqrt_pd(rsq20);
1224             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1225             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1226
1227             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
1228             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
1229             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
1230             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
1231             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1232             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1233             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
1234             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1235             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1236
1237             fjx0             = _mm_setzero_pd();
1238             fjy0             = _mm_setzero_pd();
1239             fjz0             = _mm_setzero_pd();
1240             fjx1             = _mm_setzero_pd();
1241             fjy1             = _mm_setzero_pd();
1242             fjz1             = _mm_setzero_pd();
1243             fjx2             = _mm_setzero_pd();
1244             fjy2             = _mm_setzero_pd();
1245             fjz2             = _mm_setzero_pd();
1246
1247             /**************************
1248              * CALCULATE INTERACTIONS *
1249              **************************/
1250
1251             if (gmx_mm_any_lt(rsq00,rcutoff2))
1252             {
1253
1254             /* REACTION-FIELD ELECTROSTATICS */
1255             felec            = _mm_mul_pd(qq00,_mm_msub_pd(rinv00,rinvsq00,krf2));
1256
1257             /* LENNARD-JONES DISPERSION/REPULSION */
1258
1259             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1260             fvdw             = _mm_mul_pd(_mm_msub_pd(c12_00,rinvsix,c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1261
1262             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
1263
1264             fscal            = _mm_add_pd(felec,fvdw);
1265
1266             fscal            = _mm_and_pd(fscal,cutoff_mask);
1267
1268             /* Update vectorial force */
1269             fix0             = _mm_macc_pd(dx00,fscal,fix0);
1270             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
1271             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
1272             
1273             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
1274             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
1275             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
1276
1277             }
1278
1279             /**************************
1280              * CALCULATE INTERACTIONS *
1281              **************************/
1282
1283             if (gmx_mm_any_lt(rsq01,rcutoff2))
1284             {
1285
1286             /* REACTION-FIELD ELECTROSTATICS */
1287             felec            = _mm_mul_pd(qq01,_mm_msub_pd(rinv01,rinvsq01,krf2));
1288
1289             cutoff_mask      = _mm_cmplt_pd(rsq01,rcutoff2);
1290
1291             fscal            = felec;
1292
1293             fscal            = _mm_and_pd(fscal,cutoff_mask);
1294
1295             /* Update vectorial force */
1296             fix0             = _mm_macc_pd(dx01,fscal,fix0);
1297             fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
1298             fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
1299             
1300             fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
1301             fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
1302             fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
1303
1304             }
1305
1306             /**************************
1307              * CALCULATE INTERACTIONS *
1308              **************************/
1309
1310             if (gmx_mm_any_lt(rsq02,rcutoff2))
1311             {
1312
1313             /* REACTION-FIELD ELECTROSTATICS */
1314             felec            = _mm_mul_pd(qq02,_mm_msub_pd(rinv02,rinvsq02,krf2));
1315
1316             cutoff_mask      = _mm_cmplt_pd(rsq02,rcutoff2);
1317
1318             fscal            = felec;
1319
1320             fscal            = _mm_and_pd(fscal,cutoff_mask);
1321
1322             /* Update vectorial force */
1323             fix0             = _mm_macc_pd(dx02,fscal,fix0);
1324             fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
1325             fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
1326             
1327             fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
1328             fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
1329             fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
1330
1331             }
1332
1333             /**************************
1334              * CALCULATE INTERACTIONS *
1335              **************************/
1336
1337             if (gmx_mm_any_lt(rsq10,rcutoff2))
1338             {
1339
1340             /* REACTION-FIELD ELECTROSTATICS */
1341             felec            = _mm_mul_pd(qq10,_mm_msub_pd(rinv10,rinvsq10,krf2));
1342
1343             cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
1344
1345             fscal            = felec;
1346
1347             fscal            = _mm_and_pd(fscal,cutoff_mask);
1348
1349             /* Update vectorial force */
1350             fix1             = _mm_macc_pd(dx10,fscal,fix1);
1351             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
1352             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
1353             
1354             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
1355             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
1356             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
1357
1358             }
1359
1360             /**************************
1361              * CALCULATE INTERACTIONS *
1362              **************************/
1363
1364             if (gmx_mm_any_lt(rsq11,rcutoff2))
1365             {
1366
1367             /* REACTION-FIELD ELECTROSTATICS */
1368             felec            = _mm_mul_pd(qq11,_mm_msub_pd(rinv11,rinvsq11,krf2));
1369
1370             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
1371
1372             fscal            = felec;
1373
1374             fscal            = _mm_and_pd(fscal,cutoff_mask);
1375
1376             /* Update vectorial force */
1377             fix1             = _mm_macc_pd(dx11,fscal,fix1);
1378             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
1379             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
1380             
1381             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
1382             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
1383             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
1384
1385             }
1386
1387             /**************************
1388              * CALCULATE INTERACTIONS *
1389              **************************/
1390
1391             if (gmx_mm_any_lt(rsq12,rcutoff2))
1392             {
1393
1394             /* REACTION-FIELD ELECTROSTATICS */
1395             felec            = _mm_mul_pd(qq12,_mm_msub_pd(rinv12,rinvsq12,krf2));
1396
1397             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
1398
1399             fscal            = felec;
1400
1401             fscal            = _mm_and_pd(fscal,cutoff_mask);
1402
1403             /* Update vectorial force */
1404             fix1             = _mm_macc_pd(dx12,fscal,fix1);
1405             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
1406             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
1407             
1408             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
1409             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
1410             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
1411
1412             }
1413
1414             /**************************
1415              * CALCULATE INTERACTIONS *
1416              **************************/
1417
1418             if (gmx_mm_any_lt(rsq20,rcutoff2))
1419             {
1420
1421             /* REACTION-FIELD ELECTROSTATICS */
1422             felec            = _mm_mul_pd(qq20,_mm_msub_pd(rinv20,rinvsq20,krf2));
1423
1424             cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
1425
1426             fscal            = felec;
1427
1428             fscal            = _mm_and_pd(fscal,cutoff_mask);
1429
1430             /* Update vectorial force */
1431             fix2             = _mm_macc_pd(dx20,fscal,fix2);
1432             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
1433             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
1434             
1435             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
1436             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
1437             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
1438
1439             }
1440
1441             /**************************
1442              * CALCULATE INTERACTIONS *
1443              **************************/
1444
1445             if (gmx_mm_any_lt(rsq21,rcutoff2))
1446             {
1447
1448             /* REACTION-FIELD ELECTROSTATICS */
1449             felec            = _mm_mul_pd(qq21,_mm_msub_pd(rinv21,rinvsq21,krf2));
1450
1451             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
1452
1453             fscal            = felec;
1454
1455             fscal            = _mm_and_pd(fscal,cutoff_mask);
1456
1457             /* Update vectorial force */
1458             fix2             = _mm_macc_pd(dx21,fscal,fix2);
1459             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
1460             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
1461             
1462             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
1463             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
1464             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
1465
1466             }
1467
1468             /**************************
1469              * CALCULATE INTERACTIONS *
1470              **************************/
1471
1472             if (gmx_mm_any_lt(rsq22,rcutoff2))
1473             {
1474
1475             /* REACTION-FIELD ELECTROSTATICS */
1476             felec            = _mm_mul_pd(qq22,_mm_msub_pd(rinv22,rinvsq22,krf2));
1477
1478             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
1479
1480             fscal            = felec;
1481
1482             fscal            = _mm_and_pd(fscal,cutoff_mask);
1483
1484             /* Update vectorial force */
1485             fix2             = _mm_macc_pd(dx22,fscal,fix2);
1486             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
1487             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
1488             
1489             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
1490             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
1491             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
1492
1493             }
1494
1495             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1496
1497             /* Inner loop uses 304 flops */
1498         }
1499
1500         if(jidx<j_index_end)
1501         {
1502
1503             jnrA             = jjnr[jidx];
1504             j_coord_offsetA  = DIM*jnrA;
1505
1506             /* load j atom coordinates */
1507             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1508                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1509
1510             /* Calculate displacement vector */
1511             dx00             = _mm_sub_pd(ix0,jx0);
1512             dy00             = _mm_sub_pd(iy0,jy0);
1513             dz00             = _mm_sub_pd(iz0,jz0);
1514             dx01             = _mm_sub_pd(ix0,jx1);
1515             dy01             = _mm_sub_pd(iy0,jy1);
1516             dz01             = _mm_sub_pd(iz0,jz1);
1517             dx02             = _mm_sub_pd(ix0,jx2);
1518             dy02             = _mm_sub_pd(iy0,jy2);
1519             dz02             = _mm_sub_pd(iz0,jz2);
1520             dx10             = _mm_sub_pd(ix1,jx0);
1521             dy10             = _mm_sub_pd(iy1,jy0);
1522             dz10             = _mm_sub_pd(iz1,jz0);
1523             dx11             = _mm_sub_pd(ix1,jx1);
1524             dy11             = _mm_sub_pd(iy1,jy1);
1525             dz11             = _mm_sub_pd(iz1,jz1);
1526             dx12             = _mm_sub_pd(ix1,jx2);
1527             dy12             = _mm_sub_pd(iy1,jy2);
1528             dz12             = _mm_sub_pd(iz1,jz2);
1529             dx20             = _mm_sub_pd(ix2,jx0);
1530             dy20             = _mm_sub_pd(iy2,jy0);
1531             dz20             = _mm_sub_pd(iz2,jz0);
1532             dx21             = _mm_sub_pd(ix2,jx1);
1533             dy21             = _mm_sub_pd(iy2,jy1);
1534             dz21             = _mm_sub_pd(iz2,jz1);
1535             dx22             = _mm_sub_pd(ix2,jx2);
1536             dy22             = _mm_sub_pd(iy2,jy2);
1537             dz22             = _mm_sub_pd(iz2,jz2);
1538
1539             /* Calculate squared distance and things based on it */
1540             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1541             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1542             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1543             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1544             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1545             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1546             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1547             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1548             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1549
1550             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1551             rinv01           = gmx_mm_invsqrt_pd(rsq01);
1552             rinv02           = gmx_mm_invsqrt_pd(rsq02);
1553             rinv10           = gmx_mm_invsqrt_pd(rsq10);
1554             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1555             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1556             rinv20           = gmx_mm_invsqrt_pd(rsq20);
1557             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1558             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1559
1560             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
1561             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
1562             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
1563             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
1564             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1565             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1566             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
1567             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1568             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1569
1570             fjx0             = _mm_setzero_pd();
1571             fjy0             = _mm_setzero_pd();
1572             fjz0             = _mm_setzero_pd();
1573             fjx1             = _mm_setzero_pd();
1574             fjy1             = _mm_setzero_pd();
1575             fjz1             = _mm_setzero_pd();
1576             fjx2             = _mm_setzero_pd();
1577             fjy2             = _mm_setzero_pd();
1578             fjz2             = _mm_setzero_pd();
1579
1580             /**************************
1581              * CALCULATE INTERACTIONS *
1582              **************************/
1583
1584             if (gmx_mm_any_lt(rsq00,rcutoff2))
1585             {
1586
1587             /* REACTION-FIELD ELECTROSTATICS */
1588             felec            = _mm_mul_pd(qq00,_mm_msub_pd(rinv00,rinvsq00,krf2));
1589
1590             /* LENNARD-JONES DISPERSION/REPULSION */
1591
1592             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1593             fvdw             = _mm_mul_pd(_mm_msub_pd(c12_00,rinvsix,c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1594
1595             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
1596
1597             fscal            = _mm_add_pd(felec,fvdw);
1598
1599             fscal            = _mm_and_pd(fscal,cutoff_mask);
1600
1601             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1602
1603             /* Update vectorial force */
1604             fix0             = _mm_macc_pd(dx00,fscal,fix0);
1605             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
1606             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
1607             
1608             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
1609             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
1610             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
1611
1612             }
1613
1614             /**************************
1615              * CALCULATE INTERACTIONS *
1616              **************************/
1617
1618             if (gmx_mm_any_lt(rsq01,rcutoff2))
1619             {
1620
1621             /* REACTION-FIELD ELECTROSTATICS */
1622             felec            = _mm_mul_pd(qq01,_mm_msub_pd(rinv01,rinvsq01,krf2));
1623
1624             cutoff_mask      = _mm_cmplt_pd(rsq01,rcutoff2);
1625
1626             fscal            = felec;
1627
1628             fscal            = _mm_and_pd(fscal,cutoff_mask);
1629
1630             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1631
1632             /* Update vectorial force */
1633             fix0             = _mm_macc_pd(dx01,fscal,fix0);
1634             fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
1635             fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
1636             
1637             fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
1638             fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
1639             fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
1640
1641             }
1642
1643             /**************************
1644              * CALCULATE INTERACTIONS *
1645              **************************/
1646
1647             if (gmx_mm_any_lt(rsq02,rcutoff2))
1648             {
1649
1650             /* REACTION-FIELD ELECTROSTATICS */
1651             felec            = _mm_mul_pd(qq02,_mm_msub_pd(rinv02,rinvsq02,krf2));
1652
1653             cutoff_mask      = _mm_cmplt_pd(rsq02,rcutoff2);
1654
1655             fscal            = felec;
1656
1657             fscal            = _mm_and_pd(fscal,cutoff_mask);
1658
1659             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1660
1661             /* Update vectorial force */
1662             fix0             = _mm_macc_pd(dx02,fscal,fix0);
1663             fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
1664             fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
1665             
1666             fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
1667             fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
1668             fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
1669
1670             }
1671
1672             /**************************
1673              * CALCULATE INTERACTIONS *
1674              **************************/
1675
1676             if (gmx_mm_any_lt(rsq10,rcutoff2))
1677             {
1678
1679             /* REACTION-FIELD ELECTROSTATICS */
1680             felec            = _mm_mul_pd(qq10,_mm_msub_pd(rinv10,rinvsq10,krf2));
1681
1682             cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
1683
1684             fscal            = felec;
1685
1686             fscal            = _mm_and_pd(fscal,cutoff_mask);
1687
1688             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1689
1690             /* Update vectorial force */
1691             fix1             = _mm_macc_pd(dx10,fscal,fix1);
1692             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
1693             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
1694             
1695             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
1696             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
1697             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
1698
1699             }
1700
1701             /**************************
1702              * CALCULATE INTERACTIONS *
1703              **************************/
1704
1705             if (gmx_mm_any_lt(rsq11,rcutoff2))
1706             {
1707
1708             /* REACTION-FIELD ELECTROSTATICS */
1709             felec            = _mm_mul_pd(qq11,_mm_msub_pd(rinv11,rinvsq11,krf2));
1710
1711             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
1712
1713             fscal            = felec;
1714
1715             fscal            = _mm_and_pd(fscal,cutoff_mask);
1716
1717             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1718
1719             /* Update vectorial force */
1720             fix1             = _mm_macc_pd(dx11,fscal,fix1);
1721             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
1722             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
1723             
1724             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
1725             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
1726             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
1727
1728             }
1729
1730             /**************************
1731              * CALCULATE INTERACTIONS *
1732              **************************/
1733
1734             if (gmx_mm_any_lt(rsq12,rcutoff2))
1735             {
1736
1737             /* REACTION-FIELD ELECTROSTATICS */
1738             felec            = _mm_mul_pd(qq12,_mm_msub_pd(rinv12,rinvsq12,krf2));
1739
1740             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
1741
1742             fscal            = felec;
1743
1744             fscal            = _mm_and_pd(fscal,cutoff_mask);
1745
1746             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1747
1748             /* Update vectorial force */
1749             fix1             = _mm_macc_pd(dx12,fscal,fix1);
1750             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
1751             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
1752             
1753             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
1754             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
1755             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
1756
1757             }
1758
1759             /**************************
1760              * CALCULATE INTERACTIONS *
1761              **************************/
1762
1763             if (gmx_mm_any_lt(rsq20,rcutoff2))
1764             {
1765
1766             /* REACTION-FIELD ELECTROSTATICS */
1767             felec            = _mm_mul_pd(qq20,_mm_msub_pd(rinv20,rinvsq20,krf2));
1768
1769             cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
1770
1771             fscal            = felec;
1772
1773             fscal            = _mm_and_pd(fscal,cutoff_mask);
1774
1775             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1776
1777             /* Update vectorial force */
1778             fix2             = _mm_macc_pd(dx20,fscal,fix2);
1779             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
1780             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
1781             
1782             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
1783             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
1784             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
1785
1786             }
1787
1788             /**************************
1789              * CALCULATE INTERACTIONS *
1790              **************************/
1791
1792             if (gmx_mm_any_lt(rsq21,rcutoff2))
1793             {
1794
1795             /* REACTION-FIELD ELECTROSTATICS */
1796             felec            = _mm_mul_pd(qq21,_mm_msub_pd(rinv21,rinvsq21,krf2));
1797
1798             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
1799
1800             fscal            = felec;
1801
1802             fscal            = _mm_and_pd(fscal,cutoff_mask);
1803
1804             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1805
1806             /* Update vectorial force */
1807             fix2             = _mm_macc_pd(dx21,fscal,fix2);
1808             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
1809             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
1810             
1811             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
1812             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
1813             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
1814
1815             }
1816
1817             /**************************
1818              * CALCULATE INTERACTIONS *
1819              **************************/
1820
1821             if (gmx_mm_any_lt(rsq22,rcutoff2))
1822             {
1823
1824             /* REACTION-FIELD ELECTROSTATICS */
1825             felec            = _mm_mul_pd(qq22,_mm_msub_pd(rinv22,rinvsq22,krf2));
1826
1827             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
1828
1829             fscal            = felec;
1830
1831             fscal            = _mm_and_pd(fscal,cutoff_mask);
1832
1833             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1834
1835             /* Update vectorial force */
1836             fix2             = _mm_macc_pd(dx22,fscal,fix2);
1837             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
1838             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
1839             
1840             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
1841             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
1842             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
1843
1844             }
1845
1846             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1847
1848             /* Inner loop uses 304 flops */
1849         }
1850
1851         /* End of innermost loop */
1852
1853         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1854                                               f+i_coord_offset,fshift+i_shift_offset);
1855
1856         /* Increment number of inner iterations */
1857         inneriter                  += j_index_end - j_index_start;
1858
1859         /* Outer loop uses 18 flops */
1860     }
1861
1862     /* Increment number of outer iterations */
1863     outeriter        += nri;
1864
1865     /* Update outer/inner flops */
1866
1867     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*304);
1868 }