Added option to gmx nmeig to print ZPE.
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_double / nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_sse2_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse2_double kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_sse2_double.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sse2_double
51  * Electrostatics interaction: ReactionField
52  * VdW interaction:            CubicSplineTable
53  * Geometry:                   Water3-Water3
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sse2_double
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67      * just 0 for non-waters.
68      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB;
74     int              j_coord_offsetA,j_coord_offsetB;
75     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
76     real             rcutoff_scalar;
77     real             *shiftvec,*fshift,*x,*f;
78     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
79     int              vdwioffset0;
80     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
81     int              vdwioffset1;
82     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
83     int              vdwioffset2;
84     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
85     int              vdwjidx0A,vdwjidx0B;
86     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
87     int              vdwjidx1A,vdwjidx1B;
88     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
89     int              vdwjidx2A,vdwjidx2B;
90     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
91     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
92     __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
93     __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
94     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
95     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
96     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
97     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
98     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
99     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
100     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
101     real             *charge;
102     int              nvdwtype;
103     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
104     int              *vdwtype;
105     real             *vdwparam;
106     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
107     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
108     __m128i          vfitab;
109     __m128i          ifour       = _mm_set1_epi32(4);
110     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
111     real             *vftab;
112     __m128d          dummy_mask,cutoff_mask;
113     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
114     __m128d          one     = _mm_set1_pd(1.0);
115     __m128d          two     = _mm_set1_pd(2.0);
116     x                = xx[0];
117     f                = ff[0];
118
119     nri              = nlist->nri;
120     iinr             = nlist->iinr;
121     jindex           = nlist->jindex;
122     jjnr             = nlist->jjnr;
123     shiftidx         = nlist->shift;
124     gid              = nlist->gid;
125     shiftvec         = fr->shift_vec[0];
126     fshift           = fr->fshift[0];
127     facel            = _mm_set1_pd(fr->ic->epsfac);
128     charge           = mdatoms->chargeA;
129     krf              = _mm_set1_pd(fr->ic->k_rf);
130     krf2             = _mm_set1_pd(fr->ic->k_rf*2.0);
131     crf              = _mm_set1_pd(fr->ic->c_rf);
132     nvdwtype         = fr->ntype;
133     vdwparam         = fr->nbfp;
134     vdwtype          = mdatoms->typeA;
135
136     vftab            = kernel_data->table_vdw->data;
137     vftabscale       = _mm_set1_pd(kernel_data->table_vdw->scale);
138
139     /* Setup water-specific parameters */
140     inr              = nlist->iinr[0];
141     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
142     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
143     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
144     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
145
146     jq0              = _mm_set1_pd(charge[inr+0]);
147     jq1              = _mm_set1_pd(charge[inr+1]);
148     jq2              = _mm_set1_pd(charge[inr+2]);
149     vdwjidx0A        = 2*vdwtype[inr+0];
150     qq00             = _mm_mul_pd(iq0,jq0);
151     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
152     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
153     qq01             = _mm_mul_pd(iq0,jq1);
154     qq02             = _mm_mul_pd(iq0,jq2);
155     qq10             = _mm_mul_pd(iq1,jq0);
156     qq11             = _mm_mul_pd(iq1,jq1);
157     qq12             = _mm_mul_pd(iq1,jq2);
158     qq20             = _mm_mul_pd(iq2,jq0);
159     qq21             = _mm_mul_pd(iq2,jq1);
160     qq22             = _mm_mul_pd(iq2,jq2);
161
162     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
163     rcutoff_scalar   = fr->ic->rcoulomb;
164     rcutoff          = _mm_set1_pd(rcutoff_scalar);
165     rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
166
167     /* Avoid stupid compiler warnings */
168     jnrA = jnrB = 0;
169     j_coord_offsetA = 0;
170     j_coord_offsetB = 0;
171
172     outeriter        = 0;
173     inneriter        = 0;
174
175     /* Start outer loop over neighborlists */
176     for(iidx=0; iidx<nri; iidx++)
177     {
178         /* Load shift vector for this list */
179         i_shift_offset   = DIM*shiftidx[iidx];
180
181         /* Load limits for loop over neighbors */
182         j_index_start    = jindex[iidx];
183         j_index_end      = jindex[iidx+1];
184
185         /* Get outer coordinate index */
186         inr              = iinr[iidx];
187         i_coord_offset   = DIM*inr;
188
189         /* Load i particle coords and add shift vector */
190         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
191                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
192
193         fix0             = _mm_setzero_pd();
194         fiy0             = _mm_setzero_pd();
195         fiz0             = _mm_setzero_pd();
196         fix1             = _mm_setzero_pd();
197         fiy1             = _mm_setzero_pd();
198         fiz1             = _mm_setzero_pd();
199         fix2             = _mm_setzero_pd();
200         fiy2             = _mm_setzero_pd();
201         fiz2             = _mm_setzero_pd();
202
203         /* Reset potential sums */
204         velecsum         = _mm_setzero_pd();
205         vvdwsum          = _mm_setzero_pd();
206
207         /* Start inner kernel loop */
208         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
209         {
210
211             /* Get j neighbor index, and coordinate index */
212             jnrA             = jjnr[jidx];
213             jnrB             = jjnr[jidx+1];
214             j_coord_offsetA  = DIM*jnrA;
215             j_coord_offsetB  = DIM*jnrB;
216
217             /* load j atom coordinates */
218             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
219                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
220
221             /* Calculate displacement vector */
222             dx00             = _mm_sub_pd(ix0,jx0);
223             dy00             = _mm_sub_pd(iy0,jy0);
224             dz00             = _mm_sub_pd(iz0,jz0);
225             dx01             = _mm_sub_pd(ix0,jx1);
226             dy01             = _mm_sub_pd(iy0,jy1);
227             dz01             = _mm_sub_pd(iz0,jz1);
228             dx02             = _mm_sub_pd(ix0,jx2);
229             dy02             = _mm_sub_pd(iy0,jy2);
230             dz02             = _mm_sub_pd(iz0,jz2);
231             dx10             = _mm_sub_pd(ix1,jx0);
232             dy10             = _mm_sub_pd(iy1,jy0);
233             dz10             = _mm_sub_pd(iz1,jz0);
234             dx11             = _mm_sub_pd(ix1,jx1);
235             dy11             = _mm_sub_pd(iy1,jy1);
236             dz11             = _mm_sub_pd(iz1,jz1);
237             dx12             = _mm_sub_pd(ix1,jx2);
238             dy12             = _mm_sub_pd(iy1,jy2);
239             dz12             = _mm_sub_pd(iz1,jz2);
240             dx20             = _mm_sub_pd(ix2,jx0);
241             dy20             = _mm_sub_pd(iy2,jy0);
242             dz20             = _mm_sub_pd(iz2,jz0);
243             dx21             = _mm_sub_pd(ix2,jx1);
244             dy21             = _mm_sub_pd(iy2,jy1);
245             dz21             = _mm_sub_pd(iz2,jz1);
246             dx22             = _mm_sub_pd(ix2,jx2);
247             dy22             = _mm_sub_pd(iy2,jy2);
248             dz22             = _mm_sub_pd(iz2,jz2);
249
250             /* Calculate squared distance and things based on it */
251             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
252             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
253             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
254             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
255             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
256             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
257             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
258             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
259             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
260
261             rinv00           = sse2_invsqrt_d(rsq00);
262             rinv01           = sse2_invsqrt_d(rsq01);
263             rinv02           = sse2_invsqrt_d(rsq02);
264             rinv10           = sse2_invsqrt_d(rsq10);
265             rinv11           = sse2_invsqrt_d(rsq11);
266             rinv12           = sse2_invsqrt_d(rsq12);
267             rinv20           = sse2_invsqrt_d(rsq20);
268             rinv21           = sse2_invsqrt_d(rsq21);
269             rinv22           = sse2_invsqrt_d(rsq22);
270
271             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
272             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
273             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
274             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
275             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
276             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
277             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
278             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
279             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
280
281             fjx0             = _mm_setzero_pd();
282             fjy0             = _mm_setzero_pd();
283             fjz0             = _mm_setzero_pd();
284             fjx1             = _mm_setzero_pd();
285             fjy1             = _mm_setzero_pd();
286             fjz1             = _mm_setzero_pd();
287             fjx2             = _mm_setzero_pd();
288             fjy2             = _mm_setzero_pd();
289             fjz2             = _mm_setzero_pd();
290
291             /**************************
292              * CALCULATE INTERACTIONS *
293              **************************/
294
295             if (gmx_mm_any_lt(rsq00,rcutoff2))
296             {
297
298             r00              = _mm_mul_pd(rsq00,rinv00);
299
300             /* Calculate table index by multiplying r with table scale and truncate to integer */
301             rt               = _mm_mul_pd(r00,vftabscale);
302             vfitab           = _mm_cvttpd_epi32(rt);
303             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
304             vfitab           = _mm_slli_epi32(vfitab,3);
305
306             /* REACTION-FIELD ELECTROSTATICS */
307             velec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_add_pd(rinv00,_mm_mul_pd(krf,rsq00)),crf));
308             felec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_mul_pd(rinv00,rinvsq00),krf2));
309
310             /* CUBIC SPLINE TABLE DISPERSION */
311             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
312             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
313             GMX_MM_TRANSPOSE2_PD(Y,F);
314             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
315             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
316             GMX_MM_TRANSPOSE2_PD(G,H);
317             Heps             = _mm_mul_pd(vfeps,H);
318             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
319             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
320             vvdw6            = _mm_mul_pd(c6_00,VV);
321             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
322             fvdw6            = _mm_mul_pd(c6_00,FF);
323
324             /* CUBIC SPLINE TABLE REPULSION */
325             vfitab           = _mm_add_epi32(vfitab,ifour);
326             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
327             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
328             GMX_MM_TRANSPOSE2_PD(Y,F);
329             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
330             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
331             GMX_MM_TRANSPOSE2_PD(G,H);
332             Heps             = _mm_mul_pd(vfeps,H);
333             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
334             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
335             vvdw12           = _mm_mul_pd(c12_00,VV);
336             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
337             fvdw12           = _mm_mul_pd(c12_00,FF);
338             vvdw             = _mm_add_pd(vvdw12,vvdw6);
339             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
340
341             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
342
343             /* Update potential sum for this i atom from the interaction with this j atom. */
344             velec            = _mm_and_pd(velec,cutoff_mask);
345             velecsum         = _mm_add_pd(velecsum,velec);
346             vvdw             = _mm_and_pd(vvdw,cutoff_mask);
347             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
348
349             fscal            = _mm_add_pd(felec,fvdw);
350
351             fscal            = _mm_and_pd(fscal,cutoff_mask);
352
353             /* Calculate temporary vectorial force */
354             tx               = _mm_mul_pd(fscal,dx00);
355             ty               = _mm_mul_pd(fscal,dy00);
356             tz               = _mm_mul_pd(fscal,dz00);
357
358             /* Update vectorial force */
359             fix0             = _mm_add_pd(fix0,tx);
360             fiy0             = _mm_add_pd(fiy0,ty);
361             fiz0             = _mm_add_pd(fiz0,tz);
362
363             fjx0             = _mm_add_pd(fjx0,tx);
364             fjy0             = _mm_add_pd(fjy0,ty);
365             fjz0             = _mm_add_pd(fjz0,tz);
366
367             }
368
369             /**************************
370              * CALCULATE INTERACTIONS *
371              **************************/
372
373             if (gmx_mm_any_lt(rsq01,rcutoff2))
374             {
375
376             /* REACTION-FIELD ELECTROSTATICS */
377             velec            = _mm_mul_pd(qq01,_mm_sub_pd(_mm_add_pd(rinv01,_mm_mul_pd(krf,rsq01)),crf));
378             felec            = _mm_mul_pd(qq01,_mm_sub_pd(_mm_mul_pd(rinv01,rinvsq01),krf2));
379
380             cutoff_mask      = _mm_cmplt_pd(rsq01,rcutoff2);
381
382             /* Update potential sum for this i atom from the interaction with this j atom. */
383             velec            = _mm_and_pd(velec,cutoff_mask);
384             velecsum         = _mm_add_pd(velecsum,velec);
385
386             fscal            = felec;
387
388             fscal            = _mm_and_pd(fscal,cutoff_mask);
389
390             /* Calculate temporary vectorial force */
391             tx               = _mm_mul_pd(fscal,dx01);
392             ty               = _mm_mul_pd(fscal,dy01);
393             tz               = _mm_mul_pd(fscal,dz01);
394
395             /* Update vectorial force */
396             fix0             = _mm_add_pd(fix0,tx);
397             fiy0             = _mm_add_pd(fiy0,ty);
398             fiz0             = _mm_add_pd(fiz0,tz);
399
400             fjx1             = _mm_add_pd(fjx1,tx);
401             fjy1             = _mm_add_pd(fjy1,ty);
402             fjz1             = _mm_add_pd(fjz1,tz);
403
404             }
405
406             /**************************
407              * CALCULATE INTERACTIONS *
408              **************************/
409
410             if (gmx_mm_any_lt(rsq02,rcutoff2))
411             {
412
413             /* REACTION-FIELD ELECTROSTATICS */
414             velec            = _mm_mul_pd(qq02,_mm_sub_pd(_mm_add_pd(rinv02,_mm_mul_pd(krf,rsq02)),crf));
415             felec            = _mm_mul_pd(qq02,_mm_sub_pd(_mm_mul_pd(rinv02,rinvsq02),krf2));
416
417             cutoff_mask      = _mm_cmplt_pd(rsq02,rcutoff2);
418
419             /* Update potential sum for this i atom from the interaction with this j atom. */
420             velec            = _mm_and_pd(velec,cutoff_mask);
421             velecsum         = _mm_add_pd(velecsum,velec);
422
423             fscal            = felec;
424
425             fscal            = _mm_and_pd(fscal,cutoff_mask);
426
427             /* Calculate temporary vectorial force */
428             tx               = _mm_mul_pd(fscal,dx02);
429             ty               = _mm_mul_pd(fscal,dy02);
430             tz               = _mm_mul_pd(fscal,dz02);
431
432             /* Update vectorial force */
433             fix0             = _mm_add_pd(fix0,tx);
434             fiy0             = _mm_add_pd(fiy0,ty);
435             fiz0             = _mm_add_pd(fiz0,tz);
436
437             fjx2             = _mm_add_pd(fjx2,tx);
438             fjy2             = _mm_add_pd(fjy2,ty);
439             fjz2             = _mm_add_pd(fjz2,tz);
440
441             }
442
443             /**************************
444              * CALCULATE INTERACTIONS *
445              **************************/
446
447             if (gmx_mm_any_lt(rsq10,rcutoff2))
448             {
449
450             /* REACTION-FIELD ELECTROSTATICS */
451             velec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_add_pd(rinv10,_mm_mul_pd(krf,rsq10)),crf));
452             felec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_mul_pd(rinv10,rinvsq10),krf2));
453
454             cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
455
456             /* Update potential sum for this i atom from the interaction with this j atom. */
457             velec            = _mm_and_pd(velec,cutoff_mask);
458             velecsum         = _mm_add_pd(velecsum,velec);
459
460             fscal            = felec;
461
462             fscal            = _mm_and_pd(fscal,cutoff_mask);
463
464             /* Calculate temporary vectorial force */
465             tx               = _mm_mul_pd(fscal,dx10);
466             ty               = _mm_mul_pd(fscal,dy10);
467             tz               = _mm_mul_pd(fscal,dz10);
468
469             /* Update vectorial force */
470             fix1             = _mm_add_pd(fix1,tx);
471             fiy1             = _mm_add_pd(fiy1,ty);
472             fiz1             = _mm_add_pd(fiz1,tz);
473
474             fjx0             = _mm_add_pd(fjx0,tx);
475             fjy0             = _mm_add_pd(fjy0,ty);
476             fjz0             = _mm_add_pd(fjz0,tz);
477
478             }
479
480             /**************************
481              * CALCULATE INTERACTIONS *
482              **************************/
483
484             if (gmx_mm_any_lt(rsq11,rcutoff2))
485             {
486
487             /* REACTION-FIELD ELECTROSTATICS */
488             velec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_add_pd(rinv11,_mm_mul_pd(krf,rsq11)),crf));
489             felec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
490
491             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
492
493             /* Update potential sum for this i atom from the interaction with this j atom. */
494             velec            = _mm_and_pd(velec,cutoff_mask);
495             velecsum         = _mm_add_pd(velecsum,velec);
496
497             fscal            = felec;
498
499             fscal            = _mm_and_pd(fscal,cutoff_mask);
500
501             /* Calculate temporary vectorial force */
502             tx               = _mm_mul_pd(fscal,dx11);
503             ty               = _mm_mul_pd(fscal,dy11);
504             tz               = _mm_mul_pd(fscal,dz11);
505
506             /* Update vectorial force */
507             fix1             = _mm_add_pd(fix1,tx);
508             fiy1             = _mm_add_pd(fiy1,ty);
509             fiz1             = _mm_add_pd(fiz1,tz);
510
511             fjx1             = _mm_add_pd(fjx1,tx);
512             fjy1             = _mm_add_pd(fjy1,ty);
513             fjz1             = _mm_add_pd(fjz1,tz);
514
515             }
516
517             /**************************
518              * CALCULATE INTERACTIONS *
519              **************************/
520
521             if (gmx_mm_any_lt(rsq12,rcutoff2))
522             {
523
524             /* REACTION-FIELD ELECTROSTATICS */
525             velec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_add_pd(rinv12,_mm_mul_pd(krf,rsq12)),crf));
526             felec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
527
528             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
529
530             /* Update potential sum for this i atom from the interaction with this j atom. */
531             velec            = _mm_and_pd(velec,cutoff_mask);
532             velecsum         = _mm_add_pd(velecsum,velec);
533
534             fscal            = felec;
535
536             fscal            = _mm_and_pd(fscal,cutoff_mask);
537
538             /* Calculate temporary vectorial force */
539             tx               = _mm_mul_pd(fscal,dx12);
540             ty               = _mm_mul_pd(fscal,dy12);
541             tz               = _mm_mul_pd(fscal,dz12);
542
543             /* Update vectorial force */
544             fix1             = _mm_add_pd(fix1,tx);
545             fiy1             = _mm_add_pd(fiy1,ty);
546             fiz1             = _mm_add_pd(fiz1,tz);
547
548             fjx2             = _mm_add_pd(fjx2,tx);
549             fjy2             = _mm_add_pd(fjy2,ty);
550             fjz2             = _mm_add_pd(fjz2,tz);
551
552             }
553
554             /**************************
555              * CALCULATE INTERACTIONS *
556              **************************/
557
558             if (gmx_mm_any_lt(rsq20,rcutoff2))
559             {
560
561             /* REACTION-FIELD ELECTROSTATICS */
562             velec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_add_pd(rinv20,_mm_mul_pd(krf,rsq20)),crf));
563             felec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_mul_pd(rinv20,rinvsq20),krf2));
564
565             cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
566
567             /* Update potential sum for this i atom from the interaction with this j atom. */
568             velec            = _mm_and_pd(velec,cutoff_mask);
569             velecsum         = _mm_add_pd(velecsum,velec);
570
571             fscal            = felec;
572
573             fscal            = _mm_and_pd(fscal,cutoff_mask);
574
575             /* Calculate temporary vectorial force */
576             tx               = _mm_mul_pd(fscal,dx20);
577             ty               = _mm_mul_pd(fscal,dy20);
578             tz               = _mm_mul_pd(fscal,dz20);
579
580             /* Update vectorial force */
581             fix2             = _mm_add_pd(fix2,tx);
582             fiy2             = _mm_add_pd(fiy2,ty);
583             fiz2             = _mm_add_pd(fiz2,tz);
584
585             fjx0             = _mm_add_pd(fjx0,tx);
586             fjy0             = _mm_add_pd(fjy0,ty);
587             fjz0             = _mm_add_pd(fjz0,tz);
588
589             }
590
591             /**************************
592              * CALCULATE INTERACTIONS *
593              **************************/
594
595             if (gmx_mm_any_lt(rsq21,rcutoff2))
596             {
597
598             /* REACTION-FIELD ELECTROSTATICS */
599             velec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_add_pd(rinv21,_mm_mul_pd(krf,rsq21)),crf));
600             felec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
601
602             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
603
604             /* Update potential sum for this i atom from the interaction with this j atom. */
605             velec            = _mm_and_pd(velec,cutoff_mask);
606             velecsum         = _mm_add_pd(velecsum,velec);
607
608             fscal            = felec;
609
610             fscal            = _mm_and_pd(fscal,cutoff_mask);
611
612             /* Calculate temporary vectorial force */
613             tx               = _mm_mul_pd(fscal,dx21);
614             ty               = _mm_mul_pd(fscal,dy21);
615             tz               = _mm_mul_pd(fscal,dz21);
616
617             /* Update vectorial force */
618             fix2             = _mm_add_pd(fix2,tx);
619             fiy2             = _mm_add_pd(fiy2,ty);
620             fiz2             = _mm_add_pd(fiz2,tz);
621
622             fjx1             = _mm_add_pd(fjx1,tx);
623             fjy1             = _mm_add_pd(fjy1,ty);
624             fjz1             = _mm_add_pd(fjz1,tz);
625
626             }
627
628             /**************************
629              * CALCULATE INTERACTIONS *
630              **************************/
631
632             if (gmx_mm_any_lt(rsq22,rcutoff2))
633             {
634
635             /* REACTION-FIELD ELECTROSTATICS */
636             velec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_add_pd(rinv22,_mm_mul_pd(krf,rsq22)),crf));
637             felec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
638
639             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
640
641             /* Update potential sum for this i atom from the interaction with this j atom. */
642             velec            = _mm_and_pd(velec,cutoff_mask);
643             velecsum         = _mm_add_pd(velecsum,velec);
644
645             fscal            = felec;
646
647             fscal            = _mm_and_pd(fscal,cutoff_mask);
648
649             /* Calculate temporary vectorial force */
650             tx               = _mm_mul_pd(fscal,dx22);
651             ty               = _mm_mul_pd(fscal,dy22);
652             tz               = _mm_mul_pd(fscal,dz22);
653
654             /* Update vectorial force */
655             fix2             = _mm_add_pd(fix2,tx);
656             fiy2             = _mm_add_pd(fiy2,ty);
657             fiz2             = _mm_add_pd(fiz2,tz);
658
659             fjx2             = _mm_add_pd(fjx2,tx);
660             fjy2             = _mm_add_pd(fjy2,ty);
661             fjz2             = _mm_add_pd(fjz2,tz);
662
663             }
664
665             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
666
667             /* Inner loop uses 360 flops */
668         }
669
670         if(jidx<j_index_end)
671         {
672
673             jnrA             = jjnr[jidx];
674             j_coord_offsetA  = DIM*jnrA;
675
676             /* load j atom coordinates */
677             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
678                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
679
680             /* Calculate displacement vector */
681             dx00             = _mm_sub_pd(ix0,jx0);
682             dy00             = _mm_sub_pd(iy0,jy0);
683             dz00             = _mm_sub_pd(iz0,jz0);
684             dx01             = _mm_sub_pd(ix0,jx1);
685             dy01             = _mm_sub_pd(iy0,jy1);
686             dz01             = _mm_sub_pd(iz0,jz1);
687             dx02             = _mm_sub_pd(ix0,jx2);
688             dy02             = _mm_sub_pd(iy0,jy2);
689             dz02             = _mm_sub_pd(iz0,jz2);
690             dx10             = _mm_sub_pd(ix1,jx0);
691             dy10             = _mm_sub_pd(iy1,jy0);
692             dz10             = _mm_sub_pd(iz1,jz0);
693             dx11             = _mm_sub_pd(ix1,jx1);
694             dy11             = _mm_sub_pd(iy1,jy1);
695             dz11             = _mm_sub_pd(iz1,jz1);
696             dx12             = _mm_sub_pd(ix1,jx2);
697             dy12             = _mm_sub_pd(iy1,jy2);
698             dz12             = _mm_sub_pd(iz1,jz2);
699             dx20             = _mm_sub_pd(ix2,jx0);
700             dy20             = _mm_sub_pd(iy2,jy0);
701             dz20             = _mm_sub_pd(iz2,jz0);
702             dx21             = _mm_sub_pd(ix2,jx1);
703             dy21             = _mm_sub_pd(iy2,jy1);
704             dz21             = _mm_sub_pd(iz2,jz1);
705             dx22             = _mm_sub_pd(ix2,jx2);
706             dy22             = _mm_sub_pd(iy2,jy2);
707             dz22             = _mm_sub_pd(iz2,jz2);
708
709             /* Calculate squared distance and things based on it */
710             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
711             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
712             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
713             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
714             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
715             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
716             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
717             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
718             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
719
720             rinv00           = sse2_invsqrt_d(rsq00);
721             rinv01           = sse2_invsqrt_d(rsq01);
722             rinv02           = sse2_invsqrt_d(rsq02);
723             rinv10           = sse2_invsqrt_d(rsq10);
724             rinv11           = sse2_invsqrt_d(rsq11);
725             rinv12           = sse2_invsqrt_d(rsq12);
726             rinv20           = sse2_invsqrt_d(rsq20);
727             rinv21           = sse2_invsqrt_d(rsq21);
728             rinv22           = sse2_invsqrt_d(rsq22);
729
730             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
731             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
732             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
733             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
734             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
735             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
736             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
737             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
738             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
739
740             fjx0             = _mm_setzero_pd();
741             fjy0             = _mm_setzero_pd();
742             fjz0             = _mm_setzero_pd();
743             fjx1             = _mm_setzero_pd();
744             fjy1             = _mm_setzero_pd();
745             fjz1             = _mm_setzero_pd();
746             fjx2             = _mm_setzero_pd();
747             fjy2             = _mm_setzero_pd();
748             fjz2             = _mm_setzero_pd();
749
750             /**************************
751              * CALCULATE INTERACTIONS *
752              **************************/
753
754             if (gmx_mm_any_lt(rsq00,rcutoff2))
755             {
756
757             r00              = _mm_mul_pd(rsq00,rinv00);
758
759             /* Calculate table index by multiplying r with table scale and truncate to integer */
760             rt               = _mm_mul_pd(r00,vftabscale);
761             vfitab           = _mm_cvttpd_epi32(rt);
762             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
763             vfitab           = _mm_slli_epi32(vfitab,3);
764
765             /* REACTION-FIELD ELECTROSTATICS */
766             velec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_add_pd(rinv00,_mm_mul_pd(krf,rsq00)),crf));
767             felec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_mul_pd(rinv00,rinvsq00),krf2));
768
769             /* CUBIC SPLINE TABLE DISPERSION */
770             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
771             F                = _mm_setzero_pd();
772             GMX_MM_TRANSPOSE2_PD(Y,F);
773             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
774             H                = _mm_setzero_pd();
775             GMX_MM_TRANSPOSE2_PD(G,H);
776             Heps             = _mm_mul_pd(vfeps,H);
777             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
778             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
779             vvdw6            = _mm_mul_pd(c6_00,VV);
780             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
781             fvdw6            = _mm_mul_pd(c6_00,FF);
782
783             /* CUBIC SPLINE TABLE REPULSION */
784             vfitab           = _mm_add_epi32(vfitab,ifour);
785             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
786             F                = _mm_setzero_pd();
787             GMX_MM_TRANSPOSE2_PD(Y,F);
788             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
789             H                = _mm_setzero_pd();
790             GMX_MM_TRANSPOSE2_PD(G,H);
791             Heps             = _mm_mul_pd(vfeps,H);
792             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
793             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
794             vvdw12           = _mm_mul_pd(c12_00,VV);
795             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
796             fvdw12           = _mm_mul_pd(c12_00,FF);
797             vvdw             = _mm_add_pd(vvdw12,vvdw6);
798             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
799
800             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
801
802             /* Update potential sum for this i atom from the interaction with this j atom. */
803             velec            = _mm_and_pd(velec,cutoff_mask);
804             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
805             velecsum         = _mm_add_pd(velecsum,velec);
806             vvdw             = _mm_and_pd(vvdw,cutoff_mask);
807             vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
808             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
809
810             fscal            = _mm_add_pd(felec,fvdw);
811
812             fscal            = _mm_and_pd(fscal,cutoff_mask);
813
814             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
815
816             /* Calculate temporary vectorial force */
817             tx               = _mm_mul_pd(fscal,dx00);
818             ty               = _mm_mul_pd(fscal,dy00);
819             tz               = _mm_mul_pd(fscal,dz00);
820
821             /* Update vectorial force */
822             fix0             = _mm_add_pd(fix0,tx);
823             fiy0             = _mm_add_pd(fiy0,ty);
824             fiz0             = _mm_add_pd(fiz0,tz);
825
826             fjx0             = _mm_add_pd(fjx0,tx);
827             fjy0             = _mm_add_pd(fjy0,ty);
828             fjz0             = _mm_add_pd(fjz0,tz);
829
830             }
831
832             /**************************
833              * CALCULATE INTERACTIONS *
834              **************************/
835
836             if (gmx_mm_any_lt(rsq01,rcutoff2))
837             {
838
839             /* REACTION-FIELD ELECTROSTATICS */
840             velec            = _mm_mul_pd(qq01,_mm_sub_pd(_mm_add_pd(rinv01,_mm_mul_pd(krf,rsq01)),crf));
841             felec            = _mm_mul_pd(qq01,_mm_sub_pd(_mm_mul_pd(rinv01,rinvsq01),krf2));
842
843             cutoff_mask      = _mm_cmplt_pd(rsq01,rcutoff2);
844
845             /* Update potential sum for this i atom from the interaction with this j atom. */
846             velec            = _mm_and_pd(velec,cutoff_mask);
847             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
848             velecsum         = _mm_add_pd(velecsum,velec);
849
850             fscal            = felec;
851
852             fscal            = _mm_and_pd(fscal,cutoff_mask);
853
854             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
855
856             /* Calculate temporary vectorial force */
857             tx               = _mm_mul_pd(fscal,dx01);
858             ty               = _mm_mul_pd(fscal,dy01);
859             tz               = _mm_mul_pd(fscal,dz01);
860
861             /* Update vectorial force */
862             fix0             = _mm_add_pd(fix0,tx);
863             fiy0             = _mm_add_pd(fiy0,ty);
864             fiz0             = _mm_add_pd(fiz0,tz);
865
866             fjx1             = _mm_add_pd(fjx1,tx);
867             fjy1             = _mm_add_pd(fjy1,ty);
868             fjz1             = _mm_add_pd(fjz1,tz);
869
870             }
871
872             /**************************
873              * CALCULATE INTERACTIONS *
874              **************************/
875
876             if (gmx_mm_any_lt(rsq02,rcutoff2))
877             {
878
879             /* REACTION-FIELD ELECTROSTATICS */
880             velec            = _mm_mul_pd(qq02,_mm_sub_pd(_mm_add_pd(rinv02,_mm_mul_pd(krf,rsq02)),crf));
881             felec            = _mm_mul_pd(qq02,_mm_sub_pd(_mm_mul_pd(rinv02,rinvsq02),krf2));
882
883             cutoff_mask      = _mm_cmplt_pd(rsq02,rcutoff2);
884
885             /* Update potential sum for this i atom from the interaction with this j atom. */
886             velec            = _mm_and_pd(velec,cutoff_mask);
887             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
888             velecsum         = _mm_add_pd(velecsum,velec);
889
890             fscal            = felec;
891
892             fscal            = _mm_and_pd(fscal,cutoff_mask);
893
894             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
895
896             /* Calculate temporary vectorial force */
897             tx               = _mm_mul_pd(fscal,dx02);
898             ty               = _mm_mul_pd(fscal,dy02);
899             tz               = _mm_mul_pd(fscal,dz02);
900
901             /* Update vectorial force */
902             fix0             = _mm_add_pd(fix0,tx);
903             fiy0             = _mm_add_pd(fiy0,ty);
904             fiz0             = _mm_add_pd(fiz0,tz);
905
906             fjx2             = _mm_add_pd(fjx2,tx);
907             fjy2             = _mm_add_pd(fjy2,ty);
908             fjz2             = _mm_add_pd(fjz2,tz);
909
910             }
911
912             /**************************
913              * CALCULATE INTERACTIONS *
914              **************************/
915
916             if (gmx_mm_any_lt(rsq10,rcutoff2))
917             {
918
919             /* REACTION-FIELD ELECTROSTATICS */
920             velec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_add_pd(rinv10,_mm_mul_pd(krf,rsq10)),crf));
921             felec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_mul_pd(rinv10,rinvsq10),krf2));
922
923             cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
924
925             /* Update potential sum for this i atom from the interaction with this j atom. */
926             velec            = _mm_and_pd(velec,cutoff_mask);
927             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
928             velecsum         = _mm_add_pd(velecsum,velec);
929
930             fscal            = felec;
931
932             fscal            = _mm_and_pd(fscal,cutoff_mask);
933
934             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
935
936             /* Calculate temporary vectorial force */
937             tx               = _mm_mul_pd(fscal,dx10);
938             ty               = _mm_mul_pd(fscal,dy10);
939             tz               = _mm_mul_pd(fscal,dz10);
940
941             /* Update vectorial force */
942             fix1             = _mm_add_pd(fix1,tx);
943             fiy1             = _mm_add_pd(fiy1,ty);
944             fiz1             = _mm_add_pd(fiz1,tz);
945
946             fjx0             = _mm_add_pd(fjx0,tx);
947             fjy0             = _mm_add_pd(fjy0,ty);
948             fjz0             = _mm_add_pd(fjz0,tz);
949
950             }
951
952             /**************************
953              * CALCULATE INTERACTIONS *
954              **************************/
955
956             if (gmx_mm_any_lt(rsq11,rcutoff2))
957             {
958
959             /* REACTION-FIELD ELECTROSTATICS */
960             velec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_add_pd(rinv11,_mm_mul_pd(krf,rsq11)),crf));
961             felec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
962
963             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
964
965             /* Update potential sum for this i atom from the interaction with this j atom. */
966             velec            = _mm_and_pd(velec,cutoff_mask);
967             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
968             velecsum         = _mm_add_pd(velecsum,velec);
969
970             fscal            = felec;
971
972             fscal            = _mm_and_pd(fscal,cutoff_mask);
973
974             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
975
976             /* Calculate temporary vectorial force */
977             tx               = _mm_mul_pd(fscal,dx11);
978             ty               = _mm_mul_pd(fscal,dy11);
979             tz               = _mm_mul_pd(fscal,dz11);
980
981             /* Update vectorial force */
982             fix1             = _mm_add_pd(fix1,tx);
983             fiy1             = _mm_add_pd(fiy1,ty);
984             fiz1             = _mm_add_pd(fiz1,tz);
985
986             fjx1             = _mm_add_pd(fjx1,tx);
987             fjy1             = _mm_add_pd(fjy1,ty);
988             fjz1             = _mm_add_pd(fjz1,tz);
989
990             }
991
992             /**************************
993              * CALCULATE INTERACTIONS *
994              **************************/
995
996             if (gmx_mm_any_lt(rsq12,rcutoff2))
997             {
998
999             /* REACTION-FIELD ELECTROSTATICS */
1000             velec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_add_pd(rinv12,_mm_mul_pd(krf,rsq12)),crf));
1001             felec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
1002
1003             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
1004
1005             /* Update potential sum for this i atom from the interaction with this j atom. */
1006             velec            = _mm_and_pd(velec,cutoff_mask);
1007             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1008             velecsum         = _mm_add_pd(velecsum,velec);
1009
1010             fscal            = felec;
1011
1012             fscal            = _mm_and_pd(fscal,cutoff_mask);
1013
1014             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1015
1016             /* Calculate temporary vectorial force */
1017             tx               = _mm_mul_pd(fscal,dx12);
1018             ty               = _mm_mul_pd(fscal,dy12);
1019             tz               = _mm_mul_pd(fscal,dz12);
1020
1021             /* Update vectorial force */
1022             fix1             = _mm_add_pd(fix1,tx);
1023             fiy1             = _mm_add_pd(fiy1,ty);
1024             fiz1             = _mm_add_pd(fiz1,tz);
1025
1026             fjx2             = _mm_add_pd(fjx2,tx);
1027             fjy2             = _mm_add_pd(fjy2,ty);
1028             fjz2             = _mm_add_pd(fjz2,tz);
1029
1030             }
1031
1032             /**************************
1033              * CALCULATE INTERACTIONS *
1034              **************************/
1035
1036             if (gmx_mm_any_lt(rsq20,rcutoff2))
1037             {
1038
1039             /* REACTION-FIELD ELECTROSTATICS */
1040             velec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_add_pd(rinv20,_mm_mul_pd(krf,rsq20)),crf));
1041             felec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_mul_pd(rinv20,rinvsq20),krf2));
1042
1043             cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
1044
1045             /* Update potential sum for this i atom from the interaction with this j atom. */
1046             velec            = _mm_and_pd(velec,cutoff_mask);
1047             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1048             velecsum         = _mm_add_pd(velecsum,velec);
1049
1050             fscal            = felec;
1051
1052             fscal            = _mm_and_pd(fscal,cutoff_mask);
1053
1054             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1055
1056             /* Calculate temporary vectorial force */
1057             tx               = _mm_mul_pd(fscal,dx20);
1058             ty               = _mm_mul_pd(fscal,dy20);
1059             tz               = _mm_mul_pd(fscal,dz20);
1060
1061             /* Update vectorial force */
1062             fix2             = _mm_add_pd(fix2,tx);
1063             fiy2             = _mm_add_pd(fiy2,ty);
1064             fiz2             = _mm_add_pd(fiz2,tz);
1065
1066             fjx0             = _mm_add_pd(fjx0,tx);
1067             fjy0             = _mm_add_pd(fjy0,ty);
1068             fjz0             = _mm_add_pd(fjz0,tz);
1069
1070             }
1071
1072             /**************************
1073              * CALCULATE INTERACTIONS *
1074              **************************/
1075
1076             if (gmx_mm_any_lt(rsq21,rcutoff2))
1077             {
1078
1079             /* REACTION-FIELD ELECTROSTATICS */
1080             velec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_add_pd(rinv21,_mm_mul_pd(krf,rsq21)),crf));
1081             felec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
1082
1083             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
1084
1085             /* Update potential sum for this i atom from the interaction with this j atom. */
1086             velec            = _mm_and_pd(velec,cutoff_mask);
1087             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1088             velecsum         = _mm_add_pd(velecsum,velec);
1089
1090             fscal            = felec;
1091
1092             fscal            = _mm_and_pd(fscal,cutoff_mask);
1093
1094             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1095
1096             /* Calculate temporary vectorial force */
1097             tx               = _mm_mul_pd(fscal,dx21);
1098             ty               = _mm_mul_pd(fscal,dy21);
1099             tz               = _mm_mul_pd(fscal,dz21);
1100
1101             /* Update vectorial force */
1102             fix2             = _mm_add_pd(fix2,tx);
1103             fiy2             = _mm_add_pd(fiy2,ty);
1104             fiz2             = _mm_add_pd(fiz2,tz);
1105
1106             fjx1             = _mm_add_pd(fjx1,tx);
1107             fjy1             = _mm_add_pd(fjy1,ty);
1108             fjz1             = _mm_add_pd(fjz1,tz);
1109
1110             }
1111
1112             /**************************
1113              * CALCULATE INTERACTIONS *
1114              **************************/
1115
1116             if (gmx_mm_any_lt(rsq22,rcutoff2))
1117             {
1118
1119             /* REACTION-FIELD ELECTROSTATICS */
1120             velec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_add_pd(rinv22,_mm_mul_pd(krf,rsq22)),crf));
1121             felec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
1122
1123             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
1124
1125             /* Update potential sum for this i atom from the interaction with this j atom. */
1126             velec            = _mm_and_pd(velec,cutoff_mask);
1127             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1128             velecsum         = _mm_add_pd(velecsum,velec);
1129
1130             fscal            = felec;
1131
1132             fscal            = _mm_and_pd(fscal,cutoff_mask);
1133
1134             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1135
1136             /* Calculate temporary vectorial force */
1137             tx               = _mm_mul_pd(fscal,dx22);
1138             ty               = _mm_mul_pd(fscal,dy22);
1139             tz               = _mm_mul_pd(fscal,dz22);
1140
1141             /* Update vectorial force */
1142             fix2             = _mm_add_pd(fix2,tx);
1143             fiy2             = _mm_add_pd(fiy2,ty);
1144             fiz2             = _mm_add_pd(fiz2,tz);
1145
1146             fjx2             = _mm_add_pd(fjx2,tx);
1147             fjy2             = _mm_add_pd(fjy2,ty);
1148             fjz2             = _mm_add_pd(fjz2,tz);
1149
1150             }
1151
1152             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1153
1154             /* Inner loop uses 360 flops */
1155         }
1156
1157         /* End of innermost loop */
1158
1159         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1160                                               f+i_coord_offset,fshift+i_shift_offset);
1161
1162         ggid                        = gid[iidx];
1163         /* Update potential energies */
1164         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1165         gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1166
1167         /* Increment number of inner iterations */
1168         inneriter                  += j_index_end - j_index_start;
1169
1170         /* Outer loop uses 20 flops */
1171     }
1172
1173     /* Increment number of outer iterations */
1174     outeriter        += nri;
1175
1176     /* Update outer/inner flops */
1177
1178     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*360);
1179 }
1180 /*
1181  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sse2_double
1182  * Electrostatics interaction: ReactionField
1183  * VdW interaction:            CubicSplineTable
1184  * Geometry:                   Water3-Water3
1185  * Calculate force/pot:        Force
1186  */
1187 void
1188 nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sse2_double
1189                     (t_nblist                    * gmx_restrict       nlist,
1190                      rvec                        * gmx_restrict          xx,
1191                      rvec                        * gmx_restrict          ff,
1192                      struct t_forcerec           * gmx_restrict          fr,
1193                      t_mdatoms                   * gmx_restrict     mdatoms,
1194                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1195                      t_nrnb                      * gmx_restrict        nrnb)
1196 {
1197     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1198      * just 0 for non-waters.
1199      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1200      * jnr indices corresponding to data put in the four positions in the SIMD register.
1201      */
1202     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1203     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1204     int              jnrA,jnrB;
1205     int              j_coord_offsetA,j_coord_offsetB;
1206     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1207     real             rcutoff_scalar;
1208     real             *shiftvec,*fshift,*x,*f;
1209     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1210     int              vdwioffset0;
1211     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1212     int              vdwioffset1;
1213     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1214     int              vdwioffset2;
1215     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1216     int              vdwjidx0A,vdwjidx0B;
1217     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1218     int              vdwjidx1A,vdwjidx1B;
1219     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1220     int              vdwjidx2A,vdwjidx2B;
1221     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1222     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1223     __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1224     __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1225     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1226     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1227     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1228     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1229     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1230     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1231     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
1232     real             *charge;
1233     int              nvdwtype;
1234     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1235     int              *vdwtype;
1236     real             *vdwparam;
1237     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
1238     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
1239     __m128i          vfitab;
1240     __m128i          ifour       = _mm_set1_epi32(4);
1241     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1242     real             *vftab;
1243     __m128d          dummy_mask,cutoff_mask;
1244     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1245     __m128d          one     = _mm_set1_pd(1.0);
1246     __m128d          two     = _mm_set1_pd(2.0);
1247     x                = xx[0];
1248     f                = ff[0];
1249
1250     nri              = nlist->nri;
1251     iinr             = nlist->iinr;
1252     jindex           = nlist->jindex;
1253     jjnr             = nlist->jjnr;
1254     shiftidx         = nlist->shift;
1255     gid              = nlist->gid;
1256     shiftvec         = fr->shift_vec[0];
1257     fshift           = fr->fshift[0];
1258     facel            = _mm_set1_pd(fr->ic->epsfac);
1259     charge           = mdatoms->chargeA;
1260     krf              = _mm_set1_pd(fr->ic->k_rf);
1261     krf2             = _mm_set1_pd(fr->ic->k_rf*2.0);
1262     crf              = _mm_set1_pd(fr->ic->c_rf);
1263     nvdwtype         = fr->ntype;
1264     vdwparam         = fr->nbfp;
1265     vdwtype          = mdatoms->typeA;
1266
1267     vftab            = kernel_data->table_vdw->data;
1268     vftabscale       = _mm_set1_pd(kernel_data->table_vdw->scale);
1269
1270     /* Setup water-specific parameters */
1271     inr              = nlist->iinr[0];
1272     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1273     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1274     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1275     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1276
1277     jq0              = _mm_set1_pd(charge[inr+0]);
1278     jq1              = _mm_set1_pd(charge[inr+1]);
1279     jq2              = _mm_set1_pd(charge[inr+2]);
1280     vdwjidx0A        = 2*vdwtype[inr+0];
1281     qq00             = _mm_mul_pd(iq0,jq0);
1282     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1283     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1284     qq01             = _mm_mul_pd(iq0,jq1);
1285     qq02             = _mm_mul_pd(iq0,jq2);
1286     qq10             = _mm_mul_pd(iq1,jq0);
1287     qq11             = _mm_mul_pd(iq1,jq1);
1288     qq12             = _mm_mul_pd(iq1,jq2);
1289     qq20             = _mm_mul_pd(iq2,jq0);
1290     qq21             = _mm_mul_pd(iq2,jq1);
1291     qq22             = _mm_mul_pd(iq2,jq2);
1292
1293     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1294     rcutoff_scalar   = fr->ic->rcoulomb;
1295     rcutoff          = _mm_set1_pd(rcutoff_scalar);
1296     rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
1297
1298     /* Avoid stupid compiler warnings */
1299     jnrA = jnrB = 0;
1300     j_coord_offsetA = 0;
1301     j_coord_offsetB = 0;
1302
1303     outeriter        = 0;
1304     inneriter        = 0;
1305
1306     /* Start outer loop over neighborlists */
1307     for(iidx=0; iidx<nri; iidx++)
1308     {
1309         /* Load shift vector for this list */
1310         i_shift_offset   = DIM*shiftidx[iidx];
1311
1312         /* Load limits for loop over neighbors */
1313         j_index_start    = jindex[iidx];
1314         j_index_end      = jindex[iidx+1];
1315
1316         /* Get outer coordinate index */
1317         inr              = iinr[iidx];
1318         i_coord_offset   = DIM*inr;
1319
1320         /* Load i particle coords and add shift vector */
1321         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1322                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1323
1324         fix0             = _mm_setzero_pd();
1325         fiy0             = _mm_setzero_pd();
1326         fiz0             = _mm_setzero_pd();
1327         fix1             = _mm_setzero_pd();
1328         fiy1             = _mm_setzero_pd();
1329         fiz1             = _mm_setzero_pd();
1330         fix2             = _mm_setzero_pd();
1331         fiy2             = _mm_setzero_pd();
1332         fiz2             = _mm_setzero_pd();
1333
1334         /* Start inner kernel loop */
1335         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1336         {
1337
1338             /* Get j neighbor index, and coordinate index */
1339             jnrA             = jjnr[jidx];
1340             jnrB             = jjnr[jidx+1];
1341             j_coord_offsetA  = DIM*jnrA;
1342             j_coord_offsetB  = DIM*jnrB;
1343
1344             /* load j atom coordinates */
1345             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1346                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1347
1348             /* Calculate displacement vector */
1349             dx00             = _mm_sub_pd(ix0,jx0);
1350             dy00             = _mm_sub_pd(iy0,jy0);
1351             dz00             = _mm_sub_pd(iz0,jz0);
1352             dx01             = _mm_sub_pd(ix0,jx1);
1353             dy01             = _mm_sub_pd(iy0,jy1);
1354             dz01             = _mm_sub_pd(iz0,jz1);
1355             dx02             = _mm_sub_pd(ix0,jx2);
1356             dy02             = _mm_sub_pd(iy0,jy2);
1357             dz02             = _mm_sub_pd(iz0,jz2);
1358             dx10             = _mm_sub_pd(ix1,jx0);
1359             dy10             = _mm_sub_pd(iy1,jy0);
1360             dz10             = _mm_sub_pd(iz1,jz0);
1361             dx11             = _mm_sub_pd(ix1,jx1);
1362             dy11             = _mm_sub_pd(iy1,jy1);
1363             dz11             = _mm_sub_pd(iz1,jz1);
1364             dx12             = _mm_sub_pd(ix1,jx2);
1365             dy12             = _mm_sub_pd(iy1,jy2);
1366             dz12             = _mm_sub_pd(iz1,jz2);
1367             dx20             = _mm_sub_pd(ix2,jx0);
1368             dy20             = _mm_sub_pd(iy2,jy0);
1369             dz20             = _mm_sub_pd(iz2,jz0);
1370             dx21             = _mm_sub_pd(ix2,jx1);
1371             dy21             = _mm_sub_pd(iy2,jy1);
1372             dz21             = _mm_sub_pd(iz2,jz1);
1373             dx22             = _mm_sub_pd(ix2,jx2);
1374             dy22             = _mm_sub_pd(iy2,jy2);
1375             dz22             = _mm_sub_pd(iz2,jz2);
1376
1377             /* Calculate squared distance and things based on it */
1378             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1379             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1380             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1381             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1382             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1383             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1384             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1385             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1386             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1387
1388             rinv00           = sse2_invsqrt_d(rsq00);
1389             rinv01           = sse2_invsqrt_d(rsq01);
1390             rinv02           = sse2_invsqrt_d(rsq02);
1391             rinv10           = sse2_invsqrt_d(rsq10);
1392             rinv11           = sse2_invsqrt_d(rsq11);
1393             rinv12           = sse2_invsqrt_d(rsq12);
1394             rinv20           = sse2_invsqrt_d(rsq20);
1395             rinv21           = sse2_invsqrt_d(rsq21);
1396             rinv22           = sse2_invsqrt_d(rsq22);
1397
1398             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
1399             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
1400             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
1401             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
1402             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1403             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1404             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
1405             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1406             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1407
1408             fjx0             = _mm_setzero_pd();
1409             fjy0             = _mm_setzero_pd();
1410             fjz0             = _mm_setzero_pd();
1411             fjx1             = _mm_setzero_pd();
1412             fjy1             = _mm_setzero_pd();
1413             fjz1             = _mm_setzero_pd();
1414             fjx2             = _mm_setzero_pd();
1415             fjy2             = _mm_setzero_pd();
1416             fjz2             = _mm_setzero_pd();
1417
1418             /**************************
1419              * CALCULATE INTERACTIONS *
1420              **************************/
1421
1422             if (gmx_mm_any_lt(rsq00,rcutoff2))
1423             {
1424
1425             r00              = _mm_mul_pd(rsq00,rinv00);
1426
1427             /* Calculate table index by multiplying r with table scale and truncate to integer */
1428             rt               = _mm_mul_pd(r00,vftabscale);
1429             vfitab           = _mm_cvttpd_epi32(rt);
1430             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1431             vfitab           = _mm_slli_epi32(vfitab,3);
1432
1433             /* REACTION-FIELD ELECTROSTATICS */
1434             felec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_mul_pd(rinv00,rinvsq00),krf2));
1435
1436             /* CUBIC SPLINE TABLE DISPERSION */
1437             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1438             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1439             GMX_MM_TRANSPOSE2_PD(Y,F);
1440             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1441             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1442             GMX_MM_TRANSPOSE2_PD(G,H);
1443             Heps             = _mm_mul_pd(vfeps,H);
1444             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1445             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1446             fvdw6            = _mm_mul_pd(c6_00,FF);
1447
1448             /* CUBIC SPLINE TABLE REPULSION */
1449             vfitab           = _mm_add_epi32(vfitab,ifour);
1450             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1451             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1452             GMX_MM_TRANSPOSE2_PD(Y,F);
1453             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1454             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1455             GMX_MM_TRANSPOSE2_PD(G,H);
1456             Heps             = _mm_mul_pd(vfeps,H);
1457             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1458             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1459             fvdw12           = _mm_mul_pd(c12_00,FF);
1460             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1461
1462             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
1463
1464             fscal            = _mm_add_pd(felec,fvdw);
1465
1466             fscal            = _mm_and_pd(fscal,cutoff_mask);
1467
1468             /* Calculate temporary vectorial force */
1469             tx               = _mm_mul_pd(fscal,dx00);
1470             ty               = _mm_mul_pd(fscal,dy00);
1471             tz               = _mm_mul_pd(fscal,dz00);
1472
1473             /* Update vectorial force */
1474             fix0             = _mm_add_pd(fix0,tx);
1475             fiy0             = _mm_add_pd(fiy0,ty);
1476             fiz0             = _mm_add_pd(fiz0,tz);
1477
1478             fjx0             = _mm_add_pd(fjx0,tx);
1479             fjy0             = _mm_add_pd(fjy0,ty);
1480             fjz0             = _mm_add_pd(fjz0,tz);
1481
1482             }
1483
1484             /**************************
1485              * CALCULATE INTERACTIONS *
1486              **************************/
1487
1488             if (gmx_mm_any_lt(rsq01,rcutoff2))
1489             {
1490
1491             /* REACTION-FIELD ELECTROSTATICS */
1492             felec            = _mm_mul_pd(qq01,_mm_sub_pd(_mm_mul_pd(rinv01,rinvsq01),krf2));
1493
1494             cutoff_mask      = _mm_cmplt_pd(rsq01,rcutoff2);
1495
1496             fscal            = felec;
1497
1498             fscal            = _mm_and_pd(fscal,cutoff_mask);
1499
1500             /* Calculate temporary vectorial force */
1501             tx               = _mm_mul_pd(fscal,dx01);
1502             ty               = _mm_mul_pd(fscal,dy01);
1503             tz               = _mm_mul_pd(fscal,dz01);
1504
1505             /* Update vectorial force */
1506             fix0             = _mm_add_pd(fix0,tx);
1507             fiy0             = _mm_add_pd(fiy0,ty);
1508             fiz0             = _mm_add_pd(fiz0,tz);
1509
1510             fjx1             = _mm_add_pd(fjx1,tx);
1511             fjy1             = _mm_add_pd(fjy1,ty);
1512             fjz1             = _mm_add_pd(fjz1,tz);
1513
1514             }
1515
1516             /**************************
1517              * CALCULATE INTERACTIONS *
1518              **************************/
1519
1520             if (gmx_mm_any_lt(rsq02,rcutoff2))
1521             {
1522
1523             /* REACTION-FIELD ELECTROSTATICS */
1524             felec            = _mm_mul_pd(qq02,_mm_sub_pd(_mm_mul_pd(rinv02,rinvsq02),krf2));
1525
1526             cutoff_mask      = _mm_cmplt_pd(rsq02,rcutoff2);
1527
1528             fscal            = felec;
1529
1530             fscal            = _mm_and_pd(fscal,cutoff_mask);
1531
1532             /* Calculate temporary vectorial force */
1533             tx               = _mm_mul_pd(fscal,dx02);
1534             ty               = _mm_mul_pd(fscal,dy02);
1535             tz               = _mm_mul_pd(fscal,dz02);
1536
1537             /* Update vectorial force */
1538             fix0             = _mm_add_pd(fix0,tx);
1539             fiy0             = _mm_add_pd(fiy0,ty);
1540             fiz0             = _mm_add_pd(fiz0,tz);
1541
1542             fjx2             = _mm_add_pd(fjx2,tx);
1543             fjy2             = _mm_add_pd(fjy2,ty);
1544             fjz2             = _mm_add_pd(fjz2,tz);
1545
1546             }
1547
1548             /**************************
1549              * CALCULATE INTERACTIONS *
1550              **************************/
1551
1552             if (gmx_mm_any_lt(rsq10,rcutoff2))
1553             {
1554
1555             /* REACTION-FIELD ELECTROSTATICS */
1556             felec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_mul_pd(rinv10,rinvsq10),krf2));
1557
1558             cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
1559
1560             fscal            = felec;
1561
1562             fscal            = _mm_and_pd(fscal,cutoff_mask);
1563
1564             /* Calculate temporary vectorial force */
1565             tx               = _mm_mul_pd(fscal,dx10);
1566             ty               = _mm_mul_pd(fscal,dy10);
1567             tz               = _mm_mul_pd(fscal,dz10);
1568
1569             /* Update vectorial force */
1570             fix1             = _mm_add_pd(fix1,tx);
1571             fiy1             = _mm_add_pd(fiy1,ty);
1572             fiz1             = _mm_add_pd(fiz1,tz);
1573
1574             fjx0             = _mm_add_pd(fjx0,tx);
1575             fjy0             = _mm_add_pd(fjy0,ty);
1576             fjz0             = _mm_add_pd(fjz0,tz);
1577
1578             }
1579
1580             /**************************
1581              * CALCULATE INTERACTIONS *
1582              **************************/
1583
1584             if (gmx_mm_any_lt(rsq11,rcutoff2))
1585             {
1586
1587             /* REACTION-FIELD ELECTROSTATICS */
1588             felec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
1589
1590             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
1591
1592             fscal            = felec;
1593
1594             fscal            = _mm_and_pd(fscal,cutoff_mask);
1595
1596             /* Calculate temporary vectorial force */
1597             tx               = _mm_mul_pd(fscal,dx11);
1598             ty               = _mm_mul_pd(fscal,dy11);
1599             tz               = _mm_mul_pd(fscal,dz11);
1600
1601             /* Update vectorial force */
1602             fix1             = _mm_add_pd(fix1,tx);
1603             fiy1             = _mm_add_pd(fiy1,ty);
1604             fiz1             = _mm_add_pd(fiz1,tz);
1605
1606             fjx1             = _mm_add_pd(fjx1,tx);
1607             fjy1             = _mm_add_pd(fjy1,ty);
1608             fjz1             = _mm_add_pd(fjz1,tz);
1609
1610             }
1611
1612             /**************************
1613              * CALCULATE INTERACTIONS *
1614              **************************/
1615
1616             if (gmx_mm_any_lt(rsq12,rcutoff2))
1617             {
1618
1619             /* REACTION-FIELD ELECTROSTATICS */
1620             felec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
1621
1622             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
1623
1624             fscal            = felec;
1625
1626             fscal            = _mm_and_pd(fscal,cutoff_mask);
1627
1628             /* Calculate temporary vectorial force */
1629             tx               = _mm_mul_pd(fscal,dx12);
1630             ty               = _mm_mul_pd(fscal,dy12);
1631             tz               = _mm_mul_pd(fscal,dz12);
1632
1633             /* Update vectorial force */
1634             fix1             = _mm_add_pd(fix1,tx);
1635             fiy1             = _mm_add_pd(fiy1,ty);
1636             fiz1             = _mm_add_pd(fiz1,tz);
1637
1638             fjx2             = _mm_add_pd(fjx2,tx);
1639             fjy2             = _mm_add_pd(fjy2,ty);
1640             fjz2             = _mm_add_pd(fjz2,tz);
1641
1642             }
1643
1644             /**************************
1645              * CALCULATE INTERACTIONS *
1646              **************************/
1647
1648             if (gmx_mm_any_lt(rsq20,rcutoff2))
1649             {
1650
1651             /* REACTION-FIELD ELECTROSTATICS */
1652             felec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_mul_pd(rinv20,rinvsq20),krf2));
1653
1654             cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
1655
1656             fscal            = felec;
1657
1658             fscal            = _mm_and_pd(fscal,cutoff_mask);
1659
1660             /* Calculate temporary vectorial force */
1661             tx               = _mm_mul_pd(fscal,dx20);
1662             ty               = _mm_mul_pd(fscal,dy20);
1663             tz               = _mm_mul_pd(fscal,dz20);
1664
1665             /* Update vectorial force */
1666             fix2             = _mm_add_pd(fix2,tx);
1667             fiy2             = _mm_add_pd(fiy2,ty);
1668             fiz2             = _mm_add_pd(fiz2,tz);
1669
1670             fjx0             = _mm_add_pd(fjx0,tx);
1671             fjy0             = _mm_add_pd(fjy0,ty);
1672             fjz0             = _mm_add_pd(fjz0,tz);
1673
1674             }
1675
1676             /**************************
1677              * CALCULATE INTERACTIONS *
1678              **************************/
1679
1680             if (gmx_mm_any_lt(rsq21,rcutoff2))
1681             {
1682
1683             /* REACTION-FIELD ELECTROSTATICS */
1684             felec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
1685
1686             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
1687
1688             fscal            = felec;
1689
1690             fscal            = _mm_and_pd(fscal,cutoff_mask);
1691
1692             /* Calculate temporary vectorial force */
1693             tx               = _mm_mul_pd(fscal,dx21);
1694             ty               = _mm_mul_pd(fscal,dy21);
1695             tz               = _mm_mul_pd(fscal,dz21);
1696
1697             /* Update vectorial force */
1698             fix2             = _mm_add_pd(fix2,tx);
1699             fiy2             = _mm_add_pd(fiy2,ty);
1700             fiz2             = _mm_add_pd(fiz2,tz);
1701
1702             fjx1             = _mm_add_pd(fjx1,tx);
1703             fjy1             = _mm_add_pd(fjy1,ty);
1704             fjz1             = _mm_add_pd(fjz1,tz);
1705
1706             }
1707
1708             /**************************
1709              * CALCULATE INTERACTIONS *
1710              **************************/
1711
1712             if (gmx_mm_any_lt(rsq22,rcutoff2))
1713             {
1714
1715             /* REACTION-FIELD ELECTROSTATICS */
1716             felec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
1717
1718             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
1719
1720             fscal            = felec;
1721
1722             fscal            = _mm_and_pd(fscal,cutoff_mask);
1723
1724             /* Calculate temporary vectorial force */
1725             tx               = _mm_mul_pd(fscal,dx22);
1726             ty               = _mm_mul_pd(fscal,dy22);
1727             tz               = _mm_mul_pd(fscal,dz22);
1728
1729             /* Update vectorial force */
1730             fix2             = _mm_add_pd(fix2,tx);
1731             fiy2             = _mm_add_pd(fiy2,ty);
1732             fiz2             = _mm_add_pd(fiz2,tz);
1733
1734             fjx2             = _mm_add_pd(fjx2,tx);
1735             fjy2             = _mm_add_pd(fjy2,ty);
1736             fjz2             = _mm_add_pd(fjz2,tz);
1737
1738             }
1739
1740             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1741
1742             /* Inner loop uses 297 flops */
1743         }
1744
1745         if(jidx<j_index_end)
1746         {
1747
1748             jnrA             = jjnr[jidx];
1749             j_coord_offsetA  = DIM*jnrA;
1750
1751             /* load j atom coordinates */
1752             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1753                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1754
1755             /* Calculate displacement vector */
1756             dx00             = _mm_sub_pd(ix0,jx0);
1757             dy00             = _mm_sub_pd(iy0,jy0);
1758             dz00             = _mm_sub_pd(iz0,jz0);
1759             dx01             = _mm_sub_pd(ix0,jx1);
1760             dy01             = _mm_sub_pd(iy0,jy1);
1761             dz01             = _mm_sub_pd(iz0,jz1);
1762             dx02             = _mm_sub_pd(ix0,jx2);
1763             dy02             = _mm_sub_pd(iy0,jy2);
1764             dz02             = _mm_sub_pd(iz0,jz2);
1765             dx10             = _mm_sub_pd(ix1,jx0);
1766             dy10             = _mm_sub_pd(iy1,jy0);
1767             dz10             = _mm_sub_pd(iz1,jz0);
1768             dx11             = _mm_sub_pd(ix1,jx1);
1769             dy11             = _mm_sub_pd(iy1,jy1);
1770             dz11             = _mm_sub_pd(iz1,jz1);
1771             dx12             = _mm_sub_pd(ix1,jx2);
1772             dy12             = _mm_sub_pd(iy1,jy2);
1773             dz12             = _mm_sub_pd(iz1,jz2);
1774             dx20             = _mm_sub_pd(ix2,jx0);
1775             dy20             = _mm_sub_pd(iy2,jy0);
1776             dz20             = _mm_sub_pd(iz2,jz0);
1777             dx21             = _mm_sub_pd(ix2,jx1);
1778             dy21             = _mm_sub_pd(iy2,jy1);
1779             dz21             = _mm_sub_pd(iz2,jz1);
1780             dx22             = _mm_sub_pd(ix2,jx2);
1781             dy22             = _mm_sub_pd(iy2,jy2);
1782             dz22             = _mm_sub_pd(iz2,jz2);
1783
1784             /* Calculate squared distance and things based on it */
1785             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1786             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1787             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1788             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1789             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1790             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1791             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1792             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1793             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1794
1795             rinv00           = sse2_invsqrt_d(rsq00);
1796             rinv01           = sse2_invsqrt_d(rsq01);
1797             rinv02           = sse2_invsqrt_d(rsq02);
1798             rinv10           = sse2_invsqrt_d(rsq10);
1799             rinv11           = sse2_invsqrt_d(rsq11);
1800             rinv12           = sse2_invsqrt_d(rsq12);
1801             rinv20           = sse2_invsqrt_d(rsq20);
1802             rinv21           = sse2_invsqrt_d(rsq21);
1803             rinv22           = sse2_invsqrt_d(rsq22);
1804
1805             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
1806             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
1807             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
1808             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
1809             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1810             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1811             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
1812             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1813             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1814
1815             fjx0             = _mm_setzero_pd();
1816             fjy0             = _mm_setzero_pd();
1817             fjz0             = _mm_setzero_pd();
1818             fjx1             = _mm_setzero_pd();
1819             fjy1             = _mm_setzero_pd();
1820             fjz1             = _mm_setzero_pd();
1821             fjx2             = _mm_setzero_pd();
1822             fjy2             = _mm_setzero_pd();
1823             fjz2             = _mm_setzero_pd();
1824
1825             /**************************
1826              * CALCULATE INTERACTIONS *
1827              **************************/
1828
1829             if (gmx_mm_any_lt(rsq00,rcutoff2))
1830             {
1831
1832             r00              = _mm_mul_pd(rsq00,rinv00);
1833
1834             /* Calculate table index by multiplying r with table scale and truncate to integer */
1835             rt               = _mm_mul_pd(r00,vftabscale);
1836             vfitab           = _mm_cvttpd_epi32(rt);
1837             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1838             vfitab           = _mm_slli_epi32(vfitab,3);
1839
1840             /* REACTION-FIELD ELECTROSTATICS */
1841             felec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_mul_pd(rinv00,rinvsq00),krf2));
1842
1843             /* CUBIC SPLINE TABLE DISPERSION */
1844             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1845             F                = _mm_setzero_pd();
1846             GMX_MM_TRANSPOSE2_PD(Y,F);
1847             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1848             H                = _mm_setzero_pd();
1849             GMX_MM_TRANSPOSE2_PD(G,H);
1850             Heps             = _mm_mul_pd(vfeps,H);
1851             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1852             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1853             fvdw6            = _mm_mul_pd(c6_00,FF);
1854
1855             /* CUBIC SPLINE TABLE REPULSION */
1856             vfitab           = _mm_add_epi32(vfitab,ifour);
1857             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1858             F                = _mm_setzero_pd();
1859             GMX_MM_TRANSPOSE2_PD(Y,F);
1860             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1861             H                = _mm_setzero_pd();
1862             GMX_MM_TRANSPOSE2_PD(G,H);
1863             Heps             = _mm_mul_pd(vfeps,H);
1864             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1865             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1866             fvdw12           = _mm_mul_pd(c12_00,FF);
1867             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1868
1869             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
1870
1871             fscal            = _mm_add_pd(felec,fvdw);
1872
1873             fscal            = _mm_and_pd(fscal,cutoff_mask);
1874
1875             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1876
1877             /* Calculate temporary vectorial force */
1878             tx               = _mm_mul_pd(fscal,dx00);
1879             ty               = _mm_mul_pd(fscal,dy00);
1880             tz               = _mm_mul_pd(fscal,dz00);
1881
1882             /* Update vectorial force */
1883             fix0             = _mm_add_pd(fix0,tx);
1884             fiy0             = _mm_add_pd(fiy0,ty);
1885             fiz0             = _mm_add_pd(fiz0,tz);
1886
1887             fjx0             = _mm_add_pd(fjx0,tx);
1888             fjy0             = _mm_add_pd(fjy0,ty);
1889             fjz0             = _mm_add_pd(fjz0,tz);
1890
1891             }
1892
1893             /**************************
1894              * CALCULATE INTERACTIONS *
1895              **************************/
1896
1897             if (gmx_mm_any_lt(rsq01,rcutoff2))
1898             {
1899
1900             /* REACTION-FIELD ELECTROSTATICS */
1901             felec            = _mm_mul_pd(qq01,_mm_sub_pd(_mm_mul_pd(rinv01,rinvsq01),krf2));
1902
1903             cutoff_mask      = _mm_cmplt_pd(rsq01,rcutoff2);
1904
1905             fscal            = felec;
1906
1907             fscal            = _mm_and_pd(fscal,cutoff_mask);
1908
1909             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1910
1911             /* Calculate temporary vectorial force */
1912             tx               = _mm_mul_pd(fscal,dx01);
1913             ty               = _mm_mul_pd(fscal,dy01);
1914             tz               = _mm_mul_pd(fscal,dz01);
1915
1916             /* Update vectorial force */
1917             fix0             = _mm_add_pd(fix0,tx);
1918             fiy0             = _mm_add_pd(fiy0,ty);
1919             fiz0             = _mm_add_pd(fiz0,tz);
1920
1921             fjx1             = _mm_add_pd(fjx1,tx);
1922             fjy1             = _mm_add_pd(fjy1,ty);
1923             fjz1             = _mm_add_pd(fjz1,tz);
1924
1925             }
1926
1927             /**************************
1928              * CALCULATE INTERACTIONS *
1929              **************************/
1930
1931             if (gmx_mm_any_lt(rsq02,rcutoff2))
1932             {
1933
1934             /* REACTION-FIELD ELECTROSTATICS */
1935             felec            = _mm_mul_pd(qq02,_mm_sub_pd(_mm_mul_pd(rinv02,rinvsq02),krf2));
1936
1937             cutoff_mask      = _mm_cmplt_pd(rsq02,rcutoff2);
1938
1939             fscal            = felec;
1940
1941             fscal            = _mm_and_pd(fscal,cutoff_mask);
1942
1943             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1944
1945             /* Calculate temporary vectorial force */
1946             tx               = _mm_mul_pd(fscal,dx02);
1947             ty               = _mm_mul_pd(fscal,dy02);
1948             tz               = _mm_mul_pd(fscal,dz02);
1949
1950             /* Update vectorial force */
1951             fix0             = _mm_add_pd(fix0,tx);
1952             fiy0             = _mm_add_pd(fiy0,ty);
1953             fiz0             = _mm_add_pd(fiz0,tz);
1954
1955             fjx2             = _mm_add_pd(fjx2,tx);
1956             fjy2             = _mm_add_pd(fjy2,ty);
1957             fjz2             = _mm_add_pd(fjz2,tz);
1958
1959             }
1960
1961             /**************************
1962              * CALCULATE INTERACTIONS *
1963              **************************/
1964
1965             if (gmx_mm_any_lt(rsq10,rcutoff2))
1966             {
1967
1968             /* REACTION-FIELD ELECTROSTATICS */
1969             felec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_mul_pd(rinv10,rinvsq10),krf2));
1970
1971             cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
1972
1973             fscal            = felec;
1974
1975             fscal            = _mm_and_pd(fscal,cutoff_mask);
1976
1977             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1978
1979             /* Calculate temporary vectorial force */
1980             tx               = _mm_mul_pd(fscal,dx10);
1981             ty               = _mm_mul_pd(fscal,dy10);
1982             tz               = _mm_mul_pd(fscal,dz10);
1983
1984             /* Update vectorial force */
1985             fix1             = _mm_add_pd(fix1,tx);
1986             fiy1             = _mm_add_pd(fiy1,ty);
1987             fiz1             = _mm_add_pd(fiz1,tz);
1988
1989             fjx0             = _mm_add_pd(fjx0,tx);
1990             fjy0             = _mm_add_pd(fjy0,ty);
1991             fjz0             = _mm_add_pd(fjz0,tz);
1992
1993             }
1994
1995             /**************************
1996              * CALCULATE INTERACTIONS *
1997              **************************/
1998
1999             if (gmx_mm_any_lt(rsq11,rcutoff2))
2000             {
2001
2002             /* REACTION-FIELD ELECTROSTATICS */
2003             felec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
2004
2005             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
2006
2007             fscal            = felec;
2008
2009             fscal            = _mm_and_pd(fscal,cutoff_mask);
2010
2011             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2012
2013             /* Calculate temporary vectorial force */
2014             tx               = _mm_mul_pd(fscal,dx11);
2015             ty               = _mm_mul_pd(fscal,dy11);
2016             tz               = _mm_mul_pd(fscal,dz11);
2017
2018             /* Update vectorial force */
2019             fix1             = _mm_add_pd(fix1,tx);
2020             fiy1             = _mm_add_pd(fiy1,ty);
2021             fiz1             = _mm_add_pd(fiz1,tz);
2022
2023             fjx1             = _mm_add_pd(fjx1,tx);
2024             fjy1             = _mm_add_pd(fjy1,ty);
2025             fjz1             = _mm_add_pd(fjz1,tz);
2026
2027             }
2028
2029             /**************************
2030              * CALCULATE INTERACTIONS *
2031              **************************/
2032
2033             if (gmx_mm_any_lt(rsq12,rcutoff2))
2034             {
2035
2036             /* REACTION-FIELD ELECTROSTATICS */
2037             felec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
2038
2039             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
2040
2041             fscal            = felec;
2042
2043             fscal            = _mm_and_pd(fscal,cutoff_mask);
2044
2045             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2046
2047             /* Calculate temporary vectorial force */
2048             tx               = _mm_mul_pd(fscal,dx12);
2049             ty               = _mm_mul_pd(fscal,dy12);
2050             tz               = _mm_mul_pd(fscal,dz12);
2051
2052             /* Update vectorial force */
2053             fix1             = _mm_add_pd(fix1,tx);
2054             fiy1             = _mm_add_pd(fiy1,ty);
2055             fiz1             = _mm_add_pd(fiz1,tz);
2056
2057             fjx2             = _mm_add_pd(fjx2,tx);
2058             fjy2             = _mm_add_pd(fjy2,ty);
2059             fjz2             = _mm_add_pd(fjz2,tz);
2060
2061             }
2062
2063             /**************************
2064              * CALCULATE INTERACTIONS *
2065              **************************/
2066
2067             if (gmx_mm_any_lt(rsq20,rcutoff2))
2068             {
2069
2070             /* REACTION-FIELD ELECTROSTATICS */
2071             felec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_mul_pd(rinv20,rinvsq20),krf2));
2072
2073             cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
2074
2075             fscal            = felec;
2076
2077             fscal            = _mm_and_pd(fscal,cutoff_mask);
2078
2079             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2080
2081             /* Calculate temporary vectorial force */
2082             tx               = _mm_mul_pd(fscal,dx20);
2083             ty               = _mm_mul_pd(fscal,dy20);
2084             tz               = _mm_mul_pd(fscal,dz20);
2085
2086             /* Update vectorial force */
2087             fix2             = _mm_add_pd(fix2,tx);
2088             fiy2             = _mm_add_pd(fiy2,ty);
2089             fiz2             = _mm_add_pd(fiz2,tz);
2090
2091             fjx0             = _mm_add_pd(fjx0,tx);
2092             fjy0             = _mm_add_pd(fjy0,ty);
2093             fjz0             = _mm_add_pd(fjz0,tz);
2094
2095             }
2096
2097             /**************************
2098              * CALCULATE INTERACTIONS *
2099              **************************/
2100
2101             if (gmx_mm_any_lt(rsq21,rcutoff2))
2102             {
2103
2104             /* REACTION-FIELD ELECTROSTATICS */
2105             felec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
2106
2107             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
2108
2109             fscal            = felec;
2110
2111             fscal            = _mm_and_pd(fscal,cutoff_mask);
2112
2113             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2114
2115             /* Calculate temporary vectorial force */
2116             tx               = _mm_mul_pd(fscal,dx21);
2117             ty               = _mm_mul_pd(fscal,dy21);
2118             tz               = _mm_mul_pd(fscal,dz21);
2119
2120             /* Update vectorial force */
2121             fix2             = _mm_add_pd(fix2,tx);
2122             fiy2             = _mm_add_pd(fiy2,ty);
2123             fiz2             = _mm_add_pd(fiz2,tz);
2124
2125             fjx1             = _mm_add_pd(fjx1,tx);
2126             fjy1             = _mm_add_pd(fjy1,ty);
2127             fjz1             = _mm_add_pd(fjz1,tz);
2128
2129             }
2130
2131             /**************************
2132              * CALCULATE INTERACTIONS *
2133              **************************/
2134
2135             if (gmx_mm_any_lt(rsq22,rcutoff2))
2136             {
2137
2138             /* REACTION-FIELD ELECTROSTATICS */
2139             felec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
2140
2141             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
2142
2143             fscal            = felec;
2144
2145             fscal            = _mm_and_pd(fscal,cutoff_mask);
2146
2147             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2148
2149             /* Calculate temporary vectorial force */
2150             tx               = _mm_mul_pd(fscal,dx22);
2151             ty               = _mm_mul_pd(fscal,dy22);
2152             tz               = _mm_mul_pd(fscal,dz22);
2153
2154             /* Update vectorial force */
2155             fix2             = _mm_add_pd(fix2,tx);
2156             fiy2             = _mm_add_pd(fiy2,ty);
2157             fiz2             = _mm_add_pd(fiz2,tz);
2158
2159             fjx2             = _mm_add_pd(fjx2,tx);
2160             fjy2             = _mm_add_pd(fjy2,ty);
2161             fjz2             = _mm_add_pd(fjz2,tz);
2162
2163             }
2164
2165             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2166
2167             /* Inner loop uses 297 flops */
2168         }
2169
2170         /* End of innermost loop */
2171
2172         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2173                                               f+i_coord_offset,fshift+i_shift_offset);
2174
2175         /* Increment number of inner iterations */
2176         inneriter                  += j_index_end - j_index_start;
2177
2178         /* Outer loop uses 18 flops */
2179     }
2180
2181     /* Increment number of outer iterations */
2182     outeriter        += nri;
2183
2184     /* Update outer/inner flops */
2185
2186     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*297);
2187 }