83b4033d71d1e88fa5c195e74a740274572dc3f6
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse4_1_double / nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_sse4_1_double.c
1 /*
2  * Note: this file was generated by the Gromacs sse4_1_double kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_sse4_1_double.h"
34 #include "kernelutil_x86_sse4_1_double.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse4_1_double
38  * Electrostatics interaction: ReactionField
39  * VdW interaction:            CubicSplineTable
40  * Geometry:                   Water4-Water4
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse4_1_double
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54      * just 0 for non-waters.
55      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB;
61     int              j_coord_offsetA,j_coord_offsetB;
62     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
63     real             rcutoff_scalar;
64     real             *shiftvec,*fshift,*x,*f;
65     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
66     int              vdwioffset0;
67     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
68     int              vdwioffset1;
69     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
70     int              vdwioffset2;
71     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72     int              vdwioffset3;
73     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
74     int              vdwjidx0A,vdwjidx0B;
75     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
76     int              vdwjidx1A,vdwjidx1B;
77     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
78     int              vdwjidx2A,vdwjidx2B;
79     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
80     int              vdwjidx3A,vdwjidx3B;
81     __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
82     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
84     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
85     __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
86     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
87     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
88     __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
89     __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
90     __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
91     __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
92     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
93     real             *charge;
94     int              nvdwtype;
95     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
96     int              *vdwtype;
97     real             *vdwparam;
98     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
99     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
100     __m128i          vfitab;
101     __m128i          ifour       = _mm_set1_epi32(4);
102     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
103     real             *vftab;
104     __m128d          dummy_mask,cutoff_mask;
105     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
106     __m128d          one     = _mm_set1_pd(1.0);
107     __m128d          two     = _mm_set1_pd(2.0);
108     x                = xx[0];
109     f                = ff[0];
110
111     nri              = nlist->nri;
112     iinr             = nlist->iinr;
113     jindex           = nlist->jindex;
114     jjnr             = nlist->jjnr;
115     shiftidx         = nlist->shift;
116     gid              = nlist->gid;
117     shiftvec         = fr->shift_vec[0];
118     fshift           = fr->fshift[0];
119     facel            = _mm_set1_pd(fr->epsfac);
120     charge           = mdatoms->chargeA;
121     krf              = _mm_set1_pd(fr->ic->k_rf);
122     krf2             = _mm_set1_pd(fr->ic->k_rf*2.0);
123     crf              = _mm_set1_pd(fr->ic->c_rf);
124     nvdwtype         = fr->ntype;
125     vdwparam         = fr->nbfp;
126     vdwtype          = mdatoms->typeA;
127
128     vftab            = kernel_data->table_vdw->data;
129     vftabscale       = _mm_set1_pd(kernel_data->table_vdw->scale);
130
131     /* Setup water-specific parameters */
132     inr              = nlist->iinr[0];
133     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
134     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
135     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
136     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
137
138     jq1              = _mm_set1_pd(charge[inr+1]);
139     jq2              = _mm_set1_pd(charge[inr+2]);
140     jq3              = _mm_set1_pd(charge[inr+3]);
141     vdwjidx0A        = 2*vdwtype[inr+0];
142     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
143     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
144     qq11             = _mm_mul_pd(iq1,jq1);
145     qq12             = _mm_mul_pd(iq1,jq2);
146     qq13             = _mm_mul_pd(iq1,jq3);
147     qq21             = _mm_mul_pd(iq2,jq1);
148     qq22             = _mm_mul_pd(iq2,jq2);
149     qq23             = _mm_mul_pd(iq2,jq3);
150     qq31             = _mm_mul_pd(iq3,jq1);
151     qq32             = _mm_mul_pd(iq3,jq2);
152     qq33             = _mm_mul_pd(iq3,jq3);
153
154     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
155     rcutoff_scalar   = fr->rcoulomb;
156     rcutoff          = _mm_set1_pd(rcutoff_scalar);
157     rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
158
159     /* Avoid stupid compiler warnings */
160     jnrA = jnrB = 0;
161     j_coord_offsetA = 0;
162     j_coord_offsetB = 0;
163
164     outeriter        = 0;
165     inneriter        = 0;
166
167     /* Start outer loop over neighborlists */
168     for(iidx=0; iidx<nri; iidx++)
169     {
170         /* Load shift vector for this list */
171         i_shift_offset   = DIM*shiftidx[iidx];
172
173         /* Load limits for loop over neighbors */
174         j_index_start    = jindex[iidx];
175         j_index_end      = jindex[iidx+1];
176
177         /* Get outer coordinate index */
178         inr              = iinr[iidx];
179         i_coord_offset   = DIM*inr;
180
181         /* Load i particle coords and add shift vector */
182         gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
183                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
184
185         fix0             = _mm_setzero_pd();
186         fiy0             = _mm_setzero_pd();
187         fiz0             = _mm_setzero_pd();
188         fix1             = _mm_setzero_pd();
189         fiy1             = _mm_setzero_pd();
190         fiz1             = _mm_setzero_pd();
191         fix2             = _mm_setzero_pd();
192         fiy2             = _mm_setzero_pd();
193         fiz2             = _mm_setzero_pd();
194         fix3             = _mm_setzero_pd();
195         fiy3             = _mm_setzero_pd();
196         fiz3             = _mm_setzero_pd();
197
198         /* Reset potential sums */
199         velecsum         = _mm_setzero_pd();
200         vvdwsum          = _mm_setzero_pd();
201
202         /* Start inner kernel loop */
203         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
204         {
205
206             /* Get j neighbor index, and coordinate index */
207             jnrA             = jjnr[jidx];
208             jnrB             = jjnr[jidx+1];
209             j_coord_offsetA  = DIM*jnrA;
210             j_coord_offsetB  = DIM*jnrB;
211
212             /* load j atom coordinates */
213             gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
214                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
215                                               &jy2,&jz2,&jx3,&jy3,&jz3);
216
217             /* Calculate displacement vector */
218             dx00             = _mm_sub_pd(ix0,jx0);
219             dy00             = _mm_sub_pd(iy0,jy0);
220             dz00             = _mm_sub_pd(iz0,jz0);
221             dx11             = _mm_sub_pd(ix1,jx1);
222             dy11             = _mm_sub_pd(iy1,jy1);
223             dz11             = _mm_sub_pd(iz1,jz1);
224             dx12             = _mm_sub_pd(ix1,jx2);
225             dy12             = _mm_sub_pd(iy1,jy2);
226             dz12             = _mm_sub_pd(iz1,jz2);
227             dx13             = _mm_sub_pd(ix1,jx3);
228             dy13             = _mm_sub_pd(iy1,jy3);
229             dz13             = _mm_sub_pd(iz1,jz3);
230             dx21             = _mm_sub_pd(ix2,jx1);
231             dy21             = _mm_sub_pd(iy2,jy1);
232             dz21             = _mm_sub_pd(iz2,jz1);
233             dx22             = _mm_sub_pd(ix2,jx2);
234             dy22             = _mm_sub_pd(iy2,jy2);
235             dz22             = _mm_sub_pd(iz2,jz2);
236             dx23             = _mm_sub_pd(ix2,jx3);
237             dy23             = _mm_sub_pd(iy2,jy3);
238             dz23             = _mm_sub_pd(iz2,jz3);
239             dx31             = _mm_sub_pd(ix3,jx1);
240             dy31             = _mm_sub_pd(iy3,jy1);
241             dz31             = _mm_sub_pd(iz3,jz1);
242             dx32             = _mm_sub_pd(ix3,jx2);
243             dy32             = _mm_sub_pd(iy3,jy2);
244             dz32             = _mm_sub_pd(iz3,jz2);
245             dx33             = _mm_sub_pd(ix3,jx3);
246             dy33             = _mm_sub_pd(iy3,jy3);
247             dz33             = _mm_sub_pd(iz3,jz3);
248
249             /* Calculate squared distance and things based on it */
250             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
251             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
252             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
253             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
254             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
255             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
256             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
257             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
258             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
259             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
260
261             rinv00           = gmx_mm_invsqrt_pd(rsq00);
262             rinv11           = gmx_mm_invsqrt_pd(rsq11);
263             rinv12           = gmx_mm_invsqrt_pd(rsq12);
264             rinv13           = gmx_mm_invsqrt_pd(rsq13);
265             rinv21           = gmx_mm_invsqrt_pd(rsq21);
266             rinv22           = gmx_mm_invsqrt_pd(rsq22);
267             rinv23           = gmx_mm_invsqrt_pd(rsq23);
268             rinv31           = gmx_mm_invsqrt_pd(rsq31);
269             rinv32           = gmx_mm_invsqrt_pd(rsq32);
270             rinv33           = gmx_mm_invsqrt_pd(rsq33);
271
272             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
273             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
274             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
275             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
276             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
277             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
278             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
279             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
280             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
281
282             fjx0             = _mm_setzero_pd();
283             fjy0             = _mm_setzero_pd();
284             fjz0             = _mm_setzero_pd();
285             fjx1             = _mm_setzero_pd();
286             fjy1             = _mm_setzero_pd();
287             fjz1             = _mm_setzero_pd();
288             fjx2             = _mm_setzero_pd();
289             fjy2             = _mm_setzero_pd();
290             fjz2             = _mm_setzero_pd();
291             fjx3             = _mm_setzero_pd();
292             fjy3             = _mm_setzero_pd();
293             fjz3             = _mm_setzero_pd();
294
295             /**************************
296              * CALCULATE INTERACTIONS *
297              **************************/
298
299             r00              = _mm_mul_pd(rsq00,rinv00);
300
301             /* Calculate table index by multiplying r with table scale and truncate to integer */
302             rt               = _mm_mul_pd(r00,vftabscale);
303             vfitab           = _mm_cvttpd_epi32(rt);
304             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
305             vfitab           = _mm_slli_epi32(vfitab,3);
306
307             /* CUBIC SPLINE TABLE DISPERSION */
308             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
309             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
310             GMX_MM_TRANSPOSE2_PD(Y,F);
311             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
312             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
313             GMX_MM_TRANSPOSE2_PD(G,H);
314             Heps             = _mm_mul_pd(vfeps,H);
315             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
316             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
317             vvdw6            = _mm_mul_pd(c6_00,VV);
318             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
319             fvdw6            = _mm_mul_pd(c6_00,FF);
320
321             /* CUBIC SPLINE TABLE REPULSION */
322             vfitab           = _mm_add_epi32(vfitab,ifour);
323             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
324             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
325             GMX_MM_TRANSPOSE2_PD(Y,F);
326             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
327             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
328             GMX_MM_TRANSPOSE2_PD(G,H);
329             Heps             = _mm_mul_pd(vfeps,H);
330             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
331             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
332             vvdw12           = _mm_mul_pd(c12_00,VV);
333             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
334             fvdw12           = _mm_mul_pd(c12_00,FF);
335             vvdw             = _mm_add_pd(vvdw12,vvdw6);
336             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
337
338             /* Update potential sum for this i atom from the interaction with this j atom. */
339             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
340
341             fscal            = fvdw;
342
343             /* Calculate temporary vectorial force */
344             tx               = _mm_mul_pd(fscal,dx00);
345             ty               = _mm_mul_pd(fscal,dy00);
346             tz               = _mm_mul_pd(fscal,dz00);
347
348             /* Update vectorial force */
349             fix0             = _mm_add_pd(fix0,tx);
350             fiy0             = _mm_add_pd(fiy0,ty);
351             fiz0             = _mm_add_pd(fiz0,tz);
352
353             fjx0             = _mm_add_pd(fjx0,tx);
354             fjy0             = _mm_add_pd(fjy0,ty);
355             fjz0             = _mm_add_pd(fjz0,tz);
356
357             /**************************
358              * CALCULATE INTERACTIONS *
359              **************************/
360
361             if (gmx_mm_any_lt(rsq11,rcutoff2))
362             {
363
364             /* REACTION-FIELD ELECTROSTATICS */
365             velec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_add_pd(rinv11,_mm_mul_pd(krf,rsq11)),crf));
366             felec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
367
368             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
369
370             /* Update potential sum for this i atom from the interaction with this j atom. */
371             velec            = _mm_and_pd(velec,cutoff_mask);
372             velecsum         = _mm_add_pd(velecsum,velec);
373
374             fscal            = felec;
375
376             fscal            = _mm_and_pd(fscal,cutoff_mask);
377
378             /* Calculate temporary vectorial force */
379             tx               = _mm_mul_pd(fscal,dx11);
380             ty               = _mm_mul_pd(fscal,dy11);
381             tz               = _mm_mul_pd(fscal,dz11);
382
383             /* Update vectorial force */
384             fix1             = _mm_add_pd(fix1,tx);
385             fiy1             = _mm_add_pd(fiy1,ty);
386             fiz1             = _mm_add_pd(fiz1,tz);
387
388             fjx1             = _mm_add_pd(fjx1,tx);
389             fjy1             = _mm_add_pd(fjy1,ty);
390             fjz1             = _mm_add_pd(fjz1,tz);
391
392             }
393
394             /**************************
395              * CALCULATE INTERACTIONS *
396              **************************/
397
398             if (gmx_mm_any_lt(rsq12,rcutoff2))
399             {
400
401             /* REACTION-FIELD ELECTROSTATICS */
402             velec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_add_pd(rinv12,_mm_mul_pd(krf,rsq12)),crf));
403             felec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
404
405             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
406
407             /* Update potential sum for this i atom from the interaction with this j atom. */
408             velec            = _mm_and_pd(velec,cutoff_mask);
409             velecsum         = _mm_add_pd(velecsum,velec);
410
411             fscal            = felec;
412
413             fscal            = _mm_and_pd(fscal,cutoff_mask);
414
415             /* Calculate temporary vectorial force */
416             tx               = _mm_mul_pd(fscal,dx12);
417             ty               = _mm_mul_pd(fscal,dy12);
418             tz               = _mm_mul_pd(fscal,dz12);
419
420             /* Update vectorial force */
421             fix1             = _mm_add_pd(fix1,tx);
422             fiy1             = _mm_add_pd(fiy1,ty);
423             fiz1             = _mm_add_pd(fiz1,tz);
424
425             fjx2             = _mm_add_pd(fjx2,tx);
426             fjy2             = _mm_add_pd(fjy2,ty);
427             fjz2             = _mm_add_pd(fjz2,tz);
428
429             }
430
431             /**************************
432              * CALCULATE INTERACTIONS *
433              **************************/
434
435             if (gmx_mm_any_lt(rsq13,rcutoff2))
436             {
437
438             /* REACTION-FIELD ELECTROSTATICS */
439             velec            = _mm_mul_pd(qq13,_mm_sub_pd(_mm_add_pd(rinv13,_mm_mul_pd(krf,rsq13)),crf));
440             felec            = _mm_mul_pd(qq13,_mm_sub_pd(_mm_mul_pd(rinv13,rinvsq13),krf2));
441
442             cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
443
444             /* Update potential sum for this i atom from the interaction with this j atom. */
445             velec            = _mm_and_pd(velec,cutoff_mask);
446             velecsum         = _mm_add_pd(velecsum,velec);
447
448             fscal            = felec;
449
450             fscal            = _mm_and_pd(fscal,cutoff_mask);
451
452             /* Calculate temporary vectorial force */
453             tx               = _mm_mul_pd(fscal,dx13);
454             ty               = _mm_mul_pd(fscal,dy13);
455             tz               = _mm_mul_pd(fscal,dz13);
456
457             /* Update vectorial force */
458             fix1             = _mm_add_pd(fix1,tx);
459             fiy1             = _mm_add_pd(fiy1,ty);
460             fiz1             = _mm_add_pd(fiz1,tz);
461
462             fjx3             = _mm_add_pd(fjx3,tx);
463             fjy3             = _mm_add_pd(fjy3,ty);
464             fjz3             = _mm_add_pd(fjz3,tz);
465
466             }
467
468             /**************************
469              * CALCULATE INTERACTIONS *
470              **************************/
471
472             if (gmx_mm_any_lt(rsq21,rcutoff2))
473             {
474
475             /* REACTION-FIELD ELECTROSTATICS */
476             velec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_add_pd(rinv21,_mm_mul_pd(krf,rsq21)),crf));
477             felec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
478
479             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
480
481             /* Update potential sum for this i atom from the interaction with this j atom. */
482             velec            = _mm_and_pd(velec,cutoff_mask);
483             velecsum         = _mm_add_pd(velecsum,velec);
484
485             fscal            = felec;
486
487             fscal            = _mm_and_pd(fscal,cutoff_mask);
488
489             /* Calculate temporary vectorial force */
490             tx               = _mm_mul_pd(fscal,dx21);
491             ty               = _mm_mul_pd(fscal,dy21);
492             tz               = _mm_mul_pd(fscal,dz21);
493
494             /* Update vectorial force */
495             fix2             = _mm_add_pd(fix2,tx);
496             fiy2             = _mm_add_pd(fiy2,ty);
497             fiz2             = _mm_add_pd(fiz2,tz);
498
499             fjx1             = _mm_add_pd(fjx1,tx);
500             fjy1             = _mm_add_pd(fjy1,ty);
501             fjz1             = _mm_add_pd(fjz1,tz);
502
503             }
504
505             /**************************
506              * CALCULATE INTERACTIONS *
507              **************************/
508
509             if (gmx_mm_any_lt(rsq22,rcutoff2))
510             {
511
512             /* REACTION-FIELD ELECTROSTATICS */
513             velec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_add_pd(rinv22,_mm_mul_pd(krf,rsq22)),crf));
514             felec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
515
516             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
517
518             /* Update potential sum for this i atom from the interaction with this j atom. */
519             velec            = _mm_and_pd(velec,cutoff_mask);
520             velecsum         = _mm_add_pd(velecsum,velec);
521
522             fscal            = felec;
523
524             fscal            = _mm_and_pd(fscal,cutoff_mask);
525
526             /* Calculate temporary vectorial force */
527             tx               = _mm_mul_pd(fscal,dx22);
528             ty               = _mm_mul_pd(fscal,dy22);
529             tz               = _mm_mul_pd(fscal,dz22);
530
531             /* Update vectorial force */
532             fix2             = _mm_add_pd(fix2,tx);
533             fiy2             = _mm_add_pd(fiy2,ty);
534             fiz2             = _mm_add_pd(fiz2,tz);
535
536             fjx2             = _mm_add_pd(fjx2,tx);
537             fjy2             = _mm_add_pd(fjy2,ty);
538             fjz2             = _mm_add_pd(fjz2,tz);
539
540             }
541
542             /**************************
543              * CALCULATE INTERACTIONS *
544              **************************/
545
546             if (gmx_mm_any_lt(rsq23,rcutoff2))
547             {
548
549             /* REACTION-FIELD ELECTROSTATICS */
550             velec            = _mm_mul_pd(qq23,_mm_sub_pd(_mm_add_pd(rinv23,_mm_mul_pd(krf,rsq23)),crf));
551             felec            = _mm_mul_pd(qq23,_mm_sub_pd(_mm_mul_pd(rinv23,rinvsq23),krf2));
552
553             cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
554
555             /* Update potential sum for this i atom from the interaction with this j atom. */
556             velec            = _mm_and_pd(velec,cutoff_mask);
557             velecsum         = _mm_add_pd(velecsum,velec);
558
559             fscal            = felec;
560
561             fscal            = _mm_and_pd(fscal,cutoff_mask);
562
563             /* Calculate temporary vectorial force */
564             tx               = _mm_mul_pd(fscal,dx23);
565             ty               = _mm_mul_pd(fscal,dy23);
566             tz               = _mm_mul_pd(fscal,dz23);
567
568             /* Update vectorial force */
569             fix2             = _mm_add_pd(fix2,tx);
570             fiy2             = _mm_add_pd(fiy2,ty);
571             fiz2             = _mm_add_pd(fiz2,tz);
572
573             fjx3             = _mm_add_pd(fjx3,tx);
574             fjy3             = _mm_add_pd(fjy3,ty);
575             fjz3             = _mm_add_pd(fjz3,tz);
576
577             }
578
579             /**************************
580              * CALCULATE INTERACTIONS *
581              **************************/
582
583             if (gmx_mm_any_lt(rsq31,rcutoff2))
584             {
585
586             /* REACTION-FIELD ELECTROSTATICS */
587             velec            = _mm_mul_pd(qq31,_mm_sub_pd(_mm_add_pd(rinv31,_mm_mul_pd(krf,rsq31)),crf));
588             felec            = _mm_mul_pd(qq31,_mm_sub_pd(_mm_mul_pd(rinv31,rinvsq31),krf2));
589
590             cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
591
592             /* Update potential sum for this i atom from the interaction with this j atom. */
593             velec            = _mm_and_pd(velec,cutoff_mask);
594             velecsum         = _mm_add_pd(velecsum,velec);
595
596             fscal            = felec;
597
598             fscal            = _mm_and_pd(fscal,cutoff_mask);
599
600             /* Calculate temporary vectorial force */
601             tx               = _mm_mul_pd(fscal,dx31);
602             ty               = _mm_mul_pd(fscal,dy31);
603             tz               = _mm_mul_pd(fscal,dz31);
604
605             /* Update vectorial force */
606             fix3             = _mm_add_pd(fix3,tx);
607             fiy3             = _mm_add_pd(fiy3,ty);
608             fiz3             = _mm_add_pd(fiz3,tz);
609
610             fjx1             = _mm_add_pd(fjx1,tx);
611             fjy1             = _mm_add_pd(fjy1,ty);
612             fjz1             = _mm_add_pd(fjz1,tz);
613
614             }
615
616             /**************************
617              * CALCULATE INTERACTIONS *
618              **************************/
619
620             if (gmx_mm_any_lt(rsq32,rcutoff2))
621             {
622
623             /* REACTION-FIELD ELECTROSTATICS */
624             velec            = _mm_mul_pd(qq32,_mm_sub_pd(_mm_add_pd(rinv32,_mm_mul_pd(krf,rsq32)),crf));
625             felec            = _mm_mul_pd(qq32,_mm_sub_pd(_mm_mul_pd(rinv32,rinvsq32),krf2));
626
627             cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
628
629             /* Update potential sum for this i atom from the interaction with this j atom. */
630             velec            = _mm_and_pd(velec,cutoff_mask);
631             velecsum         = _mm_add_pd(velecsum,velec);
632
633             fscal            = felec;
634
635             fscal            = _mm_and_pd(fscal,cutoff_mask);
636
637             /* Calculate temporary vectorial force */
638             tx               = _mm_mul_pd(fscal,dx32);
639             ty               = _mm_mul_pd(fscal,dy32);
640             tz               = _mm_mul_pd(fscal,dz32);
641
642             /* Update vectorial force */
643             fix3             = _mm_add_pd(fix3,tx);
644             fiy3             = _mm_add_pd(fiy3,ty);
645             fiz3             = _mm_add_pd(fiz3,tz);
646
647             fjx2             = _mm_add_pd(fjx2,tx);
648             fjy2             = _mm_add_pd(fjy2,ty);
649             fjz2             = _mm_add_pd(fjz2,tz);
650
651             }
652
653             /**************************
654              * CALCULATE INTERACTIONS *
655              **************************/
656
657             if (gmx_mm_any_lt(rsq33,rcutoff2))
658             {
659
660             /* REACTION-FIELD ELECTROSTATICS */
661             velec            = _mm_mul_pd(qq33,_mm_sub_pd(_mm_add_pd(rinv33,_mm_mul_pd(krf,rsq33)),crf));
662             felec            = _mm_mul_pd(qq33,_mm_sub_pd(_mm_mul_pd(rinv33,rinvsq33),krf2));
663
664             cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
665
666             /* Update potential sum for this i atom from the interaction with this j atom. */
667             velec            = _mm_and_pd(velec,cutoff_mask);
668             velecsum         = _mm_add_pd(velecsum,velec);
669
670             fscal            = felec;
671
672             fscal            = _mm_and_pd(fscal,cutoff_mask);
673
674             /* Calculate temporary vectorial force */
675             tx               = _mm_mul_pd(fscal,dx33);
676             ty               = _mm_mul_pd(fscal,dy33);
677             tz               = _mm_mul_pd(fscal,dz33);
678
679             /* Update vectorial force */
680             fix3             = _mm_add_pd(fix3,tx);
681             fiy3             = _mm_add_pd(fiy3,ty);
682             fiz3             = _mm_add_pd(fiz3,tz);
683
684             fjx3             = _mm_add_pd(fjx3,tx);
685             fjy3             = _mm_add_pd(fjy3,ty);
686             fjz3             = _mm_add_pd(fjz3,tz);
687
688             }
689
690             gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
691
692             /* Inner loop uses 383 flops */
693         }
694
695         if(jidx<j_index_end)
696         {
697
698             jnrA             = jjnr[jidx];
699             j_coord_offsetA  = DIM*jnrA;
700
701             /* load j atom coordinates */
702             gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
703                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
704                                               &jy2,&jz2,&jx3,&jy3,&jz3);
705
706             /* Calculate displacement vector */
707             dx00             = _mm_sub_pd(ix0,jx0);
708             dy00             = _mm_sub_pd(iy0,jy0);
709             dz00             = _mm_sub_pd(iz0,jz0);
710             dx11             = _mm_sub_pd(ix1,jx1);
711             dy11             = _mm_sub_pd(iy1,jy1);
712             dz11             = _mm_sub_pd(iz1,jz1);
713             dx12             = _mm_sub_pd(ix1,jx2);
714             dy12             = _mm_sub_pd(iy1,jy2);
715             dz12             = _mm_sub_pd(iz1,jz2);
716             dx13             = _mm_sub_pd(ix1,jx3);
717             dy13             = _mm_sub_pd(iy1,jy3);
718             dz13             = _mm_sub_pd(iz1,jz3);
719             dx21             = _mm_sub_pd(ix2,jx1);
720             dy21             = _mm_sub_pd(iy2,jy1);
721             dz21             = _mm_sub_pd(iz2,jz1);
722             dx22             = _mm_sub_pd(ix2,jx2);
723             dy22             = _mm_sub_pd(iy2,jy2);
724             dz22             = _mm_sub_pd(iz2,jz2);
725             dx23             = _mm_sub_pd(ix2,jx3);
726             dy23             = _mm_sub_pd(iy2,jy3);
727             dz23             = _mm_sub_pd(iz2,jz3);
728             dx31             = _mm_sub_pd(ix3,jx1);
729             dy31             = _mm_sub_pd(iy3,jy1);
730             dz31             = _mm_sub_pd(iz3,jz1);
731             dx32             = _mm_sub_pd(ix3,jx2);
732             dy32             = _mm_sub_pd(iy3,jy2);
733             dz32             = _mm_sub_pd(iz3,jz2);
734             dx33             = _mm_sub_pd(ix3,jx3);
735             dy33             = _mm_sub_pd(iy3,jy3);
736             dz33             = _mm_sub_pd(iz3,jz3);
737
738             /* Calculate squared distance and things based on it */
739             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
740             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
741             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
742             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
743             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
744             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
745             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
746             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
747             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
748             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
749
750             rinv00           = gmx_mm_invsqrt_pd(rsq00);
751             rinv11           = gmx_mm_invsqrt_pd(rsq11);
752             rinv12           = gmx_mm_invsqrt_pd(rsq12);
753             rinv13           = gmx_mm_invsqrt_pd(rsq13);
754             rinv21           = gmx_mm_invsqrt_pd(rsq21);
755             rinv22           = gmx_mm_invsqrt_pd(rsq22);
756             rinv23           = gmx_mm_invsqrt_pd(rsq23);
757             rinv31           = gmx_mm_invsqrt_pd(rsq31);
758             rinv32           = gmx_mm_invsqrt_pd(rsq32);
759             rinv33           = gmx_mm_invsqrt_pd(rsq33);
760
761             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
762             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
763             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
764             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
765             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
766             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
767             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
768             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
769             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
770
771             fjx0             = _mm_setzero_pd();
772             fjy0             = _mm_setzero_pd();
773             fjz0             = _mm_setzero_pd();
774             fjx1             = _mm_setzero_pd();
775             fjy1             = _mm_setzero_pd();
776             fjz1             = _mm_setzero_pd();
777             fjx2             = _mm_setzero_pd();
778             fjy2             = _mm_setzero_pd();
779             fjz2             = _mm_setzero_pd();
780             fjx3             = _mm_setzero_pd();
781             fjy3             = _mm_setzero_pd();
782             fjz3             = _mm_setzero_pd();
783
784             /**************************
785              * CALCULATE INTERACTIONS *
786              **************************/
787
788             r00              = _mm_mul_pd(rsq00,rinv00);
789
790             /* Calculate table index by multiplying r with table scale and truncate to integer */
791             rt               = _mm_mul_pd(r00,vftabscale);
792             vfitab           = _mm_cvttpd_epi32(rt);
793             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
794             vfitab           = _mm_slli_epi32(vfitab,3);
795
796             /* CUBIC SPLINE TABLE DISPERSION */
797             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
798             F                = _mm_setzero_pd();
799             GMX_MM_TRANSPOSE2_PD(Y,F);
800             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
801             H                = _mm_setzero_pd();
802             GMX_MM_TRANSPOSE2_PD(G,H);
803             Heps             = _mm_mul_pd(vfeps,H);
804             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
805             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
806             vvdw6            = _mm_mul_pd(c6_00,VV);
807             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
808             fvdw6            = _mm_mul_pd(c6_00,FF);
809
810             /* CUBIC SPLINE TABLE REPULSION */
811             vfitab           = _mm_add_epi32(vfitab,ifour);
812             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
813             F                = _mm_setzero_pd();
814             GMX_MM_TRANSPOSE2_PD(Y,F);
815             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
816             H                = _mm_setzero_pd();
817             GMX_MM_TRANSPOSE2_PD(G,H);
818             Heps             = _mm_mul_pd(vfeps,H);
819             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
820             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
821             vvdw12           = _mm_mul_pd(c12_00,VV);
822             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
823             fvdw12           = _mm_mul_pd(c12_00,FF);
824             vvdw             = _mm_add_pd(vvdw12,vvdw6);
825             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
826
827             /* Update potential sum for this i atom from the interaction with this j atom. */
828             vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
829             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
830
831             fscal            = fvdw;
832
833             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
834
835             /* Calculate temporary vectorial force */
836             tx               = _mm_mul_pd(fscal,dx00);
837             ty               = _mm_mul_pd(fscal,dy00);
838             tz               = _mm_mul_pd(fscal,dz00);
839
840             /* Update vectorial force */
841             fix0             = _mm_add_pd(fix0,tx);
842             fiy0             = _mm_add_pd(fiy0,ty);
843             fiz0             = _mm_add_pd(fiz0,tz);
844
845             fjx0             = _mm_add_pd(fjx0,tx);
846             fjy0             = _mm_add_pd(fjy0,ty);
847             fjz0             = _mm_add_pd(fjz0,tz);
848
849             /**************************
850              * CALCULATE INTERACTIONS *
851              **************************/
852
853             if (gmx_mm_any_lt(rsq11,rcutoff2))
854             {
855
856             /* REACTION-FIELD ELECTROSTATICS */
857             velec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_add_pd(rinv11,_mm_mul_pd(krf,rsq11)),crf));
858             felec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
859
860             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
861
862             /* Update potential sum for this i atom from the interaction with this j atom. */
863             velec            = _mm_and_pd(velec,cutoff_mask);
864             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
865             velecsum         = _mm_add_pd(velecsum,velec);
866
867             fscal            = felec;
868
869             fscal            = _mm_and_pd(fscal,cutoff_mask);
870
871             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
872
873             /* Calculate temporary vectorial force */
874             tx               = _mm_mul_pd(fscal,dx11);
875             ty               = _mm_mul_pd(fscal,dy11);
876             tz               = _mm_mul_pd(fscal,dz11);
877
878             /* Update vectorial force */
879             fix1             = _mm_add_pd(fix1,tx);
880             fiy1             = _mm_add_pd(fiy1,ty);
881             fiz1             = _mm_add_pd(fiz1,tz);
882
883             fjx1             = _mm_add_pd(fjx1,tx);
884             fjy1             = _mm_add_pd(fjy1,ty);
885             fjz1             = _mm_add_pd(fjz1,tz);
886
887             }
888
889             /**************************
890              * CALCULATE INTERACTIONS *
891              **************************/
892
893             if (gmx_mm_any_lt(rsq12,rcutoff2))
894             {
895
896             /* REACTION-FIELD ELECTROSTATICS */
897             velec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_add_pd(rinv12,_mm_mul_pd(krf,rsq12)),crf));
898             felec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
899
900             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
901
902             /* Update potential sum for this i atom from the interaction with this j atom. */
903             velec            = _mm_and_pd(velec,cutoff_mask);
904             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
905             velecsum         = _mm_add_pd(velecsum,velec);
906
907             fscal            = felec;
908
909             fscal            = _mm_and_pd(fscal,cutoff_mask);
910
911             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
912
913             /* Calculate temporary vectorial force */
914             tx               = _mm_mul_pd(fscal,dx12);
915             ty               = _mm_mul_pd(fscal,dy12);
916             tz               = _mm_mul_pd(fscal,dz12);
917
918             /* Update vectorial force */
919             fix1             = _mm_add_pd(fix1,tx);
920             fiy1             = _mm_add_pd(fiy1,ty);
921             fiz1             = _mm_add_pd(fiz1,tz);
922
923             fjx2             = _mm_add_pd(fjx2,tx);
924             fjy2             = _mm_add_pd(fjy2,ty);
925             fjz2             = _mm_add_pd(fjz2,tz);
926
927             }
928
929             /**************************
930              * CALCULATE INTERACTIONS *
931              **************************/
932
933             if (gmx_mm_any_lt(rsq13,rcutoff2))
934             {
935
936             /* REACTION-FIELD ELECTROSTATICS */
937             velec            = _mm_mul_pd(qq13,_mm_sub_pd(_mm_add_pd(rinv13,_mm_mul_pd(krf,rsq13)),crf));
938             felec            = _mm_mul_pd(qq13,_mm_sub_pd(_mm_mul_pd(rinv13,rinvsq13),krf2));
939
940             cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
941
942             /* Update potential sum for this i atom from the interaction with this j atom. */
943             velec            = _mm_and_pd(velec,cutoff_mask);
944             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
945             velecsum         = _mm_add_pd(velecsum,velec);
946
947             fscal            = felec;
948
949             fscal            = _mm_and_pd(fscal,cutoff_mask);
950
951             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
952
953             /* Calculate temporary vectorial force */
954             tx               = _mm_mul_pd(fscal,dx13);
955             ty               = _mm_mul_pd(fscal,dy13);
956             tz               = _mm_mul_pd(fscal,dz13);
957
958             /* Update vectorial force */
959             fix1             = _mm_add_pd(fix1,tx);
960             fiy1             = _mm_add_pd(fiy1,ty);
961             fiz1             = _mm_add_pd(fiz1,tz);
962
963             fjx3             = _mm_add_pd(fjx3,tx);
964             fjy3             = _mm_add_pd(fjy3,ty);
965             fjz3             = _mm_add_pd(fjz3,tz);
966
967             }
968
969             /**************************
970              * CALCULATE INTERACTIONS *
971              **************************/
972
973             if (gmx_mm_any_lt(rsq21,rcutoff2))
974             {
975
976             /* REACTION-FIELD ELECTROSTATICS */
977             velec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_add_pd(rinv21,_mm_mul_pd(krf,rsq21)),crf));
978             felec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
979
980             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
981
982             /* Update potential sum for this i atom from the interaction with this j atom. */
983             velec            = _mm_and_pd(velec,cutoff_mask);
984             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
985             velecsum         = _mm_add_pd(velecsum,velec);
986
987             fscal            = felec;
988
989             fscal            = _mm_and_pd(fscal,cutoff_mask);
990
991             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
992
993             /* Calculate temporary vectorial force */
994             tx               = _mm_mul_pd(fscal,dx21);
995             ty               = _mm_mul_pd(fscal,dy21);
996             tz               = _mm_mul_pd(fscal,dz21);
997
998             /* Update vectorial force */
999             fix2             = _mm_add_pd(fix2,tx);
1000             fiy2             = _mm_add_pd(fiy2,ty);
1001             fiz2             = _mm_add_pd(fiz2,tz);
1002
1003             fjx1             = _mm_add_pd(fjx1,tx);
1004             fjy1             = _mm_add_pd(fjy1,ty);
1005             fjz1             = _mm_add_pd(fjz1,tz);
1006
1007             }
1008
1009             /**************************
1010              * CALCULATE INTERACTIONS *
1011              **************************/
1012
1013             if (gmx_mm_any_lt(rsq22,rcutoff2))
1014             {
1015
1016             /* REACTION-FIELD ELECTROSTATICS */
1017             velec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_add_pd(rinv22,_mm_mul_pd(krf,rsq22)),crf));
1018             felec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
1019
1020             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
1021
1022             /* Update potential sum for this i atom from the interaction with this j atom. */
1023             velec            = _mm_and_pd(velec,cutoff_mask);
1024             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1025             velecsum         = _mm_add_pd(velecsum,velec);
1026
1027             fscal            = felec;
1028
1029             fscal            = _mm_and_pd(fscal,cutoff_mask);
1030
1031             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1032
1033             /* Calculate temporary vectorial force */
1034             tx               = _mm_mul_pd(fscal,dx22);
1035             ty               = _mm_mul_pd(fscal,dy22);
1036             tz               = _mm_mul_pd(fscal,dz22);
1037
1038             /* Update vectorial force */
1039             fix2             = _mm_add_pd(fix2,tx);
1040             fiy2             = _mm_add_pd(fiy2,ty);
1041             fiz2             = _mm_add_pd(fiz2,tz);
1042
1043             fjx2             = _mm_add_pd(fjx2,tx);
1044             fjy2             = _mm_add_pd(fjy2,ty);
1045             fjz2             = _mm_add_pd(fjz2,tz);
1046
1047             }
1048
1049             /**************************
1050              * CALCULATE INTERACTIONS *
1051              **************************/
1052
1053             if (gmx_mm_any_lt(rsq23,rcutoff2))
1054             {
1055
1056             /* REACTION-FIELD ELECTROSTATICS */
1057             velec            = _mm_mul_pd(qq23,_mm_sub_pd(_mm_add_pd(rinv23,_mm_mul_pd(krf,rsq23)),crf));
1058             felec            = _mm_mul_pd(qq23,_mm_sub_pd(_mm_mul_pd(rinv23,rinvsq23),krf2));
1059
1060             cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
1061
1062             /* Update potential sum for this i atom from the interaction with this j atom. */
1063             velec            = _mm_and_pd(velec,cutoff_mask);
1064             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1065             velecsum         = _mm_add_pd(velecsum,velec);
1066
1067             fscal            = felec;
1068
1069             fscal            = _mm_and_pd(fscal,cutoff_mask);
1070
1071             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1072
1073             /* Calculate temporary vectorial force */
1074             tx               = _mm_mul_pd(fscal,dx23);
1075             ty               = _mm_mul_pd(fscal,dy23);
1076             tz               = _mm_mul_pd(fscal,dz23);
1077
1078             /* Update vectorial force */
1079             fix2             = _mm_add_pd(fix2,tx);
1080             fiy2             = _mm_add_pd(fiy2,ty);
1081             fiz2             = _mm_add_pd(fiz2,tz);
1082
1083             fjx3             = _mm_add_pd(fjx3,tx);
1084             fjy3             = _mm_add_pd(fjy3,ty);
1085             fjz3             = _mm_add_pd(fjz3,tz);
1086
1087             }
1088
1089             /**************************
1090              * CALCULATE INTERACTIONS *
1091              **************************/
1092
1093             if (gmx_mm_any_lt(rsq31,rcutoff2))
1094             {
1095
1096             /* REACTION-FIELD ELECTROSTATICS */
1097             velec            = _mm_mul_pd(qq31,_mm_sub_pd(_mm_add_pd(rinv31,_mm_mul_pd(krf,rsq31)),crf));
1098             felec            = _mm_mul_pd(qq31,_mm_sub_pd(_mm_mul_pd(rinv31,rinvsq31),krf2));
1099
1100             cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
1101
1102             /* Update potential sum for this i atom from the interaction with this j atom. */
1103             velec            = _mm_and_pd(velec,cutoff_mask);
1104             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1105             velecsum         = _mm_add_pd(velecsum,velec);
1106
1107             fscal            = felec;
1108
1109             fscal            = _mm_and_pd(fscal,cutoff_mask);
1110
1111             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1112
1113             /* Calculate temporary vectorial force */
1114             tx               = _mm_mul_pd(fscal,dx31);
1115             ty               = _mm_mul_pd(fscal,dy31);
1116             tz               = _mm_mul_pd(fscal,dz31);
1117
1118             /* Update vectorial force */
1119             fix3             = _mm_add_pd(fix3,tx);
1120             fiy3             = _mm_add_pd(fiy3,ty);
1121             fiz3             = _mm_add_pd(fiz3,tz);
1122
1123             fjx1             = _mm_add_pd(fjx1,tx);
1124             fjy1             = _mm_add_pd(fjy1,ty);
1125             fjz1             = _mm_add_pd(fjz1,tz);
1126
1127             }
1128
1129             /**************************
1130              * CALCULATE INTERACTIONS *
1131              **************************/
1132
1133             if (gmx_mm_any_lt(rsq32,rcutoff2))
1134             {
1135
1136             /* REACTION-FIELD ELECTROSTATICS */
1137             velec            = _mm_mul_pd(qq32,_mm_sub_pd(_mm_add_pd(rinv32,_mm_mul_pd(krf,rsq32)),crf));
1138             felec            = _mm_mul_pd(qq32,_mm_sub_pd(_mm_mul_pd(rinv32,rinvsq32),krf2));
1139
1140             cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
1141
1142             /* Update potential sum for this i atom from the interaction with this j atom. */
1143             velec            = _mm_and_pd(velec,cutoff_mask);
1144             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1145             velecsum         = _mm_add_pd(velecsum,velec);
1146
1147             fscal            = felec;
1148
1149             fscal            = _mm_and_pd(fscal,cutoff_mask);
1150
1151             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1152
1153             /* Calculate temporary vectorial force */
1154             tx               = _mm_mul_pd(fscal,dx32);
1155             ty               = _mm_mul_pd(fscal,dy32);
1156             tz               = _mm_mul_pd(fscal,dz32);
1157
1158             /* Update vectorial force */
1159             fix3             = _mm_add_pd(fix3,tx);
1160             fiy3             = _mm_add_pd(fiy3,ty);
1161             fiz3             = _mm_add_pd(fiz3,tz);
1162
1163             fjx2             = _mm_add_pd(fjx2,tx);
1164             fjy2             = _mm_add_pd(fjy2,ty);
1165             fjz2             = _mm_add_pd(fjz2,tz);
1166
1167             }
1168
1169             /**************************
1170              * CALCULATE INTERACTIONS *
1171              **************************/
1172
1173             if (gmx_mm_any_lt(rsq33,rcutoff2))
1174             {
1175
1176             /* REACTION-FIELD ELECTROSTATICS */
1177             velec            = _mm_mul_pd(qq33,_mm_sub_pd(_mm_add_pd(rinv33,_mm_mul_pd(krf,rsq33)),crf));
1178             felec            = _mm_mul_pd(qq33,_mm_sub_pd(_mm_mul_pd(rinv33,rinvsq33),krf2));
1179
1180             cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
1181
1182             /* Update potential sum for this i atom from the interaction with this j atom. */
1183             velec            = _mm_and_pd(velec,cutoff_mask);
1184             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1185             velecsum         = _mm_add_pd(velecsum,velec);
1186
1187             fscal            = felec;
1188
1189             fscal            = _mm_and_pd(fscal,cutoff_mask);
1190
1191             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1192
1193             /* Calculate temporary vectorial force */
1194             tx               = _mm_mul_pd(fscal,dx33);
1195             ty               = _mm_mul_pd(fscal,dy33);
1196             tz               = _mm_mul_pd(fscal,dz33);
1197
1198             /* Update vectorial force */
1199             fix3             = _mm_add_pd(fix3,tx);
1200             fiy3             = _mm_add_pd(fiy3,ty);
1201             fiz3             = _mm_add_pd(fiz3,tz);
1202
1203             fjx3             = _mm_add_pd(fjx3,tx);
1204             fjy3             = _mm_add_pd(fjy3,ty);
1205             fjz3             = _mm_add_pd(fjz3,tz);
1206
1207             }
1208
1209             gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1210
1211             /* Inner loop uses 383 flops */
1212         }
1213
1214         /* End of innermost loop */
1215
1216         gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1217                                               f+i_coord_offset,fshift+i_shift_offset);
1218
1219         ggid                        = gid[iidx];
1220         /* Update potential energies */
1221         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1222         gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1223
1224         /* Increment number of inner iterations */
1225         inneriter                  += j_index_end - j_index_start;
1226
1227         /* Outer loop uses 26 flops */
1228     }
1229
1230     /* Increment number of outer iterations */
1231     outeriter        += nri;
1232
1233     /* Update outer/inner flops */
1234
1235     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*383);
1236 }
1237 /*
1238  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse4_1_double
1239  * Electrostatics interaction: ReactionField
1240  * VdW interaction:            CubicSplineTable
1241  * Geometry:                   Water4-Water4
1242  * Calculate force/pot:        Force
1243  */
1244 void
1245 nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse4_1_double
1246                     (t_nblist * gmx_restrict                nlist,
1247                      rvec * gmx_restrict                    xx,
1248                      rvec * gmx_restrict                    ff,
1249                      t_forcerec * gmx_restrict              fr,
1250                      t_mdatoms * gmx_restrict               mdatoms,
1251                      nb_kernel_data_t * gmx_restrict        kernel_data,
1252                      t_nrnb * gmx_restrict                  nrnb)
1253 {
1254     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1255      * just 0 for non-waters.
1256      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1257      * jnr indices corresponding to data put in the four positions in the SIMD register.
1258      */
1259     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1260     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1261     int              jnrA,jnrB;
1262     int              j_coord_offsetA,j_coord_offsetB;
1263     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1264     real             rcutoff_scalar;
1265     real             *shiftvec,*fshift,*x,*f;
1266     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1267     int              vdwioffset0;
1268     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1269     int              vdwioffset1;
1270     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1271     int              vdwioffset2;
1272     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1273     int              vdwioffset3;
1274     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1275     int              vdwjidx0A,vdwjidx0B;
1276     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1277     int              vdwjidx1A,vdwjidx1B;
1278     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1279     int              vdwjidx2A,vdwjidx2B;
1280     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1281     int              vdwjidx3A,vdwjidx3B;
1282     __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1283     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1284     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1285     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1286     __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1287     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1288     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1289     __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1290     __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1291     __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1292     __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1293     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
1294     real             *charge;
1295     int              nvdwtype;
1296     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1297     int              *vdwtype;
1298     real             *vdwparam;
1299     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
1300     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
1301     __m128i          vfitab;
1302     __m128i          ifour       = _mm_set1_epi32(4);
1303     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1304     real             *vftab;
1305     __m128d          dummy_mask,cutoff_mask;
1306     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1307     __m128d          one     = _mm_set1_pd(1.0);
1308     __m128d          two     = _mm_set1_pd(2.0);
1309     x                = xx[0];
1310     f                = ff[0];
1311
1312     nri              = nlist->nri;
1313     iinr             = nlist->iinr;
1314     jindex           = nlist->jindex;
1315     jjnr             = nlist->jjnr;
1316     shiftidx         = nlist->shift;
1317     gid              = nlist->gid;
1318     shiftvec         = fr->shift_vec[0];
1319     fshift           = fr->fshift[0];
1320     facel            = _mm_set1_pd(fr->epsfac);
1321     charge           = mdatoms->chargeA;
1322     krf              = _mm_set1_pd(fr->ic->k_rf);
1323     krf2             = _mm_set1_pd(fr->ic->k_rf*2.0);
1324     crf              = _mm_set1_pd(fr->ic->c_rf);
1325     nvdwtype         = fr->ntype;
1326     vdwparam         = fr->nbfp;
1327     vdwtype          = mdatoms->typeA;
1328
1329     vftab            = kernel_data->table_vdw->data;
1330     vftabscale       = _mm_set1_pd(kernel_data->table_vdw->scale);
1331
1332     /* Setup water-specific parameters */
1333     inr              = nlist->iinr[0];
1334     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1335     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1336     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1337     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1338
1339     jq1              = _mm_set1_pd(charge[inr+1]);
1340     jq2              = _mm_set1_pd(charge[inr+2]);
1341     jq3              = _mm_set1_pd(charge[inr+3]);
1342     vdwjidx0A        = 2*vdwtype[inr+0];
1343     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1344     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1345     qq11             = _mm_mul_pd(iq1,jq1);
1346     qq12             = _mm_mul_pd(iq1,jq2);
1347     qq13             = _mm_mul_pd(iq1,jq3);
1348     qq21             = _mm_mul_pd(iq2,jq1);
1349     qq22             = _mm_mul_pd(iq2,jq2);
1350     qq23             = _mm_mul_pd(iq2,jq3);
1351     qq31             = _mm_mul_pd(iq3,jq1);
1352     qq32             = _mm_mul_pd(iq3,jq2);
1353     qq33             = _mm_mul_pd(iq3,jq3);
1354
1355     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1356     rcutoff_scalar   = fr->rcoulomb;
1357     rcutoff          = _mm_set1_pd(rcutoff_scalar);
1358     rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
1359
1360     /* Avoid stupid compiler warnings */
1361     jnrA = jnrB = 0;
1362     j_coord_offsetA = 0;
1363     j_coord_offsetB = 0;
1364
1365     outeriter        = 0;
1366     inneriter        = 0;
1367
1368     /* Start outer loop over neighborlists */
1369     for(iidx=0; iidx<nri; iidx++)
1370     {
1371         /* Load shift vector for this list */
1372         i_shift_offset   = DIM*shiftidx[iidx];
1373
1374         /* Load limits for loop over neighbors */
1375         j_index_start    = jindex[iidx];
1376         j_index_end      = jindex[iidx+1];
1377
1378         /* Get outer coordinate index */
1379         inr              = iinr[iidx];
1380         i_coord_offset   = DIM*inr;
1381
1382         /* Load i particle coords and add shift vector */
1383         gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1384                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1385
1386         fix0             = _mm_setzero_pd();
1387         fiy0             = _mm_setzero_pd();
1388         fiz0             = _mm_setzero_pd();
1389         fix1             = _mm_setzero_pd();
1390         fiy1             = _mm_setzero_pd();
1391         fiz1             = _mm_setzero_pd();
1392         fix2             = _mm_setzero_pd();
1393         fiy2             = _mm_setzero_pd();
1394         fiz2             = _mm_setzero_pd();
1395         fix3             = _mm_setzero_pd();
1396         fiy3             = _mm_setzero_pd();
1397         fiz3             = _mm_setzero_pd();
1398
1399         /* Start inner kernel loop */
1400         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1401         {
1402
1403             /* Get j neighbor index, and coordinate index */
1404             jnrA             = jjnr[jidx];
1405             jnrB             = jjnr[jidx+1];
1406             j_coord_offsetA  = DIM*jnrA;
1407             j_coord_offsetB  = DIM*jnrB;
1408
1409             /* load j atom coordinates */
1410             gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1411                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1412                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1413
1414             /* Calculate displacement vector */
1415             dx00             = _mm_sub_pd(ix0,jx0);
1416             dy00             = _mm_sub_pd(iy0,jy0);
1417             dz00             = _mm_sub_pd(iz0,jz0);
1418             dx11             = _mm_sub_pd(ix1,jx1);
1419             dy11             = _mm_sub_pd(iy1,jy1);
1420             dz11             = _mm_sub_pd(iz1,jz1);
1421             dx12             = _mm_sub_pd(ix1,jx2);
1422             dy12             = _mm_sub_pd(iy1,jy2);
1423             dz12             = _mm_sub_pd(iz1,jz2);
1424             dx13             = _mm_sub_pd(ix1,jx3);
1425             dy13             = _mm_sub_pd(iy1,jy3);
1426             dz13             = _mm_sub_pd(iz1,jz3);
1427             dx21             = _mm_sub_pd(ix2,jx1);
1428             dy21             = _mm_sub_pd(iy2,jy1);
1429             dz21             = _mm_sub_pd(iz2,jz1);
1430             dx22             = _mm_sub_pd(ix2,jx2);
1431             dy22             = _mm_sub_pd(iy2,jy2);
1432             dz22             = _mm_sub_pd(iz2,jz2);
1433             dx23             = _mm_sub_pd(ix2,jx3);
1434             dy23             = _mm_sub_pd(iy2,jy3);
1435             dz23             = _mm_sub_pd(iz2,jz3);
1436             dx31             = _mm_sub_pd(ix3,jx1);
1437             dy31             = _mm_sub_pd(iy3,jy1);
1438             dz31             = _mm_sub_pd(iz3,jz1);
1439             dx32             = _mm_sub_pd(ix3,jx2);
1440             dy32             = _mm_sub_pd(iy3,jy2);
1441             dz32             = _mm_sub_pd(iz3,jz2);
1442             dx33             = _mm_sub_pd(ix3,jx3);
1443             dy33             = _mm_sub_pd(iy3,jy3);
1444             dz33             = _mm_sub_pd(iz3,jz3);
1445
1446             /* Calculate squared distance and things based on it */
1447             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1448             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1449             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1450             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1451             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1452             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1453             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1454             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1455             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1456             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1457
1458             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1459             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1460             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1461             rinv13           = gmx_mm_invsqrt_pd(rsq13);
1462             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1463             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1464             rinv23           = gmx_mm_invsqrt_pd(rsq23);
1465             rinv31           = gmx_mm_invsqrt_pd(rsq31);
1466             rinv32           = gmx_mm_invsqrt_pd(rsq32);
1467             rinv33           = gmx_mm_invsqrt_pd(rsq33);
1468
1469             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1470             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1471             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
1472             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1473             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1474             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
1475             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
1476             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
1477             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
1478
1479             fjx0             = _mm_setzero_pd();
1480             fjy0             = _mm_setzero_pd();
1481             fjz0             = _mm_setzero_pd();
1482             fjx1             = _mm_setzero_pd();
1483             fjy1             = _mm_setzero_pd();
1484             fjz1             = _mm_setzero_pd();
1485             fjx2             = _mm_setzero_pd();
1486             fjy2             = _mm_setzero_pd();
1487             fjz2             = _mm_setzero_pd();
1488             fjx3             = _mm_setzero_pd();
1489             fjy3             = _mm_setzero_pd();
1490             fjz3             = _mm_setzero_pd();
1491
1492             /**************************
1493              * CALCULATE INTERACTIONS *
1494              **************************/
1495
1496             r00              = _mm_mul_pd(rsq00,rinv00);
1497
1498             /* Calculate table index by multiplying r with table scale and truncate to integer */
1499             rt               = _mm_mul_pd(r00,vftabscale);
1500             vfitab           = _mm_cvttpd_epi32(rt);
1501             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1502             vfitab           = _mm_slli_epi32(vfitab,3);
1503
1504             /* CUBIC SPLINE TABLE DISPERSION */
1505             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1506             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1507             GMX_MM_TRANSPOSE2_PD(Y,F);
1508             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1509             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1510             GMX_MM_TRANSPOSE2_PD(G,H);
1511             Heps             = _mm_mul_pd(vfeps,H);
1512             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1513             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1514             fvdw6            = _mm_mul_pd(c6_00,FF);
1515
1516             /* CUBIC SPLINE TABLE REPULSION */
1517             vfitab           = _mm_add_epi32(vfitab,ifour);
1518             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1519             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1520             GMX_MM_TRANSPOSE2_PD(Y,F);
1521             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1522             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1523             GMX_MM_TRANSPOSE2_PD(G,H);
1524             Heps             = _mm_mul_pd(vfeps,H);
1525             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1526             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1527             fvdw12           = _mm_mul_pd(c12_00,FF);
1528             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1529
1530             fscal            = fvdw;
1531
1532             /* Calculate temporary vectorial force */
1533             tx               = _mm_mul_pd(fscal,dx00);
1534             ty               = _mm_mul_pd(fscal,dy00);
1535             tz               = _mm_mul_pd(fscal,dz00);
1536
1537             /* Update vectorial force */
1538             fix0             = _mm_add_pd(fix0,tx);
1539             fiy0             = _mm_add_pd(fiy0,ty);
1540             fiz0             = _mm_add_pd(fiz0,tz);
1541
1542             fjx0             = _mm_add_pd(fjx0,tx);
1543             fjy0             = _mm_add_pd(fjy0,ty);
1544             fjz0             = _mm_add_pd(fjz0,tz);
1545
1546             /**************************
1547              * CALCULATE INTERACTIONS *
1548              **************************/
1549
1550             if (gmx_mm_any_lt(rsq11,rcutoff2))
1551             {
1552
1553             /* REACTION-FIELD ELECTROSTATICS */
1554             felec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
1555
1556             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
1557
1558             fscal            = felec;
1559
1560             fscal            = _mm_and_pd(fscal,cutoff_mask);
1561
1562             /* Calculate temporary vectorial force */
1563             tx               = _mm_mul_pd(fscal,dx11);
1564             ty               = _mm_mul_pd(fscal,dy11);
1565             tz               = _mm_mul_pd(fscal,dz11);
1566
1567             /* Update vectorial force */
1568             fix1             = _mm_add_pd(fix1,tx);
1569             fiy1             = _mm_add_pd(fiy1,ty);
1570             fiz1             = _mm_add_pd(fiz1,tz);
1571
1572             fjx1             = _mm_add_pd(fjx1,tx);
1573             fjy1             = _mm_add_pd(fjy1,ty);
1574             fjz1             = _mm_add_pd(fjz1,tz);
1575
1576             }
1577
1578             /**************************
1579              * CALCULATE INTERACTIONS *
1580              **************************/
1581
1582             if (gmx_mm_any_lt(rsq12,rcutoff2))
1583             {
1584
1585             /* REACTION-FIELD ELECTROSTATICS */
1586             felec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
1587
1588             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
1589
1590             fscal            = felec;
1591
1592             fscal            = _mm_and_pd(fscal,cutoff_mask);
1593
1594             /* Calculate temporary vectorial force */
1595             tx               = _mm_mul_pd(fscal,dx12);
1596             ty               = _mm_mul_pd(fscal,dy12);
1597             tz               = _mm_mul_pd(fscal,dz12);
1598
1599             /* Update vectorial force */
1600             fix1             = _mm_add_pd(fix1,tx);
1601             fiy1             = _mm_add_pd(fiy1,ty);
1602             fiz1             = _mm_add_pd(fiz1,tz);
1603
1604             fjx2             = _mm_add_pd(fjx2,tx);
1605             fjy2             = _mm_add_pd(fjy2,ty);
1606             fjz2             = _mm_add_pd(fjz2,tz);
1607
1608             }
1609
1610             /**************************
1611              * CALCULATE INTERACTIONS *
1612              **************************/
1613
1614             if (gmx_mm_any_lt(rsq13,rcutoff2))
1615             {
1616
1617             /* REACTION-FIELD ELECTROSTATICS */
1618             felec            = _mm_mul_pd(qq13,_mm_sub_pd(_mm_mul_pd(rinv13,rinvsq13),krf2));
1619
1620             cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
1621
1622             fscal            = felec;
1623
1624             fscal            = _mm_and_pd(fscal,cutoff_mask);
1625
1626             /* Calculate temporary vectorial force */
1627             tx               = _mm_mul_pd(fscal,dx13);
1628             ty               = _mm_mul_pd(fscal,dy13);
1629             tz               = _mm_mul_pd(fscal,dz13);
1630
1631             /* Update vectorial force */
1632             fix1             = _mm_add_pd(fix1,tx);
1633             fiy1             = _mm_add_pd(fiy1,ty);
1634             fiz1             = _mm_add_pd(fiz1,tz);
1635
1636             fjx3             = _mm_add_pd(fjx3,tx);
1637             fjy3             = _mm_add_pd(fjy3,ty);
1638             fjz3             = _mm_add_pd(fjz3,tz);
1639
1640             }
1641
1642             /**************************
1643              * CALCULATE INTERACTIONS *
1644              **************************/
1645
1646             if (gmx_mm_any_lt(rsq21,rcutoff2))
1647             {
1648
1649             /* REACTION-FIELD ELECTROSTATICS */
1650             felec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
1651
1652             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
1653
1654             fscal            = felec;
1655
1656             fscal            = _mm_and_pd(fscal,cutoff_mask);
1657
1658             /* Calculate temporary vectorial force */
1659             tx               = _mm_mul_pd(fscal,dx21);
1660             ty               = _mm_mul_pd(fscal,dy21);
1661             tz               = _mm_mul_pd(fscal,dz21);
1662
1663             /* Update vectorial force */
1664             fix2             = _mm_add_pd(fix2,tx);
1665             fiy2             = _mm_add_pd(fiy2,ty);
1666             fiz2             = _mm_add_pd(fiz2,tz);
1667
1668             fjx1             = _mm_add_pd(fjx1,tx);
1669             fjy1             = _mm_add_pd(fjy1,ty);
1670             fjz1             = _mm_add_pd(fjz1,tz);
1671
1672             }
1673
1674             /**************************
1675              * CALCULATE INTERACTIONS *
1676              **************************/
1677
1678             if (gmx_mm_any_lt(rsq22,rcutoff2))
1679             {
1680
1681             /* REACTION-FIELD ELECTROSTATICS */
1682             felec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
1683
1684             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
1685
1686             fscal            = felec;
1687
1688             fscal            = _mm_and_pd(fscal,cutoff_mask);
1689
1690             /* Calculate temporary vectorial force */
1691             tx               = _mm_mul_pd(fscal,dx22);
1692             ty               = _mm_mul_pd(fscal,dy22);
1693             tz               = _mm_mul_pd(fscal,dz22);
1694
1695             /* Update vectorial force */
1696             fix2             = _mm_add_pd(fix2,tx);
1697             fiy2             = _mm_add_pd(fiy2,ty);
1698             fiz2             = _mm_add_pd(fiz2,tz);
1699
1700             fjx2             = _mm_add_pd(fjx2,tx);
1701             fjy2             = _mm_add_pd(fjy2,ty);
1702             fjz2             = _mm_add_pd(fjz2,tz);
1703
1704             }
1705
1706             /**************************
1707              * CALCULATE INTERACTIONS *
1708              **************************/
1709
1710             if (gmx_mm_any_lt(rsq23,rcutoff2))
1711             {
1712
1713             /* REACTION-FIELD ELECTROSTATICS */
1714             felec            = _mm_mul_pd(qq23,_mm_sub_pd(_mm_mul_pd(rinv23,rinvsq23),krf2));
1715
1716             cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
1717
1718             fscal            = felec;
1719
1720             fscal            = _mm_and_pd(fscal,cutoff_mask);
1721
1722             /* Calculate temporary vectorial force */
1723             tx               = _mm_mul_pd(fscal,dx23);
1724             ty               = _mm_mul_pd(fscal,dy23);
1725             tz               = _mm_mul_pd(fscal,dz23);
1726
1727             /* Update vectorial force */
1728             fix2             = _mm_add_pd(fix2,tx);
1729             fiy2             = _mm_add_pd(fiy2,ty);
1730             fiz2             = _mm_add_pd(fiz2,tz);
1731
1732             fjx3             = _mm_add_pd(fjx3,tx);
1733             fjy3             = _mm_add_pd(fjy3,ty);
1734             fjz3             = _mm_add_pd(fjz3,tz);
1735
1736             }
1737
1738             /**************************
1739              * CALCULATE INTERACTIONS *
1740              **************************/
1741
1742             if (gmx_mm_any_lt(rsq31,rcutoff2))
1743             {
1744
1745             /* REACTION-FIELD ELECTROSTATICS */
1746             felec            = _mm_mul_pd(qq31,_mm_sub_pd(_mm_mul_pd(rinv31,rinvsq31),krf2));
1747
1748             cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
1749
1750             fscal            = felec;
1751
1752             fscal            = _mm_and_pd(fscal,cutoff_mask);
1753
1754             /* Calculate temporary vectorial force */
1755             tx               = _mm_mul_pd(fscal,dx31);
1756             ty               = _mm_mul_pd(fscal,dy31);
1757             tz               = _mm_mul_pd(fscal,dz31);
1758
1759             /* Update vectorial force */
1760             fix3             = _mm_add_pd(fix3,tx);
1761             fiy3             = _mm_add_pd(fiy3,ty);
1762             fiz3             = _mm_add_pd(fiz3,tz);
1763
1764             fjx1             = _mm_add_pd(fjx1,tx);
1765             fjy1             = _mm_add_pd(fjy1,ty);
1766             fjz1             = _mm_add_pd(fjz1,tz);
1767
1768             }
1769
1770             /**************************
1771              * CALCULATE INTERACTIONS *
1772              **************************/
1773
1774             if (gmx_mm_any_lt(rsq32,rcutoff2))
1775             {
1776
1777             /* REACTION-FIELD ELECTROSTATICS */
1778             felec            = _mm_mul_pd(qq32,_mm_sub_pd(_mm_mul_pd(rinv32,rinvsq32),krf2));
1779
1780             cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
1781
1782             fscal            = felec;
1783
1784             fscal            = _mm_and_pd(fscal,cutoff_mask);
1785
1786             /* Calculate temporary vectorial force */
1787             tx               = _mm_mul_pd(fscal,dx32);
1788             ty               = _mm_mul_pd(fscal,dy32);
1789             tz               = _mm_mul_pd(fscal,dz32);
1790
1791             /* Update vectorial force */
1792             fix3             = _mm_add_pd(fix3,tx);
1793             fiy3             = _mm_add_pd(fiy3,ty);
1794             fiz3             = _mm_add_pd(fiz3,tz);
1795
1796             fjx2             = _mm_add_pd(fjx2,tx);
1797             fjy2             = _mm_add_pd(fjy2,ty);
1798             fjz2             = _mm_add_pd(fjz2,tz);
1799
1800             }
1801
1802             /**************************
1803              * CALCULATE INTERACTIONS *
1804              **************************/
1805
1806             if (gmx_mm_any_lt(rsq33,rcutoff2))
1807             {
1808
1809             /* REACTION-FIELD ELECTROSTATICS */
1810             felec            = _mm_mul_pd(qq33,_mm_sub_pd(_mm_mul_pd(rinv33,rinvsq33),krf2));
1811
1812             cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
1813
1814             fscal            = felec;
1815
1816             fscal            = _mm_and_pd(fscal,cutoff_mask);
1817
1818             /* Calculate temporary vectorial force */
1819             tx               = _mm_mul_pd(fscal,dx33);
1820             ty               = _mm_mul_pd(fscal,dy33);
1821             tz               = _mm_mul_pd(fscal,dz33);
1822
1823             /* Update vectorial force */
1824             fix3             = _mm_add_pd(fix3,tx);
1825             fiy3             = _mm_add_pd(fiy3,ty);
1826             fiz3             = _mm_add_pd(fiz3,tz);
1827
1828             fjx3             = _mm_add_pd(fjx3,tx);
1829             fjy3             = _mm_add_pd(fjy3,ty);
1830             fjz3             = _mm_add_pd(fjz3,tz);
1831
1832             }
1833
1834             gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1835
1836             /* Inner loop uses 321 flops */
1837         }
1838
1839         if(jidx<j_index_end)
1840         {
1841
1842             jnrA             = jjnr[jidx];
1843             j_coord_offsetA  = DIM*jnrA;
1844
1845             /* load j atom coordinates */
1846             gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1847                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1848                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1849
1850             /* Calculate displacement vector */
1851             dx00             = _mm_sub_pd(ix0,jx0);
1852             dy00             = _mm_sub_pd(iy0,jy0);
1853             dz00             = _mm_sub_pd(iz0,jz0);
1854             dx11             = _mm_sub_pd(ix1,jx1);
1855             dy11             = _mm_sub_pd(iy1,jy1);
1856             dz11             = _mm_sub_pd(iz1,jz1);
1857             dx12             = _mm_sub_pd(ix1,jx2);
1858             dy12             = _mm_sub_pd(iy1,jy2);
1859             dz12             = _mm_sub_pd(iz1,jz2);
1860             dx13             = _mm_sub_pd(ix1,jx3);
1861             dy13             = _mm_sub_pd(iy1,jy3);
1862             dz13             = _mm_sub_pd(iz1,jz3);
1863             dx21             = _mm_sub_pd(ix2,jx1);
1864             dy21             = _mm_sub_pd(iy2,jy1);
1865             dz21             = _mm_sub_pd(iz2,jz1);
1866             dx22             = _mm_sub_pd(ix2,jx2);
1867             dy22             = _mm_sub_pd(iy2,jy2);
1868             dz22             = _mm_sub_pd(iz2,jz2);
1869             dx23             = _mm_sub_pd(ix2,jx3);
1870             dy23             = _mm_sub_pd(iy2,jy3);
1871             dz23             = _mm_sub_pd(iz2,jz3);
1872             dx31             = _mm_sub_pd(ix3,jx1);
1873             dy31             = _mm_sub_pd(iy3,jy1);
1874             dz31             = _mm_sub_pd(iz3,jz1);
1875             dx32             = _mm_sub_pd(ix3,jx2);
1876             dy32             = _mm_sub_pd(iy3,jy2);
1877             dz32             = _mm_sub_pd(iz3,jz2);
1878             dx33             = _mm_sub_pd(ix3,jx3);
1879             dy33             = _mm_sub_pd(iy3,jy3);
1880             dz33             = _mm_sub_pd(iz3,jz3);
1881
1882             /* Calculate squared distance and things based on it */
1883             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1884             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1885             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1886             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1887             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1888             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1889             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1890             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1891             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1892             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1893
1894             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1895             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1896             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1897             rinv13           = gmx_mm_invsqrt_pd(rsq13);
1898             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1899             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1900             rinv23           = gmx_mm_invsqrt_pd(rsq23);
1901             rinv31           = gmx_mm_invsqrt_pd(rsq31);
1902             rinv32           = gmx_mm_invsqrt_pd(rsq32);
1903             rinv33           = gmx_mm_invsqrt_pd(rsq33);
1904
1905             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1906             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1907             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
1908             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1909             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1910             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
1911             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
1912             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
1913             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
1914
1915             fjx0             = _mm_setzero_pd();
1916             fjy0             = _mm_setzero_pd();
1917             fjz0             = _mm_setzero_pd();
1918             fjx1             = _mm_setzero_pd();
1919             fjy1             = _mm_setzero_pd();
1920             fjz1             = _mm_setzero_pd();
1921             fjx2             = _mm_setzero_pd();
1922             fjy2             = _mm_setzero_pd();
1923             fjz2             = _mm_setzero_pd();
1924             fjx3             = _mm_setzero_pd();
1925             fjy3             = _mm_setzero_pd();
1926             fjz3             = _mm_setzero_pd();
1927
1928             /**************************
1929              * CALCULATE INTERACTIONS *
1930              **************************/
1931
1932             r00              = _mm_mul_pd(rsq00,rinv00);
1933
1934             /* Calculate table index by multiplying r with table scale and truncate to integer */
1935             rt               = _mm_mul_pd(r00,vftabscale);
1936             vfitab           = _mm_cvttpd_epi32(rt);
1937             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1938             vfitab           = _mm_slli_epi32(vfitab,3);
1939
1940             /* CUBIC SPLINE TABLE DISPERSION */
1941             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1942             F                = _mm_setzero_pd();
1943             GMX_MM_TRANSPOSE2_PD(Y,F);
1944             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1945             H                = _mm_setzero_pd();
1946             GMX_MM_TRANSPOSE2_PD(G,H);
1947             Heps             = _mm_mul_pd(vfeps,H);
1948             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1949             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1950             fvdw6            = _mm_mul_pd(c6_00,FF);
1951
1952             /* CUBIC SPLINE TABLE REPULSION */
1953             vfitab           = _mm_add_epi32(vfitab,ifour);
1954             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1955             F                = _mm_setzero_pd();
1956             GMX_MM_TRANSPOSE2_PD(Y,F);
1957             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1958             H                = _mm_setzero_pd();
1959             GMX_MM_TRANSPOSE2_PD(G,H);
1960             Heps             = _mm_mul_pd(vfeps,H);
1961             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1962             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1963             fvdw12           = _mm_mul_pd(c12_00,FF);
1964             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1965
1966             fscal            = fvdw;
1967
1968             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1969
1970             /* Calculate temporary vectorial force */
1971             tx               = _mm_mul_pd(fscal,dx00);
1972             ty               = _mm_mul_pd(fscal,dy00);
1973             tz               = _mm_mul_pd(fscal,dz00);
1974
1975             /* Update vectorial force */
1976             fix0             = _mm_add_pd(fix0,tx);
1977             fiy0             = _mm_add_pd(fiy0,ty);
1978             fiz0             = _mm_add_pd(fiz0,tz);
1979
1980             fjx0             = _mm_add_pd(fjx0,tx);
1981             fjy0             = _mm_add_pd(fjy0,ty);
1982             fjz0             = _mm_add_pd(fjz0,tz);
1983
1984             /**************************
1985              * CALCULATE INTERACTIONS *
1986              **************************/
1987
1988             if (gmx_mm_any_lt(rsq11,rcutoff2))
1989             {
1990
1991             /* REACTION-FIELD ELECTROSTATICS */
1992             felec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
1993
1994             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
1995
1996             fscal            = felec;
1997
1998             fscal            = _mm_and_pd(fscal,cutoff_mask);
1999
2000             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2001
2002             /* Calculate temporary vectorial force */
2003             tx               = _mm_mul_pd(fscal,dx11);
2004             ty               = _mm_mul_pd(fscal,dy11);
2005             tz               = _mm_mul_pd(fscal,dz11);
2006
2007             /* Update vectorial force */
2008             fix1             = _mm_add_pd(fix1,tx);
2009             fiy1             = _mm_add_pd(fiy1,ty);
2010             fiz1             = _mm_add_pd(fiz1,tz);
2011
2012             fjx1             = _mm_add_pd(fjx1,tx);
2013             fjy1             = _mm_add_pd(fjy1,ty);
2014             fjz1             = _mm_add_pd(fjz1,tz);
2015
2016             }
2017
2018             /**************************
2019              * CALCULATE INTERACTIONS *
2020              **************************/
2021
2022             if (gmx_mm_any_lt(rsq12,rcutoff2))
2023             {
2024
2025             /* REACTION-FIELD ELECTROSTATICS */
2026             felec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
2027
2028             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
2029
2030             fscal            = felec;
2031
2032             fscal            = _mm_and_pd(fscal,cutoff_mask);
2033
2034             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2035
2036             /* Calculate temporary vectorial force */
2037             tx               = _mm_mul_pd(fscal,dx12);
2038             ty               = _mm_mul_pd(fscal,dy12);
2039             tz               = _mm_mul_pd(fscal,dz12);
2040
2041             /* Update vectorial force */
2042             fix1             = _mm_add_pd(fix1,tx);
2043             fiy1             = _mm_add_pd(fiy1,ty);
2044             fiz1             = _mm_add_pd(fiz1,tz);
2045
2046             fjx2             = _mm_add_pd(fjx2,tx);
2047             fjy2             = _mm_add_pd(fjy2,ty);
2048             fjz2             = _mm_add_pd(fjz2,tz);
2049
2050             }
2051
2052             /**************************
2053              * CALCULATE INTERACTIONS *
2054              **************************/
2055
2056             if (gmx_mm_any_lt(rsq13,rcutoff2))
2057             {
2058
2059             /* REACTION-FIELD ELECTROSTATICS */
2060             felec            = _mm_mul_pd(qq13,_mm_sub_pd(_mm_mul_pd(rinv13,rinvsq13),krf2));
2061
2062             cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
2063
2064             fscal            = felec;
2065
2066             fscal            = _mm_and_pd(fscal,cutoff_mask);
2067
2068             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2069
2070             /* Calculate temporary vectorial force */
2071             tx               = _mm_mul_pd(fscal,dx13);
2072             ty               = _mm_mul_pd(fscal,dy13);
2073             tz               = _mm_mul_pd(fscal,dz13);
2074
2075             /* Update vectorial force */
2076             fix1             = _mm_add_pd(fix1,tx);
2077             fiy1             = _mm_add_pd(fiy1,ty);
2078             fiz1             = _mm_add_pd(fiz1,tz);
2079
2080             fjx3             = _mm_add_pd(fjx3,tx);
2081             fjy3             = _mm_add_pd(fjy3,ty);
2082             fjz3             = _mm_add_pd(fjz3,tz);
2083
2084             }
2085
2086             /**************************
2087              * CALCULATE INTERACTIONS *
2088              **************************/
2089
2090             if (gmx_mm_any_lt(rsq21,rcutoff2))
2091             {
2092
2093             /* REACTION-FIELD ELECTROSTATICS */
2094             felec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
2095
2096             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
2097
2098             fscal            = felec;
2099
2100             fscal            = _mm_and_pd(fscal,cutoff_mask);
2101
2102             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2103
2104             /* Calculate temporary vectorial force */
2105             tx               = _mm_mul_pd(fscal,dx21);
2106             ty               = _mm_mul_pd(fscal,dy21);
2107             tz               = _mm_mul_pd(fscal,dz21);
2108
2109             /* Update vectorial force */
2110             fix2             = _mm_add_pd(fix2,tx);
2111             fiy2             = _mm_add_pd(fiy2,ty);
2112             fiz2             = _mm_add_pd(fiz2,tz);
2113
2114             fjx1             = _mm_add_pd(fjx1,tx);
2115             fjy1             = _mm_add_pd(fjy1,ty);
2116             fjz1             = _mm_add_pd(fjz1,tz);
2117
2118             }
2119
2120             /**************************
2121              * CALCULATE INTERACTIONS *
2122              **************************/
2123
2124             if (gmx_mm_any_lt(rsq22,rcutoff2))
2125             {
2126
2127             /* REACTION-FIELD ELECTROSTATICS */
2128             felec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
2129
2130             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
2131
2132             fscal            = felec;
2133
2134             fscal            = _mm_and_pd(fscal,cutoff_mask);
2135
2136             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2137
2138             /* Calculate temporary vectorial force */
2139             tx               = _mm_mul_pd(fscal,dx22);
2140             ty               = _mm_mul_pd(fscal,dy22);
2141             tz               = _mm_mul_pd(fscal,dz22);
2142
2143             /* Update vectorial force */
2144             fix2             = _mm_add_pd(fix2,tx);
2145             fiy2             = _mm_add_pd(fiy2,ty);
2146             fiz2             = _mm_add_pd(fiz2,tz);
2147
2148             fjx2             = _mm_add_pd(fjx2,tx);
2149             fjy2             = _mm_add_pd(fjy2,ty);
2150             fjz2             = _mm_add_pd(fjz2,tz);
2151
2152             }
2153
2154             /**************************
2155              * CALCULATE INTERACTIONS *
2156              **************************/
2157
2158             if (gmx_mm_any_lt(rsq23,rcutoff2))
2159             {
2160
2161             /* REACTION-FIELD ELECTROSTATICS */
2162             felec            = _mm_mul_pd(qq23,_mm_sub_pd(_mm_mul_pd(rinv23,rinvsq23),krf2));
2163
2164             cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
2165
2166             fscal            = felec;
2167
2168             fscal            = _mm_and_pd(fscal,cutoff_mask);
2169
2170             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2171
2172             /* Calculate temporary vectorial force */
2173             tx               = _mm_mul_pd(fscal,dx23);
2174             ty               = _mm_mul_pd(fscal,dy23);
2175             tz               = _mm_mul_pd(fscal,dz23);
2176
2177             /* Update vectorial force */
2178             fix2             = _mm_add_pd(fix2,tx);
2179             fiy2             = _mm_add_pd(fiy2,ty);
2180             fiz2             = _mm_add_pd(fiz2,tz);
2181
2182             fjx3             = _mm_add_pd(fjx3,tx);
2183             fjy3             = _mm_add_pd(fjy3,ty);
2184             fjz3             = _mm_add_pd(fjz3,tz);
2185
2186             }
2187
2188             /**************************
2189              * CALCULATE INTERACTIONS *
2190              **************************/
2191
2192             if (gmx_mm_any_lt(rsq31,rcutoff2))
2193             {
2194
2195             /* REACTION-FIELD ELECTROSTATICS */
2196             felec            = _mm_mul_pd(qq31,_mm_sub_pd(_mm_mul_pd(rinv31,rinvsq31),krf2));
2197
2198             cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
2199
2200             fscal            = felec;
2201
2202             fscal            = _mm_and_pd(fscal,cutoff_mask);
2203
2204             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2205
2206             /* Calculate temporary vectorial force */
2207             tx               = _mm_mul_pd(fscal,dx31);
2208             ty               = _mm_mul_pd(fscal,dy31);
2209             tz               = _mm_mul_pd(fscal,dz31);
2210
2211             /* Update vectorial force */
2212             fix3             = _mm_add_pd(fix3,tx);
2213             fiy3             = _mm_add_pd(fiy3,ty);
2214             fiz3             = _mm_add_pd(fiz3,tz);
2215
2216             fjx1             = _mm_add_pd(fjx1,tx);
2217             fjy1             = _mm_add_pd(fjy1,ty);
2218             fjz1             = _mm_add_pd(fjz1,tz);
2219
2220             }
2221
2222             /**************************
2223              * CALCULATE INTERACTIONS *
2224              **************************/
2225
2226             if (gmx_mm_any_lt(rsq32,rcutoff2))
2227             {
2228
2229             /* REACTION-FIELD ELECTROSTATICS */
2230             felec            = _mm_mul_pd(qq32,_mm_sub_pd(_mm_mul_pd(rinv32,rinvsq32),krf2));
2231
2232             cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
2233
2234             fscal            = felec;
2235
2236             fscal            = _mm_and_pd(fscal,cutoff_mask);
2237
2238             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2239
2240             /* Calculate temporary vectorial force */
2241             tx               = _mm_mul_pd(fscal,dx32);
2242             ty               = _mm_mul_pd(fscal,dy32);
2243             tz               = _mm_mul_pd(fscal,dz32);
2244
2245             /* Update vectorial force */
2246             fix3             = _mm_add_pd(fix3,tx);
2247             fiy3             = _mm_add_pd(fiy3,ty);
2248             fiz3             = _mm_add_pd(fiz3,tz);
2249
2250             fjx2             = _mm_add_pd(fjx2,tx);
2251             fjy2             = _mm_add_pd(fjy2,ty);
2252             fjz2             = _mm_add_pd(fjz2,tz);
2253
2254             }
2255
2256             /**************************
2257              * CALCULATE INTERACTIONS *
2258              **************************/
2259
2260             if (gmx_mm_any_lt(rsq33,rcutoff2))
2261             {
2262
2263             /* REACTION-FIELD ELECTROSTATICS */
2264             felec            = _mm_mul_pd(qq33,_mm_sub_pd(_mm_mul_pd(rinv33,rinvsq33),krf2));
2265
2266             cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
2267
2268             fscal            = felec;
2269
2270             fscal            = _mm_and_pd(fscal,cutoff_mask);
2271
2272             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2273
2274             /* Calculate temporary vectorial force */
2275             tx               = _mm_mul_pd(fscal,dx33);
2276             ty               = _mm_mul_pd(fscal,dy33);
2277             tz               = _mm_mul_pd(fscal,dz33);
2278
2279             /* Update vectorial force */
2280             fix3             = _mm_add_pd(fix3,tx);
2281             fiy3             = _mm_add_pd(fiy3,ty);
2282             fiz3             = _mm_add_pd(fiz3,tz);
2283
2284             fjx3             = _mm_add_pd(fjx3,tx);
2285             fjy3             = _mm_add_pd(fjy3,ty);
2286             fjz3             = _mm_add_pd(fjz3,tz);
2287
2288             }
2289
2290             gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2291
2292             /* Inner loop uses 321 flops */
2293         }
2294
2295         /* End of innermost loop */
2296
2297         gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2298                                               f+i_coord_offset,fshift+i_shift_offset);
2299
2300         /* Increment number of inner iterations */
2301         inneriter                  += j_index_end - j_index_start;
2302
2303         /* Outer loop uses 24 flops */
2304     }
2305
2306     /* Increment number of outer iterations */
2307     outeriter        += nri;
2308
2309     /* Update outer/inner flops */
2310
2311     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*321);
2312 }