c70a7359ff92247544319310d8c8a86ee71b5865
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_double / nb_kernel_ElecRF_VdwCSTab_GeomW3W3_sse2_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse2_double kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "nrnb.h"
46
47 #include "gromacs/simd/math_x86_sse2_double.h"
48 #include "kernelutil_x86_sse2_double.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sse2_double
52  * Electrostatics interaction: ReactionField
53  * VdW interaction:            CubicSplineTable
54  * Geometry:                   Water3-Water3
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sse2_double
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68      * just 0 for non-waters.
69      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB;
75     int              j_coord_offsetA,j_coord_offsetB;
76     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
77     real             rcutoff_scalar;
78     real             *shiftvec,*fshift,*x,*f;
79     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
80     int              vdwioffset0;
81     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
82     int              vdwioffset1;
83     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
84     int              vdwioffset2;
85     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
86     int              vdwjidx0A,vdwjidx0B;
87     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
88     int              vdwjidx1A,vdwjidx1B;
89     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
90     int              vdwjidx2A,vdwjidx2B;
91     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
92     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
93     __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
94     __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
95     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
96     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
97     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
98     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
99     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
100     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
101     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
102     real             *charge;
103     int              nvdwtype;
104     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
105     int              *vdwtype;
106     real             *vdwparam;
107     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
108     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
109     __m128i          vfitab;
110     __m128i          ifour       = _mm_set1_epi32(4);
111     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
112     real             *vftab;
113     __m128d          dummy_mask,cutoff_mask;
114     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
115     __m128d          one     = _mm_set1_pd(1.0);
116     __m128d          two     = _mm_set1_pd(2.0);
117     x                = xx[0];
118     f                = ff[0];
119
120     nri              = nlist->nri;
121     iinr             = nlist->iinr;
122     jindex           = nlist->jindex;
123     jjnr             = nlist->jjnr;
124     shiftidx         = nlist->shift;
125     gid              = nlist->gid;
126     shiftvec         = fr->shift_vec[0];
127     fshift           = fr->fshift[0];
128     facel            = _mm_set1_pd(fr->epsfac);
129     charge           = mdatoms->chargeA;
130     krf              = _mm_set1_pd(fr->ic->k_rf);
131     krf2             = _mm_set1_pd(fr->ic->k_rf*2.0);
132     crf              = _mm_set1_pd(fr->ic->c_rf);
133     nvdwtype         = fr->ntype;
134     vdwparam         = fr->nbfp;
135     vdwtype          = mdatoms->typeA;
136
137     vftab            = kernel_data->table_vdw->data;
138     vftabscale       = _mm_set1_pd(kernel_data->table_vdw->scale);
139
140     /* Setup water-specific parameters */
141     inr              = nlist->iinr[0];
142     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
143     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
144     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
145     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
146
147     jq0              = _mm_set1_pd(charge[inr+0]);
148     jq1              = _mm_set1_pd(charge[inr+1]);
149     jq2              = _mm_set1_pd(charge[inr+2]);
150     vdwjidx0A        = 2*vdwtype[inr+0];
151     qq00             = _mm_mul_pd(iq0,jq0);
152     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
153     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
154     qq01             = _mm_mul_pd(iq0,jq1);
155     qq02             = _mm_mul_pd(iq0,jq2);
156     qq10             = _mm_mul_pd(iq1,jq0);
157     qq11             = _mm_mul_pd(iq1,jq1);
158     qq12             = _mm_mul_pd(iq1,jq2);
159     qq20             = _mm_mul_pd(iq2,jq0);
160     qq21             = _mm_mul_pd(iq2,jq1);
161     qq22             = _mm_mul_pd(iq2,jq2);
162
163     /* Avoid stupid compiler warnings */
164     jnrA = jnrB = 0;
165     j_coord_offsetA = 0;
166     j_coord_offsetB = 0;
167
168     outeriter        = 0;
169     inneriter        = 0;
170
171     /* Start outer loop over neighborlists */
172     for(iidx=0; iidx<nri; iidx++)
173     {
174         /* Load shift vector for this list */
175         i_shift_offset   = DIM*shiftidx[iidx];
176
177         /* Load limits for loop over neighbors */
178         j_index_start    = jindex[iidx];
179         j_index_end      = jindex[iidx+1];
180
181         /* Get outer coordinate index */
182         inr              = iinr[iidx];
183         i_coord_offset   = DIM*inr;
184
185         /* Load i particle coords and add shift vector */
186         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
187                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
188
189         fix0             = _mm_setzero_pd();
190         fiy0             = _mm_setzero_pd();
191         fiz0             = _mm_setzero_pd();
192         fix1             = _mm_setzero_pd();
193         fiy1             = _mm_setzero_pd();
194         fiz1             = _mm_setzero_pd();
195         fix2             = _mm_setzero_pd();
196         fiy2             = _mm_setzero_pd();
197         fiz2             = _mm_setzero_pd();
198
199         /* Reset potential sums */
200         velecsum         = _mm_setzero_pd();
201         vvdwsum          = _mm_setzero_pd();
202
203         /* Start inner kernel loop */
204         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
205         {
206
207             /* Get j neighbor index, and coordinate index */
208             jnrA             = jjnr[jidx];
209             jnrB             = jjnr[jidx+1];
210             j_coord_offsetA  = DIM*jnrA;
211             j_coord_offsetB  = DIM*jnrB;
212
213             /* load j atom coordinates */
214             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
215                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
216
217             /* Calculate displacement vector */
218             dx00             = _mm_sub_pd(ix0,jx0);
219             dy00             = _mm_sub_pd(iy0,jy0);
220             dz00             = _mm_sub_pd(iz0,jz0);
221             dx01             = _mm_sub_pd(ix0,jx1);
222             dy01             = _mm_sub_pd(iy0,jy1);
223             dz01             = _mm_sub_pd(iz0,jz1);
224             dx02             = _mm_sub_pd(ix0,jx2);
225             dy02             = _mm_sub_pd(iy0,jy2);
226             dz02             = _mm_sub_pd(iz0,jz2);
227             dx10             = _mm_sub_pd(ix1,jx0);
228             dy10             = _mm_sub_pd(iy1,jy0);
229             dz10             = _mm_sub_pd(iz1,jz0);
230             dx11             = _mm_sub_pd(ix1,jx1);
231             dy11             = _mm_sub_pd(iy1,jy1);
232             dz11             = _mm_sub_pd(iz1,jz1);
233             dx12             = _mm_sub_pd(ix1,jx2);
234             dy12             = _mm_sub_pd(iy1,jy2);
235             dz12             = _mm_sub_pd(iz1,jz2);
236             dx20             = _mm_sub_pd(ix2,jx0);
237             dy20             = _mm_sub_pd(iy2,jy0);
238             dz20             = _mm_sub_pd(iz2,jz0);
239             dx21             = _mm_sub_pd(ix2,jx1);
240             dy21             = _mm_sub_pd(iy2,jy1);
241             dz21             = _mm_sub_pd(iz2,jz1);
242             dx22             = _mm_sub_pd(ix2,jx2);
243             dy22             = _mm_sub_pd(iy2,jy2);
244             dz22             = _mm_sub_pd(iz2,jz2);
245
246             /* Calculate squared distance and things based on it */
247             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
248             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
249             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
250             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
251             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
252             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
253             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
254             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
255             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
256
257             rinv00           = gmx_mm_invsqrt_pd(rsq00);
258             rinv01           = gmx_mm_invsqrt_pd(rsq01);
259             rinv02           = gmx_mm_invsqrt_pd(rsq02);
260             rinv10           = gmx_mm_invsqrt_pd(rsq10);
261             rinv11           = gmx_mm_invsqrt_pd(rsq11);
262             rinv12           = gmx_mm_invsqrt_pd(rsq12);
263             rinv20           = gmx_mm_invsqrt_pd(rsq20);
264             rinv21           = gmx_mm_invsqrt_pd(rsq21);
265             rinv22           = gmx_mm_invsqrt_pd(rsq22);
266
267             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
268             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
269             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
270             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
271             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
272             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
273             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
274             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
275             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
276
277             fjx0             = _mm_setzero_pd();
278             fjy0             = _mm_setzero_pd();
279             fjz0             = _mm_setzero_pd();
280             fjx1             = _mm_setzero_pd();
281             fjy1             = _mm_setzero_pd();
282             fjz1             = _mm_setzero_pd();
283             fjx2             = _mm_setzero_pd();
284             fjy2             = _mm_setzero_pd();
285             fjz2             = _mm_setzero_pd();
286
287             /**************************
288              * CALCULATE INTERACTIONS *
289              **************************/
290
291             r00              = _mm_mul_pd(rsq00,rinv00);
292
293             /* Calculate table index by multiplying r with table scale and truncate to integer */
294             rt               = _mm_mul_pd(r00,vftabscale);
295             vfitab           = _mm_cvttpd_epi32(rt);
296             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
297             vfitab           = _mm_slli_epi32(vfitab,3);
298
299             /* REACTION-FIELD ELECTROSTATICS */
300             velec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_add_pd(rinv00,_mm_mul_pd(krf,rsq00)),crf));
301             felec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_mul_pd(rinv00,rinvsq00),krf2));
302
303             /* CUBIC SPLINE TABLE DISPERSION */
304             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
305             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
306             GMX_MM_TRANSPOSE2_PD(Y,F);
307             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
308             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
309             GMX_MM_TRANSPOSE2_PD(G,H);
310             Heps             = _mm_mul_pd(vfeps,H);
311             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
312             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
313             vvdw6            = _mm_mul_pd(c6_00,VV);
314             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
315             fvdw6            = _mm_mul_pd(c6_00,FF);
316
317             /* CUBIC SPLINE TABLE REPULSION */
318             vfitab           = _mm_add_epi32(vfitab,ifour);
319             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
320             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
321             GMX_MM_TRANSPOSE2_PD(Y,F);
322             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
323             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
324             GMX_MM_TRANSPOSE2_PD(G,H);
325             Heps             = _mm_mul_pd(vfeps,H);
326             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
327             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
328             vvdw12           = _mm_mul_pd(c12_00,VV);
329             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
330             fvdw12           = _mm_mul_pd(c12_00,FF);
331             vvdw             = _mm_add_pd(vvdw12,vvdw6);
332             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
333
334             /* Update potential sum for this i atom from the interaction with this j atom. */
335             velecsum         = _mm_add_pd(velecsum,velec);
336             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
337
338             fscal            = _mm_add_pd(felec,fvdw);
339
340             /* Calculate temporary vectorial force */
341             tx               = _mm_mul_pd(fscal,dx00);
342             ty               = _mm_mul_pd(fscal,dy00);
343             tz               = _mm_mul_pd(fscal,dz00);
344
345             /* Update vectorial force */
346             fix0             = _mm_add_pd(fix0,tx);
347             fiy0             = _mm_add_pd(fiy0,ty);
348             fiz0             = _mm_add_pd(fiz0,tz);
349
350             fjx0             = _mm_add_pd(fjx0,tx);
351             fjy0             = _mm_add_pd(fjy0,ty);
352             fjz0             = _mm_add_pd(fjz0,tz);
353
354             /**************************
355              * CALCULATE INTERACTIONS *
356              **************************/
357
358             /* REACTION-FIELD ELECTROSTATICS */
359             velec            = _mm_mul_pd(qq01,_mm_sub_pd(_mm_add_pd(rinv01,_mm_mul_pd(krf,rsq01)),crf));
360             felec            = _mm_mul_pd(qq01,_mm_sub_pd(_mm_mul_pd(rinv01,rinvsq01),krf2));
361
362             /* Update potential sum for this i atom from the interaction with this j atom. */
363             velecsum         = _mm_add_pd(velecsum,velec);
364
365             fscal            = felec;
366
367             /* Calculate temporary vectorial force */
368             tx               = _mm_mul_pd(fscal,dx01);
369             ty               = _mm_mul_pd(fscal,dy01);
370             tz               = _mm_mul_pd(fscal,dz01);
371
372             /* Update vectorial force */
373             fix0             = _mm_add_pd(fix0,tx);
374             fiy0             = _mm_add_pd(fiy0,ty);
375             fiz0             = _mm_add_pd(fiz0,tz);
376
377             fjx1             = _mm_add_pd(fjx1,tx);
378             fjy1             = _mm_add_pd(fjy1,ty);
379             fjz1             = _mm_add_pd(fjz1,tz);
380
381             /**************************
382              * CALCULATE INTERACTIONS *
383              **************************/
384
385             /* REACTION-FIELD ELECTROSTATICS */
386             velec            = _mm_mul_pd(qq02,_mm_sub_pd(_mm_add_pd(rinv02,_mm_mul_pd(krf,rsq02)),crf));
387             felec            = _mm_mul_pd(qq02,_mm_sub_pd(_mm_mul_pd(rinv02,rinvsq02),krf2));
388
389             /* Update potential sum for this i atom from the interaction with this j atom. */
390             velecsum         = _mm_add_pd(velecsum,velec);
391
392             fscal            = felec;
393
394             /* Calculate temporary vectorial force */
395             tx               = _mm_mul_pd(fscal,dx02);
396             ty               = _mm_mul_pd(fscal,dy02);
397             tz               = _mm_mul_pd(fscal,dz02);
398
399             /* Update vectorial force */
400             fix0             = _mm_add_pd(fix0,tx);
401             fiy0             = _mm_add_pd(fiy0,ty);
402             fiz0             = _mm_add_pd(fiz0,tz);
403
404             fjx2             = _mm_add_pd(fjx2,tx);
405             fjy2             = _mm_add_pd(fjy2,ty);
406             fjz2             = _mm_add_pd(fjz2,tz);
407
408             /**************************
409              * CALCULATE INTERACTIONS *
410              **************************/
411
412             /* REACTION-FIELD ELECTROSTATICS */
413             velec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_add_pd(rinv10,_mm_mul_pd(krf,rsq10)),crf));
414             felec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_mul_pd(rinv10,rinvsq10),krf2));
415
416             /* Update potential sum for this i atom from the interaction with this j atom. */
417             velecsum         = _mm_add_pd(velecsum,velec);
418
419             fscal            = felec;
420
421             /* Calculate temporary vectorial force */
422             tx               = _mm_mul_pd(fscal,dx10);
423             ty               = _mm_mul_pd(fscal,dy10);
424             tz               = _mm_mul_pd(fscal,dz10);
425
426             /* Update vectorial force */
427             fix1             = _mm_add_pd(fix1,tx);
428             fiy1             = _mm_add_pd(fiy1,ty);
429             fiz1             = _mm_add_pd(fiz1,tz);
430
431             fjx0             = _mm_add_pd(fjx0,tx);
432             fjy0             = _mm_add_pd(fjy0,ty);
433             fjz0             = _mm_add_pd(fjz0,tz);
434
435             /**************************
436              * CALCULATE INTERACTIONS *
437              **************************/
438
439             /* REACTION-FIELD ELECTROSTATICS */
440             velec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_add_pd(rinv11,_mm_mul_pd(krf,rsq11)),crf));
441             felec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
442
443             /* Update potential sum for this i atom from the interaction with this j atom. */
444             velecsum         = _mm_add_pd(velecsum,velec);
445
446             fscal            = felec;
447
448             /* Calculate temporary vectorial force */
449             tx               = _mm_mul_pd(fscal,dx11);
450             ty               = _mm_mul_pd(fscal,dy11);
451             tz               = _mm_mul_pd(fscal,dz11);
452
453             /* Update vectorial force */
454             fix1             = _mm_add_pd(fix1,tx);
455             fiy1             = _mm_add_pd(fiy1,ty);
456             fiz1             = _mm_add_pd(fiz1,tz);
457
458             fjx1             = _mm_add_pd(fjx1,tx);
459             fjy1             = _mm_add_pd(fjy1,ty);
460             fjz1             = _mm_add_pd(fjz1,tz);
461
462             /**************************
463              * CALCULATE INTERACTIONS *
464              **************************/
465
466             /* REACTION-FIELD ELECTROSTATICS */
467             velec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_add_pd(rinv12,_mm_mul_pd(krf,rsq12)),crf));
468             felec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
469
470             /* Update potential sum for this i atom from the interaction with this j atom. */
471             velecsum         = _mm_add_pd(velecsum,velec);
472
473             fscal            = felec;
474
475             /* Calculate temporary vectorial force */
476             tx               = _mm_mul_pd(fscal,dx12);
477             ty               = _mm_mul_pd(fscal,dy12);
478             tz               = _mm_mul_pd(fscal,dz12);
479
480             /* Update vectorial force */
481             fix1             = _mm_add_pd(fix1,tx);
482             fiy1             = _mm_add_pd(fiy1,ty);
483             fiz1             = _mm_add_pd(fiz1,tz);
484
485             fjx2             = _mm_add_pd(fjx2,tx);
486             fjy2             = _mm_add_pd(fjy2,ty);
487             fjz2             = _mm_add_pd(fjz2,tz);
488
489             /**************************
490              * CALCULATE INTERACTIONS *
491              **************************/
492
493             /* REACTION-FIELD ELECTROSTATICS */
494             velec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_add_pd(rinv20,_mm_mul_pd(krf,rsq20)),crf));
495             felec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_mul_pd(rinv20,rinvsq20),krf2));
496
497             /* Update potential sum for this i atom from the interaction with this j atom. */
498             velecsum         = _mm_add_pd(velecsum,velec);
499
500             fscal            = felec;
501
502             /* Calculate temporary vectorial force */
503             tx               = _mm_mul_pd(fscal,dx20);
504             ty               = _mm_mul_pd(fscal,dy20);
505             tz               = _mm_mul_pd(fscal,dz20);
506
507             /* Update vectorial force */
508             fix2             = _mm_add_pd(fix2,tx);
509             fiy2             = _mm_add_pd(fiy2,ty);
510             fiz2             = _mm_add_pd(fiz2,tz);
511
512             fjx0             = _mm_add_pd(fjx0,tx);
513             fjy0             = _mm_add_pd(fjy0,ty);
514             fjz0             = _mm_add_pd(fjz0,tz);
515
516             /**************************
517              * CALCULATE INTERACTIONS *
518              **************************/
519
520             /* REACTION-FIELD ELECTROSTATICS */
521             velec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_add_pd(rinv21,_mm_mul_pd(krf,rsq21)),crf));
522             felec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
523
524             /* Update potential sum for this i atom from the interaction with this j atom. */
525             velecsum         = _mm_add_pd(velecsum,velec);
526
527             fscal            = felec;
528
529             /* Calculate temporary vectorial force */
530             tx               = _mm_mul_pd(fscal,dx21);
531             ty               = _mm_mul_pd(fscal,dy21);
532             tz               = _mm_mul_pd(fscal,dz21);
533
534             /* Update vectorial force */
535             fix2             = _mm_add_pd(fix2,tx);
536             fiy2             = _mm_add_pd(fiy2,ty);
537             fiz2             = _mm_add_pd(fiz2,tz);
538
539             fjx1             = _mm_add_pd(fjx1,tx);
540             fjy1             = _mm_add_pd(fjy1,ty);
541             fjz1             = _mm_add_pd(fjz1,tz);
542
543             /**************************
544              * CALCULATE INTERACTIONS *
545              **************************/
546
547             /* REACTION-FIELD ELECTROSTATICS */
548             velec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_add_pd(rinv22,_mm_mul_pd(krf,rsq22)),crf));
549             felec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
550
551             /* Update potential sum for this i atom from the interaction with this j atom. */
552             velecsum         = _mm_add_pd(velecsum,velec);
553
554             fscal            = felec;
555
556             /* Calculate temporary vectorial force */
557             tx               = _mm_mul_pd(fscal,dx22);
558             ty               = _mm_mul_pd(fscal,dy22);
559             tz               = _mm_mul_pd(fscal,dz22);
560
561             /* Update vectorial force */
562             fix2             = _mm_add_pd(fix2,tx);
563             fiy2             = _mm_add_pd(fiy2,ty);
564             fiz2             = _mm_add_pd(fiz2,tz);
565
566             fjx2             = _mm_add_pd(fjx2,tx);
567             fjy2             = _mm_add_pd(fjy2,ty);
568             fjz2             = _mm_add_pd(fjz2,tz);
569
570             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
571
572             /* Inner loop uses 323 flops */
573         }
574
575         if(jidx<j_index_end)
576         {
577
578             jnrA             = jjnr[jidx];
579             j_coord_offsetA  = DIM*jnrA;
580
581             /* load j atom coordinates */
582             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
583                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
584
585             /* Calculate displacement vector */
586             dx00             = _mm_sub_pd(ix0,jx0);
587             dy00             = _mm_sub_pd(iy0,jy0);
588             dz00             = _mm_sub_pd(iz0,jz0);
589             dx01             = _mm_sub_pd(ix0,jx1);
590             dy01             = _mm_sub_pd(iy0,jy1);
591             dz01             = _mm_sub_pd(iz0,jz1);
592             dx02             = _mm_sub_pd(ix0,jx2);
593             dy02             = _mm_sub_pd(iy0,jy2);
594             dz02             = _mm_sub_pd(iz0,jz2);
595             dx10             = _mm_sub_pd(ix1,jx0);
596             dy10             = _mm_sub_pd(iy1,jy0);
597             dz10             = _mm_sub_pd(iz1,jz0);
598             dx11             = _mm_sub_pd(ix1,jx1);
599             dy11             = _mm_sub_pd(iy1,jy1);
600             dz11             = _mm_sub_pd(iz1,jz1);
601             dx12             = _mm_sub_pd(ix1,jx2);
602             dy12             = _mm_sub_pd(iy1,jy2);
603             dz12             = _mm_sub_pd(iz1,jz2);
604             dx20             = _mm_sub_pd(ix2,jx0);
605             dy20             = _mm_sub_pd(iy2,jy0);
606             dz20             = _mm_sub_pd(iz2,jz0);
607             dx21             = _mm_sub_pd(ix2,jx1);
608             dy21             = _mm_sub_pd(iy2,jy1);
609             dz21             = _mm_sub_pd(iz2,jz1);
610             dx22             = _mm_sub_pd(ix2,jx2);
611             dy22             = _mm_sub_pd(iy2,jy2);
612             dz22             = _mm_sub_pd(iz2,jz2);
613
614             /* Calculate squared distance and things based on it */
615             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
616             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
617             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
618             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
619             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
620             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
621             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
622             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
623             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
624
625             rinv00           = gmx_mm_invsqrt_pd(rsq00);
626             rinv01           = gmx_mm_invsqrt_pd(rsq01);
627             rinv02           = gmx_mm_invsqrt_pd(rsq02);
628             rinv10           = gmx_mm_invsqrt_pd(rsq10);
629             rinv11           = gmx_mm_invsqrt_pd(rsq11);
630             rinv12           = gmx_mm_invsqrt_pd(rsq12);
631             rinv20           = gmx_mm_invsqrt_pd(rsq20);
632             rinv21           = gmx_mm_invsqrt_pd(rsq21);
633             rinv22           = gmx_mm_invsqrt_pd(rsq22);
634
635             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
636             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
637             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
638             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
639             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
640             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
641             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
642             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
643             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
644
645             fjx0             = _mm_setzero_pd();
646             fjy0             = _mm_setzero_pd();
647             fjz0             = _mm_setzero_pd();
648             fjx1             = _mm_setzero_pd();
649             fjy1             = _mm_setzero_pd();
650             fjz1             = _mm_setzero_pd();
651             fjx2             = _mm_setzero_pd();
652             fjy2             = _mm_setzero_pd();
653             fjz2             = _mm_setzero_pd();
654
655             /**************************
656              * CALCULATE INTERACTIONS *
657              **************************/
658
659             r00              = _mm_mul_pd(rsq00,rinv00);
660
661             /* Calculate table index by multiplying r with table scale and truncate to integer */
662             rt               = _mm_mul_pd(r00,vftabscale);
663             vfitab           = _mm_cvttpd_epi32(rt);
664             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
665             vfitab           = _mm_slli_epi32(vfitab,3);
666
667             /* REACTION-FIELD ELECTROSTATICS */
668             velec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_add_pd(rinv00,_mm_mul_pd(krf,rsq00)),crf));
669             felec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_mul_pd(rinv00,rinvsq00),krf2));
670
671             /* CUBIC SPLINE TABLE DISPERSION */
672             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
673             F                = _mm_setzero_pd();
674             GMX_MM_TRANSPOSE2_PD(Y,F);
675             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
676             H                = _mm_setzero_pd();
677             GMX_MM_TRANSPOSE2_PD(G,H);
678             Heps             = _mm_mul_pd(vfeps,H);
679             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
680             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
681             vvdw6            = _mm_mul_pd(c6_00,VV);
682             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
683             fvdw6            = _mm_mul_pd(c6_00,FF);
684
685             /* CUBIC SPLINE TABLE REPULSION */
686             vfitab           = _mm_add_epi32(vfitab,ifour);
687             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
688             F                = _mm_setzero_pd();
689             GMX_MM_TRANSPOSE2_PD(Y,F);
690             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
691             H                = _mm_setzero_pd();
692             GMX_MM_TRANSPOSE2_PD(G,H);
693             Heps             = _mm_mul_pd(vfeps,H);
694             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
695             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
696             vvdw12           = _mm_mul_pd(c12_00,VV);
697             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
698             fvdw12           = _mm_mul_pd(c12_00,FF);
699             vvdw             = _mm_add_pd(vvdw12,vvdw6);
700             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
701
702             /* Update potential sum for this i atom from the interaction with this j atom. */
703             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
704             velecsum         = _mm_add_pd(velecsum,velec);
705             vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
706             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
707
708             fscal            = _mm_add_pd(felec,fvdw);
709
710             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
711
712             /* Calculate temporary vectorial force */
713             tx               = _mm_mul_pd(fscal,dx00);
714             ty               = _mm_mul_pd(fscal,dy00);
715             tz               = _mm_mul_pd(fscal,dz00);
716
717             /* Update vectorial force */
718             fix0             = _mm_add_pd(fix0,tx);
719             fiy0             = _mm_add_pd(fiy0,ty);
720             fiz0             = _mm_add_pd(fiz0,tz);
721
722             fjx0             = _mm_add_pd(fjx0,tx);
723             fjy0             = _mm_add_pd(fjy0,ty);
724             fjz0             = _mm_add_pd(fjz0,tz);
725
726             /**************************
727              * CALCULATE INTERACTIONS *
728              **************************/
729
730             /* REACTION-FIELD ELECTROSTATICS */
731             velec            = _mm_mul_pd(qq01,_mm_sub_pd(_mm_add_pd(rinv01,_mm_mul_pd(krf,rsq01)),crf));
732             felec            = _mm_mul_pd(qq01,_mm_sub_pd(_mm_mul_pd(rinv01,rinvsq01),krf2));
733
734             /* Update potential sum for this i atom from the interaction with this j atom. */
735             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
736             velecsum         = _mm_add_pd(velecsum,velec);
737
738             fscal            = felec;
739
740             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
741
742             /* Calculate temporary vectorial force */
743             tx               = _mm_mul_pd(fscal,dx01);
744             ty               = _mm_mul_pd(fscal,dy01);
745             tz               = _mm_mul_pd(fscal,dz01);
746
747             /* Update vectorial force */
748             fix0             = _mm_add_pd(fix0,tx);
749             fiy0             = _mm_add_pd(fiy0,ty);
750             fiz0             = _mm_add_pd(fiz0,tz);
751
752             fjx1             = _mm_add_pd(fjx1,tx);
753             fjy1             = _mm_add_pd(fjy1,ty);
754             fjz1             = _mm_add_pd(fjz1,tz);
755
756             /**************************
757              * CALCULATE INTERACTIONS *
758              **************************/
759
760             /* REACTION-FIELD ELECTROSTATICS */
761             velec            = _mm_mul_pd(qq02,_mm_sub_pd(_mm_add_pd(rinv02,_mm_mul_pd(krf,rsq02)),crf));
762             felec            = _mm_mul_pd(qq02,_mm_sub_pd(_mm_mul_pd(rinv02,rinvsq02),krf2));
763
764             /* Update potential sum for this i atom from the interaction with this j atom. */
765             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
766             velecsum         = _mm_add_pd(velecsum,velec);
767
768             fscal            = felec;
769
770             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
771
772             /* Calculate temporary vectorial force */
773             tx               = _mm_mul_pd(fscal,dx02);
774             ty               = _mm_mul_pd(fscal,dy02);
775             tz               = _mm_mul_pd(fscal,dz02);
776
777             /* Update vectorial force */
778             fix0             = _mm_add_pd(fix0,tx);
779             fiy0             = _mm_add_pd(fiy0,ty);
780             fiz0             = _mm_add_pd(fiz0,tz);
781
782             fjx2             = _mm_add_pd(fjx2,tx);
783             fjy2             = _mm_add_pd(fjy2,ty);
784             fjz2             = _mm_add_pd(fjz2,tz);
785
786             /**************************
787              * CALCULATE INTERACTIONS *
788              **************************/
789
790             /* REACTION-FIELD ELECTROSTATICS */
791             velec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_add_pd(rinv10,_mm_mul_pd(krf,rsq10)),crf));
792             felec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_mul_pd(rinv10,rinvsq10),krf2));
793
794             /* Update potential sum for this i atom from the interaction with this j atom. */
795             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
796             velecsum         = _mm_add_pd(velecsum,velec);
797
798             fscal            = felec;
799
800             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
801
802             /* Calculate temporary vectorial force */
803             tx               = _mm_mul_pd(fscal,dx10);
804             ty               = _mm_mul_pd(fscal,dy10);
805             tz               = _mm_mul_pd(fscal,dz10);
806
807             /* Update vectorial force */
808             fix1             = _mm_add_pd(fix1,tx);
809             fiy1             = _mm_add_pd(fiy1,ty);
810             fiz1             = _mm_add_pd(fiz1,tz);
811
812             fjx0             = _mm_add_pd(fjx0,tx);
813             fjy0             = _mm_add_pd(fjy0,ty);
814             fjz0             = _mm_add_pd(fjz0,tz);
815
816             /**************************
817              * CALCULATE INTERACTIONS *
818              **************************/
819
820             /* REACTION-FIELD ELECTROSTATICS */
821             velec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_add_pd(rinv11,_mm_mul_pd(krf,rsq11)),crf));
822             felec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
823
824             /* Update potential sum for this i atom from the interaction with this j atom. */
825             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
826             velecsum         = _mm_add_pd(velecsum,velec);
827
828             fscal            = felec;
829
830             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
831
832             /* Calculate temporary vectorial force */
833             tx               = _mm_mul_pd(fscal,dx11);
834             ty               = _mm_mul_pd(fscal,dy11);
835             tz               = _mm_mul_pd(fscal,dz11);
836
837             /* Update vectorial force */
838             fix1             = _mm_add_pd(fix1,tx);
839             fiy1             = _mm_add_pd(fiy1,ty);
840             fiz1             = _mm_add_pd(fiz1,tz);
841
842             fjx1             = _mm_add_pd(fjx1,tx);
843             fjy1             = _mm_add_pd(fjy1,ty);
844             fjz1             = _mm_add_pd(fjz1,tz);
845
846             /**************************
847              * CALCULATE INTERACTIONS *
848              **************************/
849
850             /* REACTION-FIELD ELECTROSTATICS */
851             velec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_add_pd(rinv12,_mm_mul_pd(krf,rsq12)),crf));
852             felec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
853
854             /* Update potential sum for this i atom from the interaction with this j atom. */
855             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
856             velecsum         = _mm_add_pd(velecsum,velec);
857
858             fscal            = felec;
859
860             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
861
862             /* Calculate temporary vectorial force */
863             tx               = _mm_mul_pd(fscal,dx12);
864             ty               = _mm_mul_pd(fscal,dy12);
865             tz               = _mm_mul_pd(fscal,dz12);
866
867             /* Update vectorial force */
868             fix1             = _mm_add_pd(fix1,tx);
869             fiy1             = _mm_add_pd(fiy1,ty);
870             fiz1             = _mm_add_pd(fiz1,tz);
871
872             fjx2             = _mm_add_pd(fjx2,tx);
873             fjy2             = _mm_add_pd(fjy2,ty);
874             fjz2             = _mm_add_pd(fjz2,tz);
875
876             /**************************
877              * CALCULATE INTERACTIONS *
878              **************************/
879
880             /* REACTION-FIELD ELECTROSTATICS */
881             velec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_add_pd(rinv20,_mm_mul_pd(krf,rsq20)),crf));
882             felec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_mul_pd(rinv20,rinvsq20),krf2));
883
884             /* Update potential sum for this i atom from the interaction with this j atom. */
885             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
886             velecsum         = _mm_add_pd(velecsum,velec);
887
888             fscal            = felec;
889
890             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
891
892             /* Calculate temporary vectorial force */
893             tx               = _mm_mul_pd(fscal,dx20);
894             ty               = _mm_mul_pd(fscal,dy20);
895             tz               = _mm_mul_pd(fscal,dz20);
896
897             /* Update vectorial force */
898             fix2             = _mm_add_pd(fix2,tx);
899             fiy2             = _mm_add_pd(fiy2,ty);
900             fiz2             = _mm_add_pd(fiz2,tz);
901
902             fjx0             = _mm_add_pd(fjx0,tx);
903             fjy0             = _mm_add_pd(fjy0,ty);
904             fjz0             = _mm_add_pd(fjz0,tz);
905
906             /**************************
907              * CALCULATE INTERACTIONS *
908              **************************/
909
910             /* REACTION-FIELD ELECTROSTATICS */
911             velec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_add_pd(rinv21,_mm_mul_pd(krf,rsq21)),crf));
912             felec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
913
914             /* Update potential sum for this i atom from the interaction with this j atom. */
915             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
916             velecsum         = _mm_add_pd(velecsum,velec);
917
918             fscal            = felec;
919
920             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
921
922             /* Calculate temporary vectorial force */
923             tx               = _mm_mul_pd(fscal,dx21);
924             ty               = _mm_mul_pd(fscal,dy21);
925             tz               = _mm_mul_pd(fscal,dz21);
926
927             /* Update vectorial force */
928             fix2             = _mm_add_pd(fix2,tx);
929             fiy2             = _mm_add_pd(fiy2,ty);
930             fiz2             = _mm_add_pd(fiz2,tz);
931
932             fjx1             = _mm_add_pd(fjx1,tx);
933             fjy1             = _mm_add_pd(fjy1,ty);
934             fjz1             = _mm_add_pd(fjz1,tz);
935
936             /**************************
937              * CALCULATE INTERACTIONS *
938              **************************/
939
940             /* REACTION-FIELD ELECTROSTATICS */
941             velec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_add_pd(rinv22,_mm_mul_pd(krf,rsq22)),crf));
942             felec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
943
944             /* Update potential sum for this i atom from the interaction with this j atom. */
945             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
946             velecsum         = _mm_add_pd(velecsum,velec);
947
948             fscal            = felec;
949
950             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
951
952             /* Calculate temporary vectorial force */
953             tx               = _mm_mul_pd(fscal,dx22);
954             ty               = _mm_mul_pd(fscal,dy22);
955             tz               = _mm_mul_pd(fscal,dz22);
956
957             /* Update vectorial force */
958             fix2             = _mm_add_pd(fix2,tx);
959             fiy2             = _mm_add_pd(fiy2,ty);
960             fiz2             = _mm_add_pd(fiz2,tz);
961
962             fjx2             = _mm_add_pd(fjx2,tx);
963             fjy2             = _mm_add_pd(fjy2,ty);
964             fjz2             = _mm_add_pd(fjz2,tz);
965
966             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
967
968             /* Inner loop uses 323 flops */
969         }
970
971         /* End of innermost loop */
972
973         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
974                                               f+i_coord_offset,fshift+i_shift_offset);
975
976         ggid                        = gid[iidx];
977         /* Update potential energies */
978         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
979         gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
980
981         /* Increment number of inner iterations */
982         inneriter                  += j_index_end - j_index_start;
983
984         /* Outer loop uses 20 flops */
985     }
986
987     /* Increment number of outer iterations */
988     outeriter        += nri;
989
990     /* Update outer/inner flops */
991
992     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*323);
993 }
994 /*
995  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sse2_double
996  * Electrostatics interaction: ReactionField
997  * VdW interaction:            CubicSplineTable
998  * Geometry:                   Water3-Water3
999  * Calculate force/pot:        Force
1000  */
1001 void
1002 nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sse2_double
1003                     (t_nblist                    * gmx_restrict       nlist,
1004                      rvec                        * gmx_restrict          xx,
1005                      rvec                        * gmx_restrict          ff,
1006                      t_forcerec                  * gmx_restrict          fr,
1007                      t_mdatoms                   * gmx_restrict     mdatoms,
1008                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1009                      t_nrnb                      * gmx_restrict        nrnb)
1010 {
1011     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1012      * just 0 for non-waters.
1013      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1014      * jnr indices corresponding to data put in the four positions in the SIMD register.
1015      */
1016     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1017     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1018     int              jnrA,jnrB;
1019     int              j_coord_offsetA,j_coord_offsetB;
1020     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1021     real             rcutoff_scalar;
1022     real             *shiftvec,*fshift,*x,*f;
1023     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1024     int              vdwioffset0;
1025     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1026     int              vdwioffset1;
1027     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1028     int              vdwioffset2;
1029     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1030     int              vdwjidx0A,vdwjidx0B;
1031     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1032     int              vdwjidx1A,vdwjidx1B;
1033     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1034     int              vdwjidx2A,vdwjidx2B;
1035     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1036     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1037     __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1038     __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1039     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1040     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1041     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1042     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1043     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1044     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1045     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
1046     real             *charge;
1047     int              nvdwtype;
1048     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1049     int              *vdwtype;
1050     real             *vdwparam;
1051     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
1052     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
1053     __m128i          vfitab;
1054     __m128i          ifour       = _mm_set1_epi32(4);
1055     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1056     real             *vftab;
1057     __m128d          dummy_mask,cutoff_mask;
1058     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1059     __m128d          one     = _mm_set1_pd(1.0);
1060     __m128d          two     = _mm_set1_pd(2.0);
1061     x                = xx[0];
1062     f                = ff[0];
1063
1064     nri              = nlist->nri;
1065     iinr             = nlist->iinr;
1066     jindex           = nlist->jindex;
1067     jjnr             = nlist->jjnr;
1068     shiftidx         = nlist->shift;
1069     gid              = nlist->gid;
1070     shiftvec         = fr->shift_vec[0];
1071     fshift           = fr->fshift[0];
1072     facel            = _mm_set1_pd(fr->epsfac);
1073     charge           = mdatoms->chargeA;
1074     krf              = _mm_set1_pd(fr->ic->k_rf);
1075     krf2             = _mm_set1_pd(fr->ic->k_rf*2.0);
1076     crf              = _mm_set1_pd(fr->ic->c_rf);
1077     nvdwtype         = fr->ntype;
1078     vdwparam         = fr->nbfp;
1079     vdwtype          = mdatoms->typeA;
1080
1081     vftab            = kernel_data->table_vdw->data;
1082     vftabscale       = _mm_set1_pd(kernel_data->table_vdw->scale);
1083
1084     /* Setup water-specific parameters */
1085     inr              = nlist->iinr[0];
1086     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1087     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1088     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1089     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1090
1091     jq0              = _mm_set1_pd(charge[inr+0]);
1092     jq1              = _mm_set1_pd(charge[inr+1]);
1093     jq2              = _mm_set1_pd(charge[inr+2]);
1094     vdwjidx0A        = 2*vdwtype[inr+0];
1095     qq00             = _mm_mul_pd(iq0,jq0);
1096     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1097     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1098     qq01             = _mm_mul_pd(iq0,jq1);
1099     qq02             = _mm_mul_pd(iq0,jq2);
1100     qq10             = _mm_mul_pd(iq1,jq0);
1101     qq11             = _mm_mul_pd(iq1,jq1);
1102     qq12             = _mm_mul_pd(iq1,jq2);
1103     qq20             = _mm_mul_pd(iq2,jq0);
1104     qq21             = _mm_mul_pd(iq2,jq1);
1105     qq22             = _mm_mul_pd(iq2,jq2);
1106
1107     /* Avoid stupid compiler warnings */
1108     jnrA = jnrB = 0;
1109     j_coord_offsetA = 0;
1110     j_coord_offsetB = 0;
1111
1112     outeriter        = 0;
1113     inneriter        = 0;
1114
1115     /* Start outer loop over neighborlists */
1116     for(iidx=0; iidx<nri; iidx++)
1117     {
1118         /* Load shift vector for this list */
1119         i_shift_offset   = DIM*shiftidx[iidx];
1120
1121         /* Load limits for loop over neighbors */
1122         j_index_start    = jindex[iidx];
1123         j_index_end      = jindex[iidx+1];
1124
1125         /* Get outer coordinate index */
1126         inr              = iinr[iidx];
1127         i_coord_offset   = DIM*inr;
1128
1129         /* Load i particle coords and add shift vector */
1130         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1131                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1132
1133         fix0             = _mm_setzero_pd();
1134         fiy0             = _mm_setzero_pd();
1135         fiz0             = _mm_setzero_pd();
1136         fix1             = _mm_setzero_pd();
1137         fiy1             = _mm_setzero_pd();
1138         fiz1             = _mm_setzero_pd();
1139         fix2             = _mm_setzero_pd();
1140         fiy2             = _mm_setzero_pd();
1141         fiz2             = _mm_setzero_pd();
1142
1143         /* Start inner kernel loop */
1144         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1145         {
1146
1147             /* Get j neighbor index, and coordinate index */
1148             jnrA             = jjnr[jidx];
1149             jnrB             = jjnr[jidx+1];
1150             j_coord_offsetA  = DIM*jnrA;
1151             j_coord_offsetB  = DIM*jnrB;
1152
1153             /* load j atom coordinates */
1154             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1155                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1156
1157             /* Calculate displacement vector */
1158             dx00             = _mm_sub_pd(ix0,jx0);
1159             dy00             = _mm_sub_pd(iy0,jy0);
1160             dz00             = _mm_sub_pd(iz0,jz0);
1161             dx01             = _mm_sub_pd(ix0,jx1);
1162             dy01             = _mm_sub_pd(iy0,jy1);
1163             dz01             = _mm_sub_pd(iz0,jz1);
1164             dx02             = _mm_sub_pd(ix0,jx2);
1165             dy02             = _mm_sub_pd(iy0,jy2);
1166             dz02             = _mm_sub_pd(iz0,jz2);
1167             dx10             = _mm_sub_pd(ix1,jx0);
1168             dy10             = _mm_sub_pd(iy1,jy0);
1169             dz10             = _mm_sub_pd(iz1,jz0);
1170             dx11             = _mm_sub_pd(ix1,jx1);
1171             dy11             = _mm_sub_pd(iy1,jy1);
1172             dz11             = _mm_sub_pd(iz1,jz1);
1173             dx12             = _mm_sub_pd(ix1,jx2);
1174             dy12             = _mm_sub_pd(iy1,jy2);
1175             dz12             = _mm_sub_pd(iz1,jz2);
1176             dx20             = _mm_sub_pd(ix2,jx0);
1177             dy20             = _mm_sub_pd(iy2,jy0);
1178             dz20             = _mm_sub_pd(iz2,jz0);
1179             dx21             = _mm_sub_pd(ix2,jx1);
1180             dy21             = _mm_sub_pd(iy2,jy1);
1181             dz21             = _mm_sub_pd(iz2,jz1);
1182             dx22             = _mm_sub_pd(ix2,jx2);
1183             dy22             = _mm_sub_pd(iy2,jy2);
1184             dz22             = _mm_sub_pd(iz2,jz2);
1185
1186             /* Calculate squared distance and things based on it */
1187             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1188             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1189             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1190             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1191             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1192             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1193             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1194             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1195             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1196
1197             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1198             rinv01           = gmx_mm_invsqrt_pd(rsq01);
1199             rinv02           = gmx_mm_invsqrt_pd(rsq02);
1200             rinv10           = gmx_mm_invsqrt_pd(rsq10);
1201             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1202             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1203             rinv20           = gmx_mm_invsqrt_pd(rsq20);
1204             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1205             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1206
1207             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
1208             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
1209             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
1210             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
1211             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1212             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1213             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
1214             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1215             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1216
1217             fjx0             = _mm_setzero_pd();
1218             fjy0             = _mm_setzero_pd();
1219             fjz0             = _mm_setzero_pd();
1220             fjx1             = _mm_setzero_pd();
1221             fjy1             = _mm_setzero_pd();
1222             fjz1             = _mm_setzero_pd();
1223             fjx2             = _mm_setzero_pd();
1224             fjy2             = _mm_setzero_pd();
1225             fjz2             = _mm_setzero_pd();
1226
1227             /**************************
1228              * CALCULATE INTERACTIONS *
1229              **************************/
1230
1231             r00              = _mm_mul_pd(rsq00,rinv00);
1232
1233             /* Calculate table index by multiplying r with table scale and truncate to integer */
1234             rt               = _mm_mul_pd(r00,vftabscale);
1235             vfitab           = _mm_cvttpd_epi32(rt);
1236             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1237             vfitab           = _mm_slli_epi32(vfitab,3);
1238
1239             /* REACTION-FIELD ELECTROSTATICS */
1240             felec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_mul_pd(rinv00,rinvsq00),krf2));
1241
1242             /* CUBIC SPLINE TABLE DISPERSION */
1243             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1244             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1245             GMX_MM_TRANSPOSE2_PD(Y,F);
1246             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1247             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1248             GMX_MM_TRANSPOSE2_PD(G,H);
1249             Heps             = _mm_mul_pd(vfeps,H);
1250             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1251             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1252             fvdw6            = _mm_mul_pd(c6_00,FF);
1253
1254             /* CUBIC SPLINE TABLE REPULSION */
1255             vfitab           = _mm_add_epi32(vfitab,ifour);
1256             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1257             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1258             GMX_MM_TRANSPOSE2_PD(Y,F);
1259             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1260             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1261             GMX_MM_TRANSPOSE2_PD(G,H);
1262             Heps             = _mm_mul_pd(vfeps,H);
1263             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1264             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1265             fvdw12           = _mm_mul_pd(c12_00,FF);
1266             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1267
1268             fscal            = _mm_add_pd(felec,fvdw);
1269
1270             /* Calculate temporary vectorial force */
1271             tx               = _mm_mul_pd(fscal,dx00);
1272             ty               = _mm_mul_pd(fscal,dy00);
1273             tz               = _mm_mul_pd(fscal,dz00);
1274
1275             /* Update vectorial force */
1276             fix0             = _mm_add_pd(fix0,tx);
1277             fiy0             = _mm_add_pd(fiy0,ty);
1278             fiz0             = _mm_add_pd(fiz0,tz);
1279
1280             fjx0             = _mm_add_pd(fjx0,tx);
1281             fjy0             = _mm_add_pd(fjy0,ty);
1282             fjz0             = _mm_add_pd(fjz0,tz);
1283
1284             /**************************
1285              * CALCULATE INTERACTIONS *
1286              **************************/
1287
1288             /* REACTION-FIELD ELECTROSTATICS */
1289             felec            = _mm_mul_pd(qq01,_mm_sub_pd(_mm_mul_pd(rinv01,rinvsq01),krf2));
1290
1291             fscal            = felec;
1292
1293             /* Calculate temporary vectorial force */
1294             tx               = _mm_mul_pd(fscal,dx01);
1295             ty               = _mm_mul_pd(fscal,dy01);
1296             tz               = _mm_mul_pd(fscal,dz01);
1297
1298             /* Update vectorial force */
1299             fix0             = _mm_add_pd(fix0,tx);
1300             fiy0             = _mm_add_pd(fiy0,ty);
1301             fiz0             = _mm_add_pd(fiz0,tz);
1302
1303             fjx1             = _mm_add_pd(fjx1,tx);
1304             fjy1             = _mm_add_pd(fjy1,ty);
1305             fjz1             = _mm_add_pd(fjz1,tz);
1306
1307             /**************************
1308              * CALCULATE INTERACTIONS *
1309              **************************/
1310
1311             /* REACTION-FIELD ELECTROSTATICS */
1312             felec            = _mm_mul_pd(qq02,_mm_sub_pd(_mm_mul_pd(rinv02,rinvsq02),krf2));
1313
1314             fscal            = felec;
1315
1316             /* Calculate temporary vectorial force */
1317             tx               = _mm_mul_pd(fscal,dx02);
1318             ty               = _mm_mul_pd(fscal,dy02);
1319             tz               = _mm_mul_pd(fscal,dz02);
1320
1321             /* Update vectorial force */
1322             fix0             = _mm_add_pd(fix0,tx);
1323             fiy0             = _mm_add_pd(fiy0,ty);
1324             fiz0             = _mm_add_pd(fiz0,tz);
1325
1326             fjx2             = _mm_add_pd(fjx2,tx);
1327             fjy2             = _mm_add_pd(fjy2,ty);
1328             fjz2             = _mm_add_pd(fjz2,tz);
1329
1330             /**************************
1331              * CALCULATE INTERACTIONS *
1332              **************************/
1333
1334             /* REACTION-FIELD ELECTROSTATICS */
1335             felec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_mul_pd(rinv10,rinvsq10),krf2));
1336
1337             fscal            = felec;
1338
1339             /* Calculate temporary vectorial force */
1340             tx               = _mm_mul_pd(fscal,dx10);
1341             ty               = _mm_mul_pd(fscal,dy10);
1342             tz               = _mm_mul_pd(fscal,dz10);
1343
1344             /* Update vectorial force */
1345             fix1             = _mm_add_pd(fix1,tx);
1346             fiy1             = _mm_add_pd(fiy1,ty);
1347             fiz1             = _mm_add_pd(fiz1,tz);
1348
1349             fjx0             = _mm_add_pd(fjx0,tx);
1350             fjy0             = _mm_add_pd(fjy0,ty);
1351             fjz0             = _mm_add_pd(fjz0,tz);
1352
1353             /**************************
1354              * CALCULATE INTERACTIONS *
1355              **************************/
1356
1357             /* REACTION-FIELD ELECTROSTATICS */
1358             felec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
1359
1360             fscal            = felec;
1361
1362             /* Calculate temporary vectorial force */
1363             tx               = _mm_mul_pd(fscal,dx11);
1364             ty               = _mm_mul_pd(fscal,dy11);
1365             tz               = _mm_mul_pd(fscal,dz11);
1366
1367             /* Update vectorial force */
1368             fix1             = _mm_add_pd(fix1,tx);
1369             fiy1             = _mm_add_pd(fiy1,ty);
1370             fiz1             = _mm_add_pd(fiz1,tz);
1371
1372             fjx1             = _mm_add_pd(fjx1,tx);
1373             fjy1             = _mm_add_pd(fjy1,ty);
1374             fjz1             = _mm_add_pd(fjz1,tz);
1375
1376             /**************************
1377              * CALCULATE INTERACTIONS *
1378              **************************/
1379
1380             /* REACTION-FIELD ELECTROSTATICS */
1381             felec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
1382
1383             fscal            = felec;
1384
1385             /* Calculate temporary vectorial force */
1386             tx               = _mm_mul_pd(fscal,dx12);
1387             ty               = _mm_mul_pd(fscal,dy12);
1388             tz               = _mm_mul_pd(fscal,dz12);
1389
1390             /* Update vectorial force */
1391             fix1             = _mm_add_pd(fix1,tx);
1392             fiy1             = _mm_add_pd(fiy1,ty);
1393             fiz1             = _mm_add_pd(fiz1,tz);
1394
1395             fjx2             = _mm_add_pd(fjx2,tx);
1396             fjy2             = _mm_add_pd(fjy2,ty);
1397             fjz2             = _mm_add_pd(fjz2,tz);
1398
1399             /**************************
1400              * CALCULATE INTERACTIONS *
1401              **************************/
1402
1403             /* REACTION-FIELD ELECTROSTATICS */
1404             felec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_mul_pd(rinv20,rinvsq20),krf2));
1405
1406             fscal            = felec;
1407
1408             /* Calculate temporary vectorial force */
1409             tx               = _mm_mul_pd(fscal,dx20);
1410             ty               = _mm_mul_pd(fscal,dy20);
1411             tz               = _mm_mul_pd(fscal,dz20);
1412
1413             /* Update vectorial force */
1414             fix2             = _mm_add_pd(fix2,tx);
1415             fiy2             = _mm_add_pd(fiy2,ty);
1416             fiz2             = _mm_add_pd(fiz2,tz);
1417
1418             fjx0             = _mm_add_pd(fjx0,tx);
1419             fjy0             = _mm_add_pd(fjy0,ty);
1420             fjz0             = _mm_add_pd(fjz0,tz);
1421
1422             /**************************
1423              * CALCULATE INTERACTIONS *
1424              **************************/
1425
1426             /* REACTION-FIELD ELECTROSTATICS */
1427             felec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
1428
1429             fscal            = felec;
1430
1431             /* Calculate temporary vectorial force */
1432             tx               = _mm_mul_pd(fscal,dx21);
1433             ty               = _mm_mul_pd(fscal,dy21);
1434             tz               = _mm_mul_pd(fscal,dz21);
1435
1436             /* Update vectorial force */
1437             fix2             = _mm_add_pd(fix2,tx);
1438             fiy2             = _mm_add_pd(fiy2,ty);
1439             fiz2             = _mm_add_pd(fiz2,tz);
1440
1441             fjx1             = _mm_add_pd(fjx1,tx);
1442             fjy1             = _mm_add_pd(fjy1,ty);
1443             fjz1             = _mm_add_pd(fjz1,tz);
1444
1445             /**************************
1446              * CALCULATE INTERACTIONS *
1447              **************************/
1448
1449             /* REACTION-FIELD ELECTROSTATICS */
1450             felec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
1451
1452             fscal            = felec;
1453
1454             /* Calculate temporary vectorial force */
1455             tx               = _mm_mul_pd(fscal,dx22);
1456             ty               = _mm_mul_pd(fscal,dy22);
1457             tz               = _mm_mul_pd(fscal,dz22);
1458
1459             /* Update vectorial force */
1460             fix2             = _mm_add_pd(fix2,tx);
1461             fiy2             = _mm_add_pd(fiy2,ty);
1462             fiz2             = _mm_add_pd(fiz2,tz);
1463
1464             fjx2             = _mm_add_pd(fjx2,tx);
1465             fjy2             = _mm_add_pd(fjy2,ty);
1466             fjz2             = _mm_add_pd(fjz2,tz);
1467
1468             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1469
1470             /* Inner loop uses 270 flops */
1471         }
1472
1473         if(jidx<j_index_end)
1474         {
1475
1476             jnrA             = jjnr[jidx];
1477             j_coord_offsetA  = DIM*jnrA;
1478
1479             /* load j atom coordinates */
1480             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1481                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1482
1483             /* Calculate displacement vector */
1484             dx00             = _mm_sub_pd(ix0,jx0);
1485             dy00             = _mm_sub_pd(iy0,jy0);
1486             dz00             = _mm_sub_pd(iz0,jz0);
1487             dx01             = _mm_sub_pd(ix0,jx1);
1488             dy01             = _mm_sub_pd(iy0,jy1);
1489             dz01             = _mm_sub_pd(iz0,jz1);
1490             dx02             = _mm_sub_pd(ix0,jx2);
1491             dy02             = _mm_sub_pd(iy0,jy2);
1492             dz02             = _mm_sub_pd(iz0,jz2);
1493             dx10             = _mm_sub_pd(ix1,jx0);
1494             dy10             = _mm_sub_pd(iy1,jy0);
1495             dz10             = _mm_sub_pd(iz1,jz0);
1496             dx11             = _mm_sub_pd(ix1,jx1);
1497             dy11             = _mm_sub_pd(iy1,jy1);
1498             dz11             = _mm_sub_pd(iz1,jz1);
1499             dx12             = _mm_sub_pd(ix1,jx2);
1500             dy12             = _mm_sub_pd(iy1,jy2);
1501             dz12             = _mm_sub_pd(iz1,jz2);
1502             dx20             = _mm_sub_pd(ix2,jx0);
1503             dy20             = _mm_sub_pd(iy2,jy0);
1504             dz20             = _mm_sub_pd(iz2,jz0);
1505             dx21             = _mm_sub_pd(ix2,jx1);
1506             dy21             = _mm_sub_pd(iy2,jy1);
1507             dz21             = _mm_sub_pd(iz2,jz1);
1508             dx22             = _mm_sub_pd(ix2,jx2);
1509             dy22             = _mm_sub_pd(iy2,jy2);
1510             dz22             = _mm_sub_pd(iz2,jz2);
1511
1512             /* Calculate squared distance and things based on it */
1513             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1514             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1515             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1516             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1517             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1518             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1519             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1520             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1521             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1522
1523             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1524             rinv01           = gmx_mm_invsqrt_pd(rsq01);
1525             rinv02           = gmx_mm_invsqrt_pd(rsq02);
1526             rinv10           = gmx_mm_invsqrt_pd(rsq10);
1527             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1528             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1529             rinv20           = gmx_mm_invsqrt_pd(rsq20);
1530             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1531             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1532
1533             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
1534             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
1535             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
1536             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
1537             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1538             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1539             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
1540             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1541             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1542
1543             fjx0             = _mm_setzero_pd();
1544             fjy0             = _mm_setzero_pd();
1545             fjz0             = _mm_setzero_pd();
1546             fjx1             = _mm_setzero_pd();
1547             fjy1             = _mm_setzero_pd();
1548             fjz1             = _mm_setzero_pd();
1549             fjx2             = _mm_setzero_pd();
1550             fjy2             = _mm_setzero_pd();
1551             fjz2             = _mm_setzero_pd();
1552
1553             /**************************
1554              * CALCULATE INTERACTIONS *
1555              **************************/
1556
1557             r00              = _mm_mul_pd(rsq00,rinv00);
1558
1559             /* Calculate table index by multiplying r with table scale and truncate to integer */
1560             rt               = _mm_mul_pd(r00,vftabscale);
1561             vfitab           = _mm_cvttpd_epi32(rt);
1562             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1563             vfitab           = _mm_slli_epi32(vfitab,3);
1564
1565             /* REACTION-FIELD ELECTROSTATICS */
1566             felec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_mul_pd(rinv00,rinvsq00),krf2));
1567
1568             /* CUBIC SPLINE TABLE DISPERSION */
1569             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1570             F                = _mm_setzero_pd();
1571             GMX_MM_TRANSPOSE2_PD(Y,F);
1572             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1573             H                = _mm_setzero_pd();
1574             GMX_MM_TRANSPOSE2_PD(G,H);
1575             Heps             = _mm_mul_pd(vfeps,H);
1576             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1577             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1578             fvdw6            = _mm_mul_pd(c6_00,FF);
1579
1580             /* CUBIC SPLINE TABLE REPULSION */
1581             vfitab           = _mm_add_epi32(vfitab,ifour);
1582             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1583             F                = _mm_setzero_pd();
1584             GMX_MM_TRANSPOSE2_PD(Y,F);
1585             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1586             H                = _mm_setzero_pd();
1587             GMX_MM_TRANSPOSE2_PD(G,H);
1588             Heps             = _mm_mul_pd(vfeps,H);
1589             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1590             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1591             fvdw12           = _mm_mul_pd(c12_00,FF);
1592             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1593
1594             fscal            = _mm_add_pd(felec,fvdw);
1595
1596             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1597
1598             /* Calculate temporary vectorial force */
1599             tx               = _mm_mul_pd(fscal,dx00);
1600             ty               = _mm_mul_pd(fscal,dy00);
1601             tz               = _mm_mul_pd(fscal,dz00);
1602
1603             /* Update vectorial force */
1604             fix0             = _mm_add_pd(fix0,tx);
1605             fiy0             = _mm_add_pd(fiy0,ty);
1606             fiz0             = _mm_add_pd(fiz0,tz);
1607
1608             fjx0             = _mm_add_pd(fjx0,tx);
1609             fjy0             = _mm_add_pd(fjy0,ty);
1610             fjz0             = _mm_add_pd(fjz0,tz);
1611
1612             /**************************
1613              * CALCULATE INTERACTIONS *
1614              **************************/
1615
1616             /* REACTION-FIELD ELECTROSTATICS */
1617             felec            = _mm_mul_pd(qq01,_mm_sub_pd(_mm_mul_pd(rinv01,rinvsq01),krf2));
1618
1619             fscal            = felec;
1620
1621             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1622
1623             /* Calculate temporary vectorial force */
1624             tx               = _mm_mul_pd(fscal,dx01);
1625             ty               = _mm_mul_pd(fscal,dy01);
1626             tz               = _mm_mul_pd(fscal,dz01);
1627
1628             /* Update vectorial force */
1629             fix0             = _mm_add_pd(fix0,tx);
1630             fiy0             = _mm_add_pd(fiy0,ty);
1631             fiz0             = _mm_add_pd(fiz0,tz);
1632
1633             fjx1             = _mm_add_pd(fjx1,tx);
1634             fjy1             = _mm_add_pd(fjy1,ty);
1635             fjz1             = _mm_add_pd(fjz1,tz);
1636
1637             /**************************
1638              * CALCULATE INTERACTIONS *
1639              **************************/
1640
1641             /* REACTION-FIELD ELECTROSTATICS */
1642             felec            = _mm_mul_pd(qq02,_mm_sub_pd(_mm_mul_pd(rinv02,rinvsq02),krf2));
1643
1644             fscal            = felec;
1645
1646             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1647
1648             /* Calculate temporary vectorial force */
1649             tx               = _mm_mul_pd(fscal,dx02);
1650             ty               = _mm_mul_pd(fscal,dy02);
1651             tz               = _mm_mul_pd(fscal,dz02);
1652
1653             /* Update vectorial force */
1654             fix0             = _mm_add_pd(fix0,tx);
1655             fiy0             = _mm_add_pd(fiy0,ty);
1656             fiz0             = _mm_add_pd(fiz0,tz);
1657
1658             fjx2             = _mm_add_pd(fjx2,tx);
1659             fjy2             = _mm_add_pd(fjy2,ty);
1660             fjz2             = _mm_add_pd(fjz2,tz);
1661
1662             /**************************
1663              * CALCULATE INTERACTIONS *
1664              **************************/
1665
1666             /* REACTION-FIELD ELECTROSTATICS */
1667             felec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_mul_pd(rinv10,rinvsq10),krf2));
1668
1669             fscal            = felec;
1670
1671             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1672
1673             /* Calculate temporary vectorial force */
1674             tx               = _mm_mul_pd(fscal,dx10);
1675             ty               = _mm_mul_pd(fscal,dy10);
1676             tz               = _mm_mul_pd(fscal,dz10);
1677
1678             /* Update vectorial force */
1679             fix1             = _mm_add_pd(fix1,tx);
1680             fiy1             = _mm_add_pd(fiy1,ty);
1681             fiz1             = _mm_add_pd(fiz1,tz);
1682
1683             fjx0             = _mm_add_pd(fjx0,tx);
1684             fjy0             = _mm_add_pd(fjy0,ty);
1685             fjz0             = _mm_add_pd(fjz0,tz);
1686
1687             /**************************
1688              * CALCULATE INTERACTIONS *
1689              **************************/
1690
1691             /* REACTION-FIELD ELECTROSTATICS */
1692             felec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
1693
1694             fscal            = felec;
1695
1696             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1697
1698             /* Calculate temporary vectorial force */
1699             tx               = _mm_mul_pd(fscal,dx11);
1700             ty               = _mm_mul_pd(fscal,dy11);
1701             tz               = _mm_mul_pd(fscal,dz11);
1702
1703             /* Update vectorial force */
1704             fix1             = _mm_add_pd(fix1,tx);
1705             fiy1             = _mm_add_pd(fiy1,ty);
1706             fiz1             = _mm_add_pd(fiz1,tz);
1707
1708             fjx1             = _mm_add_pd(fjx1,tx);
1709             fjy1             = _mm_add_pd(fjy1,ty);
1710             fjz1             = _mm_add_pd(fjz1,tz);
1711
1712             /**************************
1713              * CALCULATE INTERACTIONS *
1714              **************************/
1715
1716             /* REACTION-FIELD ELECTROSTATICS */
1717             felec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
1718
1719             fscal            = felec;
1720
1721             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1722
1723             /* Calculate temporary vectorial force */
1724             tx               = _mm_mul_pd(fscal,dx12);
1725             ty               = _mm_mul_pd(fscal,dy12);
1726             tz               = _mm_mul_pd(fscal,dz12);
1727
1728             /* Update vectorial force */
1729             fix1             = _mm_add_pd(fix1,tx);
1730             fiy1             = _mm_add_pd(fiy1,ty);
1731             fiz1             = _mm_add_pd(fiz1,tz);
1732
1733             fjx2             = _mm_add_pd(fjx2,tx);
1734             fjy2             = _mm_add_pd(fjy2,ty);
1735             fjz2             = _mm_add_pd(fjz2,tz);
1736
1737             /**************************
1738              * CALCULATE INTERACTIONS *
1739              **************************/
1740
1741             /* REACTION-FIELD ELECTROSTATICS */
1742             felec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_mul_pd(rinv20,rinvsq20),krf2));
1743
1744             fscal            = felec;
1745
1746             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1747
1748             /* Calculate temporary vectorial force */
1749             tx               = _mm_mul_pd(fscal,dx20);
1750             ty               = _mm_mul_pd(fscal,dy20);
1751             tz               = _mm_mul_pd(fscal,dz20);
1752
1753             /* Update vectorial force */
1754             fix2             = _mm_add_pd(fix2,tx);
1755             fiy2             = _mm_add_pd(fiy2,ty);
1756             fiz2             = _mm_add_pd(fiz2,tz);
1757
1758             fjx0             = _mm_add_pd(fjx0,tx);
1759             fjy0             = _mm_add_pd(fjy0,ty);
1760             fjz0             = _mm_add_pd(fjz0,tz);
1761
1762             /**************************
1763              * CALCULATE INTERACTIONS *
1764              **************************/
1765
1766             /* REACTION-FIELD ELECTROSTATICS */
1767             felec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
1768
1769             fscal            = felec;
1770
1771             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1772
1773             /* Calculate temporary vectorial force */
1774             tx               = _mm_mul_pd(fscal,dx21);
1775             ty               = _mm_mul_pd(fscal,dy21);
1776             tz               = _mm_mul_pd(fscal,dz21);
1777
1778             /* Update vectorial force */
1779             fix2             = _mm_add_pd(fix2,tx);
1780             fiy2             = _mm_add_pd(fiy2,ty);
1781             fiz2             = _mm_add_pd(fiz2,tz);
1782
1783             fjx1             = _mm_add_pd(fjx1,tx);
1784             fjy1             = _mm_add_pd(fjy1,ty);
1785             fjz1             = _mm_add_pd(fjz1,tz);
1786
1787             /**************************
1788              * CALCULATE INTERACTIONS *
1789              **************************/
1790
1791             /* REACTION-FIELD ELECTROSTATICS */
1792             felec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
1793
1794             fscal            = felec;
1795
1796             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1797
1798             /* Calculate temporary vectorial force */
1799             tx               = _mm_mul_pd(fscal,dx22);
1800             ty               = _mm_mul_pd(fscal,dy22);
1801             tz               = _mm_mul_pd(fscal,dz22);
1802
1803             /* Update vectorial force */
1804             fix2             = _mm_add_pd(fix2,tx);
1805             fiy2             = _mm_add_pd(fiy2,ty);
1806             fiz2             = _mm_add_pd(fiz2,tz);
1807
1808             fjx2             = _mm_add_pd(fjx2,tx);
1809             fjy2             = _mm_add_pd(fjy2,ty);
1810             fjz2             = _mm_add_pd(fjz2,tz);
1811
1812             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1813
1814             /* Inner loop uses 270 flops */
1815         }
1816
1817         /* End of innermost loop */
1818
1819         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1820                                               f+i_coord_offset,fshift+i_shift_offset);
1821
1822         /* Increment number of inner iterations */
1823         inneriter                  += j_index_end - j_index_start;
1824
1825         /* Outer loop uses 18 flops */
1826     }
1827
1828     /* Increment number of outer iterations */
1829     outeriter        += nri;
1830
1831     /* Update outer/inner flops */
1832
1833     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*270);
1834 }