31c21afd786abede9c28b4f49e5aeee88f50654c
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse4_1_double / nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_sse4_1_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
46
47 #include "gromacs/simd/math_x86_sse4_1_double.h"
48 #include "kernelutil_x86_sse4_1_double.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sse4_1_double
52  * Electrostatics interaction: Coulomb
53  * VdW interaction:            CubicSplineTable
54  * Geometry:                   Water3-Water3
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sse4_1_double
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68      * just 0 for non-waters.
69      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB;
75     int              j_coord_offsetA,j_coord_offsetB;
76     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
77     real             rcutoff_scalar;
78     real             *shiftvec,*fshift,*x,*f;
79     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
80     int              vdwioffset0;
81     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
82     int              vdwioffset1;
83     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
84     int              vdwioffset2;
85     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
86     int              vdwjidx0A,vdwjidx0B;
87     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
88     int              vdwjidx1A,vdwjidx1B;
89     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
90     int              vdwjidx2A,vdwjidx2B;
91     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
92     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
93     __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
94     __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
95     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
96     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
97     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
98     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
99     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
100     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
101     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
102     real             *charge;
103     int              nvdwtype;
104     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
105     int              *vdwtype;
106     real             *vdwparam;
107     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
108     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
109     __m128i          vfitab;
110     __m128i          ifour       = _mm_set1_epi32(4);
111     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
112     real             *vftab;
113     __m128d          dummy_mask,cutoff_mask;
114     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
115     __m128d          one     = _mm_set1_pd(1.0);
116     __m128d          two     = _mm_set1_pd(2.0);
117     x                = xx[0];
118     f                = ff[0];
119
120     nri              = nlist->nri;
121     iinr             = nlist->iinr;
122     jindex           = nlist->jindex;
123     jjnr             = nlist->jjnr;
124     shiftidx         = nlist->shift;
125     gid              = nlist->gid;
126     shiftvec         = fr->shift_vec[0];
127     fshift           = fr->fshift[0];
128     facel            = _mm_set1_pd(fr->epsfac);
129     charge           = mdatoms->chargeA;
130     nvdwtype         = fr->ntype;
131     vdwparam         = fr->nbfp;
132     vdwtype          = mdatoms->typeA;
133
134     vftab            = kernel_data->table_vdw->data;
135     vftabscale       = _mm_set1_pd(kernel_data->table_vdw->scale);
136
137     /* Setup water-specific parameters */
138     inr              = nlist->iinr[0];
139     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
140     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
141     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
142     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
143
144     jq0              = _mm_set1_pd(charge[inr+0]);
145     jq1              = _mm_set1_pd(charge[inr+1]);
146     jq2              = _mm_set1_pd(charge[inr+2]);
147     vdwjidx0A        = 2*vdwtype[inr+0];
148     qq00             = _mm_mul_pd(iq0,jq0);
149     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
150     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
151     qq01             = _mm_mul_pd(iq0,jq1);
152     qq02             = _mm_mul_pd(iq0,jq2);
153     qq10             = _mm_mul_pd(iq1,jq0);
154     qq11             = _mm_mul_pd(iq1,jq1);
155     qq12             = _mm_mul_pd(iq1,jq2);
156     qq20             = _mm_mul_pd(iq2,jq0);
157     qq21             = _mm_mul_pd(iq2,jq1);
158     qq22             = _mm_mul_pd(iq2,jq2);
159
160     /* Avoid stupid compiler warnings */
161     jnrA = jnrB = 0;
162     j_coord_offsetA = 0;
163     j_coord_offsetB = 0;
164
165     outeriter        = 0;
166     inneriter        = 0;
167
168     /* Start outer loop over neighborlists */
169     for(iidx=0; iidx<nri; iidx++)
170     {
171         /* Load shift vector for this list */
172         i_shift_offset   = DIM*shiftidx[iidx];
173
174         /* Load limits for loop over neighbors */
175         j_index_start    = jindex[iidx];
176         j_index_end      = jindex[iidx+1];
177
178         /* Get outer coordinate index */
179         inr              = iinr[iidx];
180         i_coord_offset   = DIM*inr;
181
182         /* Load i particle coords and add shift vector */
183         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
184                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
185
186         fix0             = _mm_setzero_pd();
187         fiy0             = _mm_setzero_pd();
188         fiz0             = _mm_setzero_pd();
189         fix1             = _mm_setzero_pd();
190         fiy1             = _mm_setzero_pd();
191         fiz1             = _mm_setzero_pd();
192         fix2             = _mm_setzero_pd();
193         fiy2             = _mm_setzero_pd();
194         fiz2             = _mm_setzero_pd();
195
196         /* Reset potential sums */
197         velecsum         = _mm_setzero_pd();
198         vvdwsum          = _mm_setzero_pd();
199
200         /* Start inner kernel loop */
201         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
202         {
203
204             /* Get j neighbor index, and coordinate index */
205             jnrA             = jjnr[jidx];
206             jnrB             = jjnr[jidx+1];
207             j_coord_offsetA  = DIM*jnrA;
208             j_coord_offsetB  = DIM*jnrB;
209
210             /* load j atom coordinates */
211             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
212                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
213
214             /* Calculate displacement vector */
215             dx00             = _mm_sub_pd(ix0,jx0);
216             dy00             = _mm_sub_pd(iy0,jy0);
217             dz00             = _mm_sub_pd(iz0,jz0);
218             dx01             = _mm_sub_pd(ix0,jx1);
219             dy01             = _mm_sub_pd(iy0,jy1);
220             dz01             = _mm_sub_pd(iz0,jz1);
221             dx02             = _mm_sub_pd(ix0,jx2);
222             dy02             = _mm_sub_pd(iy0,jy2);
223             dz02             = _mm_sub_pd(iz0,jz2);
224             dx10             = _mm_sub_pd(ix1,jx0);
225             dy10             = _mm_sub_pd(iy1,jy0);
226             dz10             = _mm_sub_pd(iz1,jz0);
227             dx11             = _mm_sub_pd(ix1,jx1);
228             dy11             = _mm_sub_pd(iy1,jy1);
229             dz11             = _mm_sub_pd(iz1,jz1);
230             dx12             = _mm_sub_pd(ix1,jx2);
231             dy12             = _mm_sub_pd(iy1,jy2);
232             dz12             = _mm_sub_pd(iz1,jz2);
233             dx20             = _mm_sub_pd(ix2,jx0);
234             dy20             = _mm_sub_pd(iy2,jy0);
235             dz20             = _mm_sub_pd(iz2,jz0);
236             dx21             = _mm_sub_pd(ix2,jx1);
237             dy21             = _mm_sub_pd(iy2,jy1);
238             dz21             = _mm_sub_pd(iz2,jz1);
239             dx22             = _mm_sub_pd(ix2,jx2);
240             dy22             = _mm_sub_pd(iy2,jy2);
241             dz22             = _mm_sub_pd(iz2,jz2);
242
243             /* Calculate squared distance and things based on it */
244             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
245             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
246             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
247             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
248             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
249             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
250             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
251             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
252             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
253
254             rinv00           = gmx_mm_invsqrt_pd(rsq00);
255             rinv01           = gmx_mm_invsqrt_pd(rsq01);
256             rinv02           = gmx_mm_invsqrt_pd(rsq02);
257             rinv10           = gmx_mm_invsqrt_pd(rsq10);
258             rinv11           = gmx_mm_invsqrt_pd(rsq11);
259             rinv12           = gmx_mm_invsqrt_pd(rsq12);
260             rinv20           = gmx_mm_invsqrt_pd(rsq20);
261             rinv21           = gmx_mm_invsqrt_pd(rsq21);
262             rinv22           = gmx_mm_invsqrt_pd(rsq22);
263
264             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
265             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
266             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
267             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
268             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
269             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
270             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
271             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
272             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
273
274             fjx0             = _mm_setzero_pd();
275             fjy0             = _mm_setzero_pd();
276             fjz0             = _mm_setzero_pd();
277             fjx1             = _mm_setzero_pd();
278             fjy1             = _mm_setzero_pd();
279             fjz1             = _mm_setzero_pd();
280             fjx2             = _mm_setzero_pd();
281             fjy2             = _mm_setzero_pd();
282             fjz2             = _mm_setzero_pd();
283
284             /**************************
285              * CALCULATE INTERACTIONS *
286              **************************/
287
288             r00              = _mm_mul_pd(rsq00,rinv00);
289
290             /* Calculate table index by multiplying r with table scale and truncate to integer */
291             rt               = _mm_mul_pd(r00,vftabscale);
292             vfitab           = _mm_cvttpd_epi32(rt);
293             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
294             vfitab           = _mm_slli_epi32(vfitab,3);
295
296             /* COULOMB ELECTROSTATICS */
297             velec            = _mm_mul_pd(qq00,rinv00);
298             felec            = _mm_mul_pd(velec,rinvsq00);
299
300             /* CUBIC SPLINE TABLE DISPERSION */
301             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
302             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
303             GMX_MM_TRANSPOSE2_PD(Y,F);
304             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
305             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
306             GMX_MM_TRANSPOSE2_PD(G,H);
307             Heps             = _mm_mul_pd(vfeps,H);
308             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
309             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
310             vvdw6            = _mm_mul_pd(c6_00,VV);
311             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
312             fvdw6            = _mm_mul_pd(c6_00,FF);
313
314             /* CUBIC SPLINE TABLE REPULSION */
315             vfitab           = _mm_add_epi32(vfitab,ifour);
316             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
317             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
318             GMX_MM_TRANSPOSE2_PD(Y,F);
319             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
320             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
321             GMX_MM_TRANSPOSE2_PD(G,H);
322             Heps             = _mm_mul_pd(vfeps,H);
323             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
324             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
325             vvdw12           = _mm_mul_pd(c12_00,VV);
326             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
327             fvdw12           = _mm_mul_pd(c12_00,FF);
328             vvdw             = _mm_add_pd(vvdw12,vvdw6);
329             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
330
331             /* Update potential sum for this i atom from the interaction with this j atom. */
332             velecsum         = _mm_add_pd(velecsum,velec);
333             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
334
335             fscal            = _mm_add_pd(felec,fvdw);
336
337             /* Calculate temporary vectorial force */
338             tx               = _mm_mul_pd(fscal,dx00);
339             ty               = _mm_mul_pd(fscal,dy00);
340             tz               = _mm_mul_pd(fscal,dz00);
341
342             /* Update vectorial force */
343             fix0             = _mm_add_pd(fix0,tx);
344             fiy0             = _mm_add_pd(fiy0,ty);
345             fiz0             = _mm_add_pd(fiz0,tz);
346
347             fjx0             = _mm_add_pd(fjx0,tx);
348             fjy0             = _mm_add_pd(fjy0,ty);
349             fjz0             = _mm_add_pd(fjz0,tz);
350
351             /**************************
352              * CALCULATE INTERACTIONS *
353              **************************/
354
355             /* COULOMB ELECTROSTATICS */
356             velec            = _mm_mul_pd(qq01,rinv01);
357             felec            = _mm_mul_pd(velec,rinvsq01);
358
359             /* Update potential sum for this i atom from the interaction with this j atom. */
360             velecsum         = _mm_add_pd(velecsum,velec);
361
362             fscal            = felec;
363
364             /* Calculate temporary vectorial force */
365             tx               = _mm_mul_pd(fscal,dx01);
366             ty               = _mm_mul_pd(fscal,dy01);
367             tz               = _mm_mul_pd(fscal,dz01);
368
369             /* Update vectorial force */
370             fix0             = _mm_add_pd(fix0,tx);
371             fiy0             = _mm_add_pd(fiy0,ty);
372             fiz0             = _mm_add_pd(fiz0,tz);
373
374             fjx1             = _mm_add_pd(fjx1,tx);
375             fjy1             = _mm_add_pd(fjy1,ty);
376             fjz1             = _mm_add_pd(fjz1,tz);
377
378             /**************************
379              * CALCULATE INTERACTIONS *
380              **************************/
381
382             /* COULOMB ELECTROSTATICS */
383             velec            = _mm_mul_pd(qq02,rinv02);
384             felec            = _mm_mul_pd(velec,rinvsq02);
385
386             /* Update potential sum for this i atom from the interaction with this j atom. */
387             velecsum         = _mm_add_pd(velecsum,velec);
388
389             fscal            = felec;
390
391             /* Calculate temporary vectorial force */
392             tx               = _mm_mul_pd(fscal,dx02);
393             ty               = _mm_mul_pd(fscal,dy02);
394             tz               = _mm_mul_pd(fscal,dz02);
395
396             /* Update vectorial force */
397             fix0             = _mm_add_pd(fix0,tx);
398             fiy0             = _mm_add_pd(fiy0,ty);
399             fiz0             = _mm_add_pd(fiz0,tz);
400
401             fjx2             = _mm_add_pd(fjx2,tx);
402             fjy2             = _mm_add_pd(fjy2,ty);
403             fjz2             = _mm_add_pd(fjz2,tz);
404
405             /**************************
406              * CALCULATE INTERACTIONS *
407              **************************/
408
409             /* COULOMB ELECTROSTATICS */
410             velec            = _mm_mul_pd(qq10,rinv10);
411             felec            = _mm_mul_pd(velec,rinvsq10);
412
413             /* Update potential sum for this i atom from the interaction with this j atom. */
414             velecsum         = _mm_add_pd(velecsum,velec);
415
416             fscal            = felec;
417
418             /* Calculate temporary vectorial force */
419             tx               = _mm_mul_pd(fscal,dx10);
420             ty               = _mm_mul_pd(fscal,dy10);
421             tz               = _mm_mul_pd(fscal,dz10);
422
423             /* Update vectorial force */
424             fix1             = _mm_add_pd(fix1,tx);
425             fiy1             = _mm_add_pd(fiy1,ty);
426             fiz1             = _mm_add_pd(fiz1,tz);
427
428             fjx0             = _mm_add_pd(fjx0,tx);
429             fjy0             = _mm_add_pd(fjy0,ty);
430             fjz0             = _mm_add_pd(fjz0,tz);
431
432             /**************************
433              * CALCULATE INTERACTIONS *
434              **************************/
435
436             /* COULOMB ELECTROSTATICS */
437             velec            = _mm_mul_pd(qq11,rinv11);
438             felec            = _mm_mul_pd(velec,rinvsq11);
439
440             /* Update potential sum for this i atom from the interaction with this j atom. */
441             velecsum         = _mm_add_pd(velecsum,velec);
442
443             fscal            = felec;
444
445             /* Calculate temporary vectorial force */
446             tx               = _mm_mul_pd(fscal,dx11);
447             ty               = _mm_mul_pd(fscal,dy11);
448             tz               = _mm_mul_pd(fscal,dz11);
449
450             /* Update vectorial force */
451             fix1             = _mm_add_pd(fix1,tx);
452             fiy1             = _mm_add_pd(fiy1,ty);
453             fiz1             = _mm_add_pd(fiz1,tz);
454
455             fjx1             = _mm_add_pd(fjx1,tx);
456             fjy1             = _mm_add_pd(fjy1,ty);
457             fjz1             = _mm_add_pd(fjz1,tz);
458
459             /**************************
460              * CALCULATE INTERACTIONS *
461              **************************/
462
463             /* COULOMB ELECTROSTATICS */
464             velec            = _mm_mul_pd(qq12,rinv12);
465             felec            = _mm_mul_pd(velec,rinvsq12);
466
467             /* Update potential sum for this i atom from the interaction with this j atom. */
468             velecsum         = _mm_add_pd(velecsum,velec);
469
470             fscal            = felec;
471
472             /* Calculate temporary vectorial force */
473             tx               = _mm_mul_pd(fscal,dx12);
474             ty               = _mm_mul_pd(fscal,dy12);
475             tz               = _mm_mul_pd(fscal,dz12);
476
477             /* Update vectorial force */
478             fix1             = _mm_add_pd(fix1,tx);
479             fiy1             = _mm_add_pd(fiy1,ty);
480             fiz1             = _mm_add_pd(fiz1,tz);
481
482             fjx2             = _mm_add_pd(fjx2,tx);
483             fjy2             = _mm_add_pd(fjy2,ty);
484             fjz2             = _mm_add_pd(fjz2,tz);
485
486             /**************************
487              * CALCULATE INTERACTIONS *
488              **************************/
489
490             /* COULOMB ELECTROSTATICS */
491             velec            = _mm_mul_pd(qq20,rinv20);
492             felec            = _mm_mul_pd(velec,rinvsq20);
493
494             /* Update potential sum for this i atom from the interaction with this j atom. */
495             velecsum         = _mm_add_pd(velecsum,velec);
496
497             fscal            = felec;
498
499             /* Calculate temporary vectorial force */
500             tx               = _mm_mul_pd(fscal,dx20);
501             ty               = _mm_mul_pd(fscal,dy20);
502             tz               = _mm_mul_pd(fscal,dz20);
503
504             /* Update vectorial force */
505             fix2             = _mm_add_pd(fix2,tx);
506             fiy2             = _mm_add_pd(fiy2,ty);
507             fiz2             = _mm_add_pd(fiz2,tz);
508
509             fjx0             = _mm_add_pd(fjx0,tx);
510             fjy0             = _mm_add_pd(fjy0,ty);
511             fjz0             = _mm_add_pd(fjz0,tz);
512
513             /**************************
514              * CALCULATE INTERACTIONS *
515              **************************/
516
517             /* COULOMB ELECTROSTATICS */
518             velec            = _mm_mul_pd(qq21,rinv21);
519             felec            = _mm_mul_pd(velec,rinvsq21);
520
521             /* Update potential sum for this i atom from the interaction with this j atom. */
522             velecsum         = _mm_add_pd(velecsum,velec);
523
524             fscal            = felec;
525
526             /* Calculate temporary vectorial force */
527             tx               = _mm_mul_pd(fscal,dx21);
528             ty               = _mm_mul_pd(fscal,dy21);
529             tz               = _mm_mul_pd(fscal,dz21);
530
531             /* Update vectorial force */
532             fix2             = _mm_add_pd(fix2,tx);
533             fiy2             = _mm_add_pd(fiy2,ty);
534             fiz2             = _mm_add_pd(fiz2,tz);
535
536             fjx1             = _mm_add_pd(fjx1,tx);
537             fjy1             = _mm_add_pd(fjy1,ty);
538             fjz1             = _mm_add_pd(fjz1,tz);
539
540             /**************************
541              * CALCULATE INTERACTIONS *
542              **************************/
543
544             /* COULOMB ELECTROSTATICS */
545             velec            = _mm_mul_pd(qq22,rinv22);
546             felec            = _mm_mul_pd(velec,rinvsq22);
547
548             /* Update potential sum for this i atom from the interaction with this j atom. */
549             velecsum         = _mm_add_pd(velecsum,velec);
550
551             fscal            = felec;
552
553             /* Calculate temporary vectorial force */
554             tx               = _mm_mul_pd(fscal,dx22);
555             ty               = _mm_mul_pd(fscal,dy22);
556             tz               = _mm_mul_pd(fscal,dz22);
557
558             /* Update vectorial force */
559             fix2             = _mm_add_pd(fix2,tx);
560             fiy2             = _mm_add_pd(fiy2,ty);
561             fiz2             = _mm_add_pd(fiz2,tz);
562
563             fjx2             = _mm_add_pd(fjx2,tx);
564             fjy2             = _mm_add_pd(fjy2,ty);
565             fjz2             = _mm_add_pd(fjz2,tz);
566
567             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
568
569             /* Inner loop uses 287 flops */
570         }
571
572         if(jidx<j_index_end)
573         {
574
575             jnrA             = jjnr[jidx];
576             j_coord_offsetA  = DIM*jnrA;
577
578             /* load j atom coordinates */
579             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
580                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
581
582             /* Calculate displacement vector */
583             dx00             = _mm_sub_pd(ix0,jx0);
584             dy00             = _mm_sub_pd(iy0,jy0);
585             dz00             = _mm_sub_pd(iz0,jz0);
586             dx01             = _mm_sub_pd(ix0,jx1);
587             dy01             = _mm_sub_pd(iy0,jy1);
588             dz01             = _mm_sub_pd(iz0,jz1);
589             dx02             = _mm_sub_pd(ix0,jx2);
590             dy02             = _mm_sub_pd(iy0,jy2);
591             dz02             = _mm_sub_pd(iz0,jz2);
592             dx10             = _mm_sub_pd(ix1,jx0);
593             dy10             = _mm_sub_pd(iy1,jy0);
594             dz10             = _mm_sub_pd(iz1,jz0);
595             dx11             = _mm_sub_pd(ix1,jx1);
596             dy11             = _mm_sub_pd(iy1,jy1);
597             dz11             = _mm_sub_pd(iz1,jz1);
598             dx12             = _mm_sub_pd(ix1,jx2);
599             dy12             = _mm_sub_pd(iy1,jy2);
600             dz12             = _mm_sub_pd(iz1,jz2);
601             dx20             = _mm_sub_pd(ix2,jx0);
602             dy20             = _mm_sub_pd(iy2,jy0);
603             dz20             = _mm_sub_pd(iz2,jz0);
604             dx21             = _mm_sub_pd(ix2,jx1);
605             dy21             = _mm_sub_pd(iy2,jy1);
606             dz21             = _mm_sub_pd(iz2,jz1);
607             dx22             = _mm_sub_pd(ix2,jx2);
608             dy22             = _mm_sub_pd(iy2,jy2);
609             dz22             = _mm_sub_pd(iz2,jz2);
610
611             /* Calculate squared distance and things based on it */
612             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
613             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
614             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
615             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
616             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
617             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
618             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
619             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
620             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
621
622             rinv00           = gmx_mm_invsqrt_pd(rsq00);
623             rinv01           = gmx_mm_invsqrt_pd(rsq01);
624             rinv02           = gmx_mm_invsqrt_pd(rsq02);
625             rinv10           = gmx_mm_invsqrt_pd(rsq10);
626             rinv11           = gmx_mm_invsqrt_pd(rsq11);
627             rinv12           = gmx_mm_invsqrt_pd(rsq12);
628             rinv20           = gmx_mm_invsqrt_pd(rsq20);
629             rinv21           = gmx_mm_invsqrt_pd(rsq21);
630             rinv22           = gmx_mm_invsqrt_pd(rsq22);
631
632             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
633             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
634             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
635             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
636             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
637             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
638             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
639             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
640             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
641
642             fjx0             = _mm_setzero_pd();
643             fjy0             = _mm_setzero_pd();
644             fjz0             = _mm_setzero_pd();
645             fjx1             = _mm_setzero_pd();
646             fjy1             = _mm_setzero_pd();
647             fjz1             = _mm_setzero_pd();
648             fjx2             = _mm_setzero_pd();
649             fjy2             = _mm_setzero_pd();
650             fjz2             = _mm_setzero_pd();
651
652             /**************************
653              * CALCULATE INTERACTIONS *
654              **************************/
655
656             r00              = _mm_mul_pd(rsq00,rinv00);
657
658             /* Calculate table index by multiplying r with table scale and truncate to integer */
659             rt               = _mm_mul_pd(r00,vftabscale);
660             vfitab           = _mm_cvttpd_epi32(rt);
661             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
662             vfitab           = _mm_slli_epi32(vfitab,3);
663
664             /* COULOMB ELECTROSTATICS */
665             velec            = _mm_mul_pd(qq00,rinv00);
666             felec            = _mm_mul_pd(velec,rinvsq00);
667
668             /* CUBIC SPLINE TABLE DISPERSION */
669             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
670             F                = _mm_setzero_pd();
671             GMX_MM_TRANSPOSE2_PD(Y,F);
672             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
673             H                = _mm_setzero_pd();
674             GMX_MM_TRANSPOSE2_PD(G,H);
675             Heps             = _mm_mul_pd(vfeps,H);
676             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
677             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
678             vvdw6            = _mm_mul_pd(c6_00,VV);
679             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
680             fvdw6            = _mm_mul_pd(c6_00,FF);
681
682             /* CUBIC SPLINE TABLE REPULSION */
683             vfitab           = _mm_add_epi32(vfitab,ifour);
684             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
685             F                = _mm_setzero_pd();
686             GMX_MM_TRANSPOSE2_PD(Y,F);
687             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
688             H                = _mm_setzero_pd();
689             GMX_MM_TRANSPOSE2_PD(G,H);
690             Heps             = _mm_mul_pd(vfeps,H);
691             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
692             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
693             vvdw12           = _mm_mul_pd(c12_00,VV);
694             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
695             fvdw12           = _mm_mul_pd(c12_00,FF);
696             vvdw             = _mm_add_pd(vvdw12,vvdw6);
697             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
698
699             /* Update potential sum for this i atom from the interaction with this j atom. */
700             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
701             velecsum         = _mm_add_pd(velecsum,velec);
702             vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
703             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
704
705             fscal            = _mm_add_pd(felec,fvdw);
706
707             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
708
709             /* Calculate temporary vectorial force */
710             tx               = _mm_mul_pd(fscal,dx00);
711             ty               = _mm_mul_pd(fscal,dy00);
712             tz               = _mm_mul_pd(fscal,dz00);
713
714             /* Update vectorial force */
715             fix0             = _mm_add_pd(fix0,tx);
716             fiy0             = _mm_add_pd(fiy0,ty);
717             fiz0             = _mm_add_pd(fiz0,tz);
718
719             fjx0             = _mm_add_pd(fjx0,tx);
720             fjy0             = _mm_add_pd(fjy0,ty);
721             fjz0             = _mm_add_pd(fjz0,tz);
722
723             /**************************
724              * CALCULATE INTERACTIONS *
725              **************************/
726
727             /* COULOMB ELECTROSTATICS */
728             velec            = _mm_mul_pd(qq01,rinv01);
729             felec            = _mm_mul_pd(velec,rinvsq01);
730
731             /* Update potential sum for this i atom from the interaction with this j atom. */
732             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
733             velecsum         = _mm_add_pd(velecsum,velec);
734
735             fscal            = felec;
736
737             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
738
739             /* Calculate temporary vectorial force */
740             tx               = _mm_mul_pd(fscal,dx01);
741             ty               = _mm_mul_pd(fscal,dy01);
742             tz               = _mm_mul_pd(fscal,dz01);
743
744             /* Update vectorial force */
745             fix0             = _mm_add_pd(fix0,tx);
746             fiy0             = _mm_add_pd(fiy0,ty);
747             fiz0             = _mm_add_pd(fiz0,tz);
748
749             fjx1             = _mm_add_pd(fjx1,tx);
750             fjy1             = _mm_add_pd(fjy1,ty);
751             fjz1             = _mm_add_pd(fjz1,tz);
752
753             /**************************
754              * CALCULATE INTERACTIONS *
755              **************************/
756
757             /* COULOMB ELECTROSTATICS */
758             velec            = _mm_mul_pd(qq02,rinv02);
759             felec            = _mm_mul_pd(velec,rinvsq02);
760
761             /* Update potential sum for this i atom from the interaction with this j atom. */
762             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
763             velecsum         = _mm_add_pd(velecsum,velec);
764
765             fscal            = felec;
766
767             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
768
769             /* Calculate temporary vectorial force */
770             tx               = _mm_mul_pd(fscal,dx02);
771             ty               = _mm_mul_pd(fscal,dy02);
772             tz               = _mm_mul_pd(fscal,dz02);
773
774             /* Update vectorial force */
775             fix0             = _mm_add_pd(fix0,tx);
776             fiy0             = _mm_add_pd(fiy0,ty);
777             fiz0             = _mm_add_pd(fiz0,tz);
778
779             fjx2             = _mm_add_pd(fjx2,tx);
780             fjy2             = _mm_add_pd(fjy2,ty);
781             fjz2             = _mm_add_pd(fjz2,tz);
782
783             /**************************
784              * CALCULATE INTERACTIONS *
785              **************************/
786
787             /* COULOMB ELECTROSTATICS */
788             velec            = _mm_mul_pd(qq10,rinv10);
789             felec            = _mm_mul_pd(velec,rinvsq10);
790
791             /* Update potential sum for this i atom from the interaction with this j atom. */
792             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
793             velecsum         = _mm_add_pd(velecsum,velec);
794
795             fscal            = felec;
796
797             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
798
799             /* Calculate temporary vectorial force */
800             tx               = _mm_mul_pd(fscal,dx10);
801             ty               = _mm_mul_pd(fscal,dy10);
802             tz               = _mm_mul_pd(fscal,dz10);
803
804             /* Update vectorial force */
805             fix1             = _mm_add_pd(fix1,tx);
806             fiy1             = _mm_add_pd(fiy1,ty);
807             fiz1             = _mm_add_pd(fiz1,tz);
808
809             fjx0             = _mm_add_pd(fjx0,tx);
810             fjy0             = _mm_add_pd(fjy0,ty);
811             fjz0             = _mm_add_pd(fjz0,tz);
812
813             /**************************
814              * CALCULATE INTERACTIONS *
815              **************************/
816
817             /* COULOMB ELECTROSTATICS */
818             velec            = _mm_mul_pd(qq11,rinv11);
819             felec            = _mm_mul_pd(velec,rinvsq11);
820
821             /* Update potential sum for this i atom from the interaction with this j atom. */
822             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
823             velecsum         = _mm_add_pd(velecsum,velec);
824
825             fscal            = felec;
826
827             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
828
829             /* Calculate temporary vectorial force */
830             tx               = _mm_mul_pd(fscal,dx11);
831             ty               = _mm_mul_pd(fscal,dy11);
832             tz               = _mm_mul_pd(fscal,dz11);
833
834             /* Update vectorial force */
835             fix1             = _mm_add_pd(fix1,tx);
836             fiy1             = _mm_add_pd(fiy1,ty);
837             fiz1             = _mm_add_pd(fiz1,tz);
838
839             fjx1             = _mm_add_pd(fjx1,tx);
840             fjy1             = _mm_add_pd(fjy1,ty);
841             fjz1             = _mm_add_pd(fjz1,tz);
842
843             /**************************
844              * CALCULATE INTERACTIONS *
845              **************************/
846
847             /* COULOMB ELECTROSTATICS */
848             velec            = _mm_mul_pd(qq12,rinv12);
849             felec            = _mm_mul_pd(velec,rinvsq12);
850
851             /* Update potential sum for this i atom from the interaction with this j atom. */
852             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
853             velecsum         = _mm_add_pd(velecsum,velec);
854
855             fscal            = felec;
856
857             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
858
859             /* Calculate temporary vectorial force */
860             tx               = _mm_mul_pd(fscal,dx12);
861             ty               = _mm_mul_pd(fscal,dy12);
862             tz               = _mm_mul_pd(fscal,dz12);
863
864             /* Update vectorial force */
865             fix1             = _mm_add_pd(fix1,tx);
866             fiy1             = _mm_add_pd(fiy1,ty);
867             fiz1             = _mm_add_pd(fiz1,tz);
868
869             fjx2             = _mm_add_pd(fjx2,tx);
870             fjy2             = _mm_add_pd(fjy2,ty);
871             fjz2             = _mm_add_pd(fjz2,tz);
872
873             /**************************
874              * CALCULATE INTERACTIONS *
875              **************************/
876
877             /* COULOMB ELECTROSTATICS */
878             velec            = _mm_mul_pd(qq20,rinv20);
879             felec            = _mm_mul_pd(velec,rinvsq20);
880
881             /* Update potential sum for this i atom from the interaction with this j atom. */
882             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
883             velecsum         = _mm_add_pd(velecsum,velec);
884
885             fscal            = felec;
886
887             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
888
889             /* Calculate temporary vectorial force */
890             tx               = _mm_mul_pd(fscal,dx20);
891             ty               = _mm_mul_pd(fscal,dy20);
892             tz               = _mm_mul_pd(fscal,dz20);
893
894             /* Update vectorial force */
895             fix2             = _mm_add_pd(fix2,tx);
896             fiy2             = _mm_add_pd(fiy2,ty);
897             fiz2             = _mm_add_pd(fiz2,tz);
898
899             fjx0             = _mm_add_pd(fjx0,tx);
900             fjy0             = _mm_add_pd(fjy0,ty);
901             fjz0             = _mm_add_pd(fjz0,tz);
902
903             /**************************
904              * CALCULATE INTERACTIONS *
905              **************************/
906
907             /* COULOMB ELECTROSTATICS */
908             velec            = _mm_mul_pd(qq21,rinv21);
909             felec            = _mm_mul_pd(velec,rinvsq21);
910
911             /* Update potential sum for this i atom from the interaction with this j atom. */
912             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
913             velecsum         = _mm_add_pd(velecsum,velec);
914
915             fscal            = felec;
916
917             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
918
919             /* Calculate temporary vectorial force */
920             tx               = _mm_mul_pd(fscal,dx21);
921             ty               = _mm_mul_pd(fscal,dy21);
922             tz               = _mm_mul_pd(fscal,dz21);
923
924             /* Update vectorial force */
925             fix2             = _mm_add_pd(fix2,tx);
926             fiy2             = _mm_add_pd(fiy2,ty);
927             fiz2             = _mm_add_pd(fiz2,tz);
928
929             fjx1             = _mm_add_pd(fjx1,tx);
930             fjy1             = _mm_add_pd(fjy1,ty);
931             fjz1             = _mm_add_pd(fjz1,tz);
932
933             /**************************
934              * CALCULATE INTERACTIONS *
935              **************************/
936
937             /* COULOMB ELECTROSTATICS */
938             velec            = _mm_mul_pd(qq22,rinv22);
939             felec            = _mm_mul_pd(velec,rinvsq22);
940
941             /* Update potential sum for this i atom from the interaction with this j atom. */
942             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
943             velecsum         = _mm_add_pd(velecsum,velec);
944
945             fscal            = felec;
946
947             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
948
949             /* Calculate temporary vectorial force */
950             tx               = _mm_mul_pd(fscal,dx22);
951             ty               = _mm_mul_pd(fscal,dy22);
952             tz               = _mm_mul_pd(fscal,dz22);
953
954             /* Update vectorial force */
955             fix2             = _mm_add_pd(fix2,tx);
956             fiy2             = _mm_add_pd(fiy2,ty);
957             fiz2             = _mm_add_pd(fiz2,tz);
958
959             fjx2             = _mm_add_pd(fjx2,tx);
960             fjy2             = _mm_add_pd(fjy2,ty);
961             fjz2             = _mm_add_pd(fjz2,tz);
962
963             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
964
965             /* Inner loop uses 287 flops */
966         }
967
968         /* End of innermost loop */
969
970         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
971                                               f+i_coord_offset,fshift+i_shift_offset);
972
973         ggid                        = gid[iidx];
974         /* Update potential energies */
975         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
976         gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
977
978         /* Increment number of inner iterations */
979         inneriter                  += j_index_end - j_index_start;
980
981         /* Outer loop uses 20 flops */
982     }
983
984     /* Increment number of outer iterations */
985     outeriter        += nri;
986
987     /* Update outer/inner flops */
988
989     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*287);
990 }
991 /*
992  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse4_1_double
993  * Electrostatics interaction: Coulomb
994  * VdW interaction:            CubicSplineTable
995  * Geometry:                   Water3-Water3
996  * Calculate force/pot:        Force
997  */
998 void
999 nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse4_1_double
1000                     (t_nblist                    * gmx_restrict       nlist,
1001                      rvec                        * gmx_restrict          xx,
1002                      rvec                        * gmx_restrict          ff,
1003                      t_forcerec                  * gmx_restrict          fr,
1004                      t_mdatoms                   * gmx_restrict     mdatoms,
1005                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1006                      t_nrnb                      * gmx_restrict        nrnb)
1007 {
1008     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1009      * just 0 for non-waters.
1010      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1011      * jnr indices corresponding to data put in the four positions in the SIMD register.
1012      */
1013     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1014     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1015     int              jnrA,jnrB;
1016     int              j_coord_offsetA,j_coord_offsetB;
1017     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1018     real             rcutoff_scalar;
1019     real             *shiftvec,*fshift,*x,*f;
1020     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1021     int              vdwioffset0;
1022     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1023     int              vdwioffset1;
1024     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1025     int              vdwioffset2;
1026     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1027     int              vdwjidx0A,vdwjidx0B;
1028     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1029     int              vdwjidx1A,vdwjidx1B;
1030     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1031     int              vdwjidx2A,vdwjidx2B;
1032     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1033     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1034     __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1035     __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1036     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1037     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1038     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1039     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1040     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1041     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1042     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
1043     real             *charge;
1044     int              nvdwtype;
1045     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1046     int              *vdwtype;
1047     real             *vdwparam;
1048     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
1049     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
1050     __m128i          vfitab;
1051     __m128i          ifour       = _mm_set1_epi32(4);
1052     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1053     real             *vftab;
1054     __m128d          dummy_mask,cutoff_mask;
1055     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1056     __m128d          one     = _mm_set1_pd(1.0);
1057     __m128d          two     = _mm_set1_pd(2.0);
1058     x                = xx[0];
1059     f                = ff[0];
1060
1061     nri              = nlist->nri;
1062     iinr             = nlist->iinr;
1063     jindex           = nlist->jindex;
1064     jjnr             = nlist->jjnr;
1065     shiftidx         = nlist->shift;
1066     gid              = nlist->gid;
1067     shiftvec         = fr->shift_vec[0];
1068     fshift           = fr->fshift[0];
1069     facel            = _mm_set1_pd(fr->epsfac);
1070     charge           = mdatoms->chargeA;
1071     nvdwtype         = fr->ntype;
1072     vdwparam         = fr->nbfp;
1073     vdwtype          = mdatoms->typeA;
1074
1075     vftab            = kernel_data->table_vdw->data;
1076     vftabscale       = _mm_set1_pd(kernel_data->table_vdw->scale);
1077
1078     /* Setup water-specific parameters */
1079     inr              = nlist->iinr[0];
1080     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1081     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1082     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1083     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1084
1085     jq0              = _mm_set1_pd(charge[inr+0]);
1086     jq1              = _mm_set1_pd(charge[inr+1]);
1087     jq2              = _mm_set1_pd(charge[inr+2]);
1088     vdwjidx0A        = 2*vdwtype[inr+0];
1089     qq00             = _mm_mul_pd(iq0,jq0);
1090     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1091     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1092     qq01             = _mm_mul_pd(iq0,jq1);
1093     qq02             = _mm_mul_pd(iq0,jq2);
1094     qq10             = _mm_mul_pd(iq1,jq0);
1095     qq11             = _mm_mul_pd(iq1,jq1);
1096     qq12             = _mm_mul_pd(iq1,jq2);
1097     qq20             = _mm_mul_pd(iq2,jq0);
1098     qq21             = _mm_mul_pd(iq2,jq1);
1099     qq22             = _mm_mul_pd(iq2,jq2);
1100
1101     /* Avoid stupid compiler warnings */
1102     jnrA = jnrB = 0;
1103     j_coord_offsetA = 0;
1104     j_coord_offsetB = 0;
1105
1106     outeriter        = 0;
1107     inneriter        = 0;
1108
1109     /* Start outer loop over neighborlists */
1110     for(iidx=0; iidx<nri; iidx++)
1111     {
1112         /* Load shift vector for this list */
1113         i_shift_offset   = DIM*shiftidx[iidx];
1114
1115         /* Load limits for loop over neighbors */
1116         j_index_start    = jindex[iidx];
1117         j_index_end      = jindex[iidx+1];
1118
1119         /* Get outer coordinate index */
1120         inr              = iinr[iidx];
1121         i_coord_offset   = DIM*inr;
1122
1123         /* Load i particle coords and add shift vector */
1124         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1125                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1126
1127         fix0             = _mm_setzero_pd();
1128         fiy0             = _mm_setzero_pd();
1129         fiz0             = _mm_setzero_pd();
1130         fix1             = _mm_setzero_pd();
1131         fiy1             = _mm_setzero_pd();
1132         fiz1             = _mm_setzero_pd();
1133         fix2             = _mm_setzero_pd();
1134         fiy2             = _mm_setzero_pd();
1135         fiz2             = _mm_setzero_pd();
1136
1137         /* Start inner kernel loop */
1138         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1139         {
1140
1141             /* Get j neighbor index, and coordinate index */
1142             jnrA             = jjnr[jidx];
1143             jnrB             = jjnr[jidx+1];
1144             j_coord_offsetA  = DIM*jnrA;
1145             j_coord_offsetB  = DIM*jnrB;
1146
1147             /* load j atom coordinates */
1148             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1149                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1150
1151             /* Calculate displacement vector */
1152             dx00             = _mm_sub_pd(ix0,jx0);
1153             dy00             = _mm_sub_pd(iy0,jy0);
1154             dz00             = _mm_sub_pd(iz0,jz0);
1155             dx01             = _mm_sub_pd(ix0,jx1);
1156             dy01             = _mm_sub_pd(iy0,jy1);
1157             dz01             = _mm_sub_pd(iz0,jz1);
1158             dx02             = _mm_sub_pd(ix0,jx2);
1159             dy02             = _mm_sub_pd(iy0,jy2);
1160             dz02             = _mm_sub_pd(iz0,jz2);
1161             dx10             = _mm_sub_pd(ix1,jx0);
1162             dy10             = _mm_sub_pd(iy1,jy0);
1163             dz10             = _mm_sub_pd(iz1,jz0);
1164             dx11             = _mm_sub_pd(ix1,jx1);
1165             dy11             = _mm_sub_pd(iy1,jy1);
1166             dz11             = _mm_sub_pd(iz1,jz1);
1167             dx12             = _mm_sub_pd(ix1,jx2);
1168             dy12             = _mm_sub_pd(iy1,jy2);
1169             dz12             = _mm_sub_pd(iz1,jz2);
1170             dx20             = _mm_sub_pd(ix2,jx0);
1171             dy20             = _mm_sub_pd(iy2,jy0);
1172             dz20             = _mm_sub_pd(iz2,jz0);
1173             dx21             = _mm_sub_pd(ix2,jx1);
1174             dy21             = _mm_sub_pd(iy2,jy1);
1175             dz21             = _mm_sub_pd(iz2,jz1);
1176             dx22             = _mm_sub_pd(ix2,jx2);
1177             dy22             = _mm_sub_pd(iy2,jy2);
1178             dz22             = _mm_sub_pd(iz2,jz2);
1179
1180             /* Calculate squared distance and things based on it */
1181             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1182             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1183             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1184             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1185             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1186             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1187             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1188             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1189             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1190
1191             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1192             rinv01           = gmx_mm_invsqrt_pd(rsq01);
1193             rinv02           = gmx_mm_invsqrt_pd(rsq02);
1194             rinv10           = gmx_mm_invsqrt_pd(rsq10);
1195             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1196             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1197             rinv20           = gmx_mm_invsqrt_pd(rsq20);
1198             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1199             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1200
1201             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
1202             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
1203             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
1204             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
1205             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1206             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1207             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
1208             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1209             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1210
1211             fjx0             = _mm_setzero_pd();
1212             fjy0             = _mm_setzero_pd();
1213             fjz0             = _mm_setzero_pd();
1214             fjx1             = _mm_setzero_pd();
1215             fjy1             = _mm_setzero_pd();
1216             fjz1             = _mm_setzero_pd();
1217             fjx2             = _mm_setzero_pd();
1218             fjy2             = _mm_setzero_pd();
1219             fjz2             = _mm_setzero_pd();
1220
1221             /**************************
1222              * CALCULATE INTERACTIONS *
1223              **************************/
1224
1225             r00              = _mm_mul_pd(rsq00,rinv00);
1226
1227             /* Calculate table index by multiplying r with table scale and truncate to integer */
1228             rt               = _mm_mul_pd(r00,vftabscale);
1229             vfitab           = _mm_cvttpd_epi32(rt);
1230             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1231             vfitab           = _mm_slli_epi32(vfitab,3);
1232
1233             /* COULOMB ELECTROSTATICS */
1234             velec            = _mm_mul_pd(qq00,rinv00);
1235             felec            = _mm_mul_pd(velec,rinvsq00);
1236
1237             /* CUBIC SPLINE TABLE DISPERSION */
1238             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1239             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1240             GMX_MM_TRANSPOSE2_PD(Y,F);
1241             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1242             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1243             GMX_MM_TRANSPOSE2_PD(G,H);
1244             Heps             = _mm_mul_pd(vfeps,H);
1245             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1246             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1247             fvdw6            = _mm_mul_pd(c6_00,FF);
1248
1249             /* CUBIC SPLINE TABLE REPULSION */
1250             vfitab           = _mm_add_epi32(vfitab,ifour);
1251             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1252             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1253             GMX_MM_TRANSPOSE2_PD(Y,F);
1254             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1255             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1256             GMX_MM_TRANSPOSE2_PD(G,H);
1257             Heps             = _mm_mul_pd(vfeps,H);
1258             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1259             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1260             fvdw12           = _mm_mul_pd(c12_00,FF);
1261             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1262
1263             fscal            = _mm_add_pd(felec,fvdw);
1264
1265             /* Calculate temporary vectorial force */
1266             tx               = _mm_mul_pd(fscal,dx00);
1267             ty               = _mm_mul_pd(fscal,dy00);
1268             tz               = _mm_mul_pd(fscal,dz00);
1269
1270             /* Update vectorial force */
1271             fix0             = _mm_add_pd(fix0,tx);
1272             fiy0             = _mm_add_pd(fiy0,ty);
1273             fiz0             = _mm_add_pd(fiz0,tz);
1274
1275             fjx0             = _mm_add_pd(fjx0,tx);
1276             fjy0             = _mm_add_pd(fjy0,ty);
1277             fjz0             = _mm_add_pd(fjz0,tz);
1278
1279             /**************************
1280              * CALCULATE INTERACTIONS *
1281              **************************/
1282
1283             /* COULOMB ELECTROSTATICS */
1284             velec            = _mm_mul_pd(qq01,rinv01);
1285             felec            = _mm_mul_pd(velec,rinvsq01);
1286
1287             fscal            = felec;
1288
1289             /* Calculate temporary vectorial force */
1290             tx               = _mm_mul_pd(fscal,dx01);
1291             ty               = _mm_mul_pd(fscal,dy01);
1292             tz               = _mm_mul_pd(fscal,dz01);
1293
1294             /* Update vectorial force */
1295             fix0             = _mm_add_pd(fix0,tx);
1296             fiy0             = _mm_add_pd(fiy0,ty);
1297             fiz0             = _mm_add_pd(fiz0,tz);
1298
1299             fjx1             = _mm_add_pd(fjx1,tx);
1300             fjy1             = _mm_add_pd(fjy1,ty);
1301             fjz1             = _mm_add_pd(fjz1,tz);
1302
1303             /**************************
1304              * CALCULATE INTERACTIONS *
1305              **************************/
1306
1307             /* COULOMB ELECTROSTATICS */
1308             velec            = _mm_mul_pd(qq02,rinv02);
1309             felec            = _mm_mul_pd(velec,rinvsq02);
1310
1311             fscal            = felec;
1312
1313             /* Calculate temporary vectorial force */
1314             tx               = _mm_mul_pd(fscal,dx02);
1315             ty               = _mm_mul_pd(fscal,dy02);
1316             tz               = _mm_mul_pd(fscal,dz02);
1317
1318             /* Update vectorial force */
1319             fix0             = _mm_add_pd(fix0,tx);
1320             fiy0             = _mm_add_pd(fiy0,ty);
1321             fiz0             = _mm_add_pd(fiz0,tz);
1322
1323             fjx2             = _mm_add_pd(fjx2,tx);
1324             fjy2             = _mm_add_pd(fjy2,ty);
1325             fjz2             = _mm_add_pd(fjz2,tz);
1326
1327             /**************************
1328              * CALCULATE INTERACTIONS *
1329              **************************/
1330
1331             /* COULOMB ELECTROSTATICS */
1332             velec            = _mm_mul_pd(qq10,rinv10);
1333             felec            = _mm_mul_pd(velec,rinvsq10);
1334
1335             fscal            = felec;
1336
1337             /* Calculate temporary vectorial force */
1338             tx               = _mm_mul_pd(fscal,dx10);
1339             ty               = _mm_mul_pd(fscal,dy10);
1340             tz               = _mm_mul_pd(fscal,dz10);
1341
1342             /* Update vectorial force */
1343             fix1             = _mm_add_pd(fix1,tx);
1344             fiy1             = _mm_add_pd(fiy1,ty);
1345             fiz1             = _mm_add_pd(fiz1,tz);
1346
1347             fjx0             = _mm_add_pd(fjx0,tx);
1348             fjy0             = _mm_add_pd(fjy0,ty);
1349             fjz0             = _mm_add_pd(fjz0,tz);
1350
1351             /**************************
1352              * CALCULATE INTERACTIONS *
1353              **************************/
1354
1355             /* COULOMB ELECTROSTATICS */
1356             velec            = _mm_mul_pd(qq11,rinv11);
1357             felec            = _mm_mul_pd(velec,rinvsq11);
1358
1359             fscal            = felec;
1360
1361             /* Calculate temporary vectorial force */
1362             tx               = _mm_mul_pd(fscal,dx11);
1363             ty               = _mm_mul_pd(fscal,dy11);
1364             tz               = _mm_mul_pd(fscal,dz11);
1365
1366             /* Update vectorial force */
1367             fix1             = _mm_add_pd(fix1,tx);
1368             fiy1             = _mm_add_pd(fiy1,ty);
1369             fiz1             = _mm_add_pd(fiz1,tz);
1370
1371             fjx1             = _mm_add_pd(fjx1,tx);
1372             fjy1             = _mm_add_pd(fjy1,ty);
1373             fjz1             = _mm_add_pd(fjz1,tz);
1374
1375             /**************************
1376              * CALCULATE INTERACTIONS *
1377              **************************/
1378
1379             /* COULOMB ELECTROSTATICS */
1380             velec            = _mm_mul_pd(qq12,rinv12);
1381             felec            = _mm_mul_pd(velec,rinvsq12);
1382
1383             fscal            = felec;
1384
1385             /* Calculate temporary vectorial force */
1386             tx               = _mm_mul_pd(fscal,dx12);
1387             ty               = _mm_mul_pd(fscal,dy12);
1388             tz               = _mm_mul_pd(fscal,dz12);
1389
1390             /* Update vectorial force */
1391             fix1             = _mm_add_pd(fix1,tx);
1392             fiy1             = _mm_add_pd(fiy1,ty);
1393             fiz1             = _mm_add_pd(fiz1,tz);
1394
1395             fjx2             = _mm_add_pd(fjx2,tx);
1396             fjy2             = _mm_add_pd(fjy2,ty);
1397             fjz2             = _mm_add_pd(fjz2,tz);
1398
1399             /**************************
1400              * CALCULATE INTERACTIONS *
1401              **************************/
1402
1403             /* COULOMB ELECTROSTATICS */
1404             velec            = _mm_mul_pd(qq20,rinv20);
1405             felec            = _mm_mul_pd(velec,rinvsq20);
1406
1407             fscal            = felec;
1408
1409             /* Calculate temporary vectorial force */
1410             tx               = _mm_mul_pd(fscal,dx20);
1411             ty               = _mm_mul_pd(fscal,dy20);
1412             tz               = _mm_mul_pd(fscal,dz20);
1413
1414             /* Update vectorial force */
1415             fix2             = _mm_add_pd(fix2,tx);
1416             fiy2             = _mm_add_pd(fiy2,ty);
1417             fiz2             = _mm_add_pd(fiz2,tz);
1418
1419             fjx0             = _mm_add_pd(fjx0,tx);
1420             fjy0             = _mm_add_pd(fjy0,ty);
1421             fjz0             = _mm_add_pd(fjz0,tz);
1422
1423             /**************************
1424              * CALCULATE INTERACTIONS *
1425              **************************/
1426
1427             /* COULOMB ELECTROSTATICS */
1428             velec            = _mm_mul_pd(qq21,rinv21);
1429             felec            = _mm_mul_pd(velec,rinvsq21);
1430
1431             fscal            = felec;
1432
1433             /* Calculate temporary vectorial force */
1434             tx               = _mm_mul_pd(fscal,dx21);
1435             ty               = _mm_mul_pd(fscal,dy21);
1436             tz               = _mm_mul_pd(fscal,dz21);
1437
1438             /* Update vectorial force */
1439             fix2             = _mm_add_pd(fix2,tx);
1440             fiy2             = _mm_add_pd(fiy2,ty);
1441             fiz2             = _mm_add_pd(fiz2,tz);
1442
1443             fjx1             = _mm_add_pd(fjx1,tx);
1444             fjy1             = _mm_add_pd(fjy1,ty);
1445             fjz1             = _mm_add_pd(fjz1,tz);
1446
1447             /**************************
1448              * CALCULATE INTERACTIONS *
1449              **************************/
1450
1451             /* COULOMB ELECTROSTATICS */
1452             velec            = _mm_mul_pd(qq22,rinv22);
1453             felec            = _mm_mul_pd(velec,rinvsq22);
1454
1455             fscal            = felec;
1456
1457             /* Calculate temporary vectorial force */
1458             tx               = _mm_mul_pd(fscal,dx22);
1459             ty               = _mm_mul_pd(fscal,dy22);
1460             tz               = _mm_mul_pd(fscal,dz22);
1461
1462             /* Update vectorial force */
1463             fix2             = _mm_add_pd(fix2,tx);
1464             fiy2             = _mm_add_pd(fiy2,ty);
1465             fiz2             = _mm_add_pd(fiz2,tz);
1466
1467             fjx2             = _mm_add_pd(fjx2,tx);
1468             fjy2             = _mm_add_pd(fjy2,ty);
1469             fjz2             = _mm_add_pd(fjz2,tz);
1470
1471             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1472
1473             /* Inner loop uses 270 flops */
1474         }
1475
1476         if(jidx<j_index_end)
1477         {
1478
1479             jnrA             = jjnr[jidx];
1480             j_coord_offsetA  = DIM*jnrA;
1481
1482             /* load j atom coordinates */
1483             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1484                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1485
1486             /* Calculate displacement vector */
1487             dx00             = _mm_sub_pd(ix0,jx0);
1488             dy00             = _mm_sub_pd(iy0,jy0);
1489             dz00             = _mm_sub_pd(iz0,jz0);
1490             dx01             = _mm_sub_pd(ix0,jx1);
1491             dy01             = _mm_sub_pd(iy0,jy1);
1492             dz01             = _mm_sub_pd(iz0,jz1);
1493             dx02             = _mm_sub_pd(ix0,jx2);
1494             dy02             = _mm_sub_pd(iy0,jy2);
1495             dz02             = _mm_sub_pd(iz0,jz2);
1496             dx10             = _mm_sub_pd(ix1,jx0);
1497             dy10             = _mm_sub_pd(iy1,jy0);
1498             dz10             = _mm_sub_pd(iz1,jz0);
1499             dx11             = _mm_sub_pd(ix1,jx1);
1500             dy11             = _mm_sub_pd(iy1,jy1);
1501             dz11             = _mm_sub_pd(iz1,jz1);
1502             dx12             = _mm_sub_pd(ix1,jx2);
1503             dy12             = _mm_sub_pd(iy1,jy2);
1504             dz12             = _mm_sub_pd(iz1,jz2);
1505             dx20             = _mm_sub_pd(ix2,jx0);
1506             dy20             = _mm_sub_pd(iy2,jy0);
1507             dz20             = _mm_sub_pd(iz2,jz0);
1508             dx21             = _mm_sub_pd(ix2,jx1);
1509             dy21             = _mm_sub_pd(iy2,jy1);
1510             dz21             = _mm_sub_pd(iz2,jz1);
1511             dx22             = _mm_sub_pd(ix2,jx2);
1512             dy22             = _mm_sub_pd(iy2,jy2);
1513             dz22             = _mm_sub_pd(iz2,jz2);
1514
1515             /* Calculate squared distance and things based on it */
1516             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1517             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1518             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1519             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1520             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1521             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1522             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1523             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1524             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1525
1526             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1527             rinv01           = gmx_mm_invsqrt_pd(rsq01);
1528             rinv02           = gmx_mm_invsqrt_pd(rsq02);
1529             rinv10           = gmx_mm_invsqrt_pd(rsq10);
1530             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1531             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1532             rinv20           = gmx_mm_invsqrt_pd(rsq20);
1533             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1534             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1535
1536             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
1537             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
1538             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
1539             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
1540             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1541             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1542             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
1543             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1544             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1545
1546             fjx0             = _mm_setzero_pd();
1547             fjy0             = _mm_setzero_pd();
1548             fjz0             = _mm_setzero_pd();
1549             fjx1             = _mm_setzero_pd();
1550             fjy1             = _mm_setzero_pd();
1551             fjz1             = _mm_setzero_pd();
1552             fjx2             = _mm_setzero_pd();
1553             fjy2             = _mm_setzero_pd();
1554             fjz2             = _mm_setzero_pd();
1555
1556             /**************************
1557              * CALCULATE INTERACTIONS *
1558              **************************/
1559
1560             r00              = _mm_mul_pd(rsq00,rinv00);
1561
1562             /* Calculate table index by multiplying r with table scale and truncate to integer */
1563             rt               = _mm_mul_pd(r00,vftabscale);
1564             vfitab           = _mm_cvttpd_epi32(rt);
1565             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1566             vfitab           = _mm_slli_epi32(vfitab,3);
1567
1568             /* COULOMB ELECTROSTATICS */
1569             velec            = _mm_mul_pd(qq00,rinv00);
1570             felec            = _mm_mul_pd(velec,rinvsq00);
1571
1572             /* CUBIC SPLINE TABLE DISPERSION */
1573             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1574             F                = _mm_setzero_pd();
1575             GMX_MM_TRANSPOSE2_PD(Y,F);
1576             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1577             H                = _mm_setzero_pd();
1578             GMX_MM_TRANSPOSE2_PD(G,H);
1579             Heps             = _mm_mul_pd(vfeps,H);
1580             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1581             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1582             fvdw6            = _mm_mul_pd(c6_00,FF);
1583
1584             /* CUBIC SPLINE TABLE REPULSION */
1585             vfitab           = _mm_add_epi32(vfitab,ifour);
1586             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1587             F                = _mm_setzero_pd();
1588             GMX_MM_TRANSPOSE2_PD(Y,F);
1589             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1590             H                = _mm_setzero_pd();
1591             GMX_MM_TRANSPOSE2_PD(G,H);
1592             Heps             = _mm_mul_pd(vfeps,H);
1593             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1594             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1595             fvdw12           = _mm_mul_pd(c12_00,FF);
1596             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1597
1598             fscal            = _mm_add_pd(felec,fvdw);
1599
1600             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1601
1602             /* Calculate temporary vectorial force */
1603             tx               = _mm_mul_pd(fscal,dx00);
1604             ty               = _mm_mul_pd(fscal,dy00);
1605             tz               = _mm_mul_pd(fscal,dz00);
1606
1607             /* Update vectorial force */
1608             fix0             = _mm_add_pd(fix0,tx);
1609             fiy0             = _mm_add_pd(fiy0,ty);
1610             fiz0             = _mm_add_pd(fiz0,tz);
1611
1612             fjx0             = _mm_add_pd(fjx0,tx);
1613             fjy0             = _mm_add_pd(fjy0,ty);
1614             fjz0             = _mm_add_pd(fjz0,tz);
1615
1616             /**************************
1617              * CALCULATE INTERACTIONS *
1618              **************************/
1619
1620             /* COULOMB ELECTROSTATICS */
1621             velec            = _mm_mul_pd(qq01,rinv01);
1622             felec            = _mm_mul_pd(velec,rinvsq01);
1623
1624             fscal            = felec;
1625
1626             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1627
1628             /* Calculate temporary vectorial force */
1629             tx               = _mm_mul_pd(fscal,dx01);
1630             ty               = _mm_mul_pd(fscal,dy01);
1631             tz               = _mm_mul_pd(fscal,dz01);
1632
1633             /* Update vectorial force */
1634             fix0             = _mm_add_pd(fix0,tx);
1635             fiy0             = _mm_add_pd(fiy0,ty);
1636             fiz0             = _mm_add_pd(fiz0,tz);
1637
1638             fjx1             = _mm_add_pd(fjx1,tx);
1639             fjy1             = _mm_add_pd(fjy1,ty);
1640             fjz1             = _mm_add_pd(fjz1,tz);
1641
1642             /**************************
1643              * CALCULATE INTERACTIONS *
1644              **************************/
1645
1646             /* COULOMB ELECTROSTATICS */
1647             velec            = _mm_mul_pd(qq02,rinv02);
1648             felec            = _mm_mul_pd(velec,rinvsq02);
1649
1650             fscal            = felec;
1651
1652             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1653
1654             /* Calculate temporary vectorial force */
1655             tx               = _mm_mul_pd(fscal,dx02);
1656             ty               = _mm_mul_pd(fscal,dy02);
1657             tz               = _mm_mul_pd(fscal,dz02);
1658
1659             /* Update vectorial force */
1660             fix0             = _mm_add_pd(fix0,tx);
1661             fiy0             = _mm_add_pd(fiy0,ty);
1662             fiz0             = _mm_add_pd(fiz0,tz);
1663
1664             fjx2             = _mm_add_pd(fjx2,tx);
1665             fjy2             = _mm_add_pd(fjy2,ty);
1666             fjz2             = _mm_add_pd(fjz2,tz);
1667
1668             /**************************
1669              * CALCULATE INTERACTIONS *
1670              **************************/
1671
1672             /* COULOMB ELECTROSTATICS */
1673             velec            = _mm_mul_pd(qq10,rinv10);
1674             felec            = _mm_mul_pd(velec,rinvsq10);
1675
1676             fscal            = felec;
1677
1678             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1679
1680             /* Calculate temporary vectorial force */
1681             tx               = _mm_mul_pd(fscal,dx10);
1682             ty               = _mm_mul_pd(fscal,dy10);
1683             tz               = _mm_mul_pd(fscal,dz10);
1684
1685             /* Update vectorial force */
1686             fix1             = _mm_add_pd(fix1,tx);
1687             fiy1             = _mm_add_pd(fiy1,ty);
1688             fiz1             = _mm_add_pd(fiz1,tz);
1689
1690             fjx0             = _mm_add_pd(fjx0,tx);
1691             fjy0             = _mm_add_pd(fjy0,ty);
1692             fjz0             = _mm_add_pd(fjz0,tz);
1693
1694             /**************************
1695              * CALCULATE INTERACTIONS *
1696              **************************/
1697
1698             /* COULOMB ELECTROSTATICS */
1699             velec            = _mm_mul_pd(qq11,rinv11);
1700             felec            = _mm_mul_pd(velec,rinvsq11);
1701
1702             fscal            = felec;
1703
1704             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1705
1706             /* Calculate temporary vectorial force */
1707             tx               = _mm_mul_pd(fscal,dx11);
1708             ty               = _mm_mul_pd(fscal,dy11);
1709             tz               = _mm_mul_pd(fscal,dz11);
1710
1711             /* Update vectorial force */
1712             fix1             = _mm_add_pd(fix1,tx);
1713             fiy1             = _mm_add_pd(fiy1,ty);
1714             fiz1             = _mm_add_pd(fiz1,tz);
1715
1716             fjx1             = _mm_add_pd(fjx1,tx);
1717             fjy1             = _mm_add_pd(fjy1,ty);
1718             fjz1             = _mm_add_pd(fjz1,tz);
1719
1720             /**************************
1721              * CALCULATE INTERACTIONS *
1722              **************************/
1723
1724             /* COULOMB ELECTROSTATICS */
1725             velec            = _mm_mul_pd(qq12,rinv12);
1726             felec            = _mm_mul_pd(velec,rinvsq12);
1727
1728             fscal            = felec;
1729
1730             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1731
1732             /* Calculate temporary vectorial force */
1733             tx               = _mm_mul_pd(fscal,dx12);
1734             ty               = _mm_mul_pd(fscal,dy12);
1735             tz               = _mm_mul_pd(fscal,dz12);
1736
1737             /* Update vectorial force */
1738             fix1             = _mm_add_pd(fix1,tx);
1739             fiy1             = _mm_add_pd(fiy1,ty);
1740             fiz1             = _mm_add_pd(fiz1,tz);
1741
1742             fjx2             = _mm_add_pd(fjx2,tx);
1743             fjy2             = _mm_add_pd(fjy2,ty);
1744             fjz2             = _mm_add_pd(fjz2,tz);
1745
1746             /**************************
1747              * CALCULATE INTERACTIONS *
1748              **************************/
1749
1750             /* COULOMB ELECTROSTATICS */
1751             velec            = _mm_mul_pd(qq20,rinv20);
1752             felec            = _mm_mul_pd(velec,rinvsq20);
1753
1754             fscal            = felec;
1755
1756             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1757
1758             /* Calculate temporary vectorial force */
1759             tx               = _mm_mul_pd(fscal,dx20);
1760             ty               = _mm_mul_pd(fscal,dy20);
1761             tz               = _mm_mul_pd(fscal,dz20);
1762
1763             /* Update vectorial force */
1764             fix2             = _mm_add_pd(fix2,tx);
1765             fiy2             = _mm_add_pd(fiy2,ty);
1766             fiz2             = _mm_add_pd(fiz2,tz);
1767
1768             fjx0             = _mm_add_pd(fjx0,tx);
1769             fjy0             = _mm_add_pd(fjy0,ty);
1770             fjz0             = _mm_add_pd(fjz0,tz);
1771
1772             /**************************
1773              * CALCULATE INTERACTIONS *
1774              **************************/
1775
1776             /* COULOMB ELECTROSTATICS */
1777             velec            = _mm_mul_pd(qq21,rinv21);
1778             felec            = _mm_mul_pd(velec,rinvsq21);
1779
1780             fscal            = felec;
1781
1782             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1783
1784             /* Calculate temporary vectorial force */
1785             tx               = _mm_mul_pd(fscal,dx21);
1786             ty               = _mm_mul_pd(fscal,dy21);
1787             tz               = _mm_mul_pd(fscal,dz21);
1788
1789             /* Update vectorial force */
1790             fix2             = _mm_add_pd(fix2,tx);
1791             fiy2             = _mm_add_pd(fiy2,ty);
1792             fiz2             = _mm_add_pd(fiz2,tz);
1793
1794             fjx1             = _mm_add_pd(fjx1,tx);
1795             fjy1             = _mm_add_pd(fjy1,ty);
1796             fjz1             = _mm_add_pd(fjz1,tz);
1797
1798             /**************************
1799              * CALCULATE INTERACTIONS *
1800              **************************/
1801
1802             /* COULOMB ELECTROSTATICS */
1803             velec            = _mm_mul_pd(qq22,rinv22);
1804             felec            = _mm_mul_pd(velec,rinvsq22);
1805
1806             fscal            = felec;
1807
1808             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1809
1810             /* Calculate temporary vectorial force */
1811             tx               = _mm_mul_pd(fscal,dx22);
1812             ty               = _mm_mul_pd(fscal,dy22);
1813             tz               = _mm_mul_pd(fscal,dz22);
1814
1815             /* Update vectorial force */
1816             fix2             = _mm_add_pd(fix2,tx);
1817             fiy2             = _mm_add_pd(fiy2,ty);
1818             fiz2             = _mm_add_pd(fiz2,tz);
1819
1820             fjx2             = _mm_add_pd(fjx2,tx);
1821             fjy2             = _mm_add_pd(fjy2,ty);
1822             fjz2             = _mm_add_pd(fjz2,tz);
1823
1824             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1825
1826             /* Inner loop uses 270 flops */
1827         }
1828
1829         /* End of innermost loop */
1830
1831         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1832                                               f+i_coord_offset,fshift+i_shift_offset);
1833
1834         /* Increment number of inner iterations */
1835         inneriter                  += j_index_end - j_index_start;
1836
1837         /* Outer loop uses 18 flops */
1838     }
1839
1840     /* Increment number of outer iterations */
1841     outeriter        += nri;
1842
1843     /* Update outer/inner flops */
1844
1845     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*270);
1846 }