b498e5d613566ea5c01dc6541e29133b53cac9a9
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_sse2_double / nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_sse2_double.c
1 /*
2  * Note: this file was generated by the Gromacs sse2_double kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_sse2_double.h"
34 #include "kernelutil_x86_sse2_double.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse2_double
38  * Electrostatics interaction: Ewald
39  * VdW interaction:            LennardJones
40  * Geometry:                   Water3-Water3
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse2_double
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB;
61     int              j_coord_offsetA,j_coord_offsetB;
62     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
63     real             rcutoff_scalar;
64     real             *shiftvec,*fshift,*x,*f;
65     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
66     int              vdwioffset0;
67     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
68     int              vdwioffset1;
69     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
70     int              vdwioffset2;
71     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72     int              vdwjidx0A,vdwjidx0B;
73     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74     int              vdwjidx1A,vdwjidx1B;
75     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
76     int              vdwjidx2A,vdwjidx2B;
77     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
78     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
79     __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
80     __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
81     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
83     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
84     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
85     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
86     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
87     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
88     real             *charge;
89     int              nvdwtype;
90     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
91     int              *vdwtype;
92     real             *vdwparam;
93     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
94     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
95     __m128i          ewitab;
96     __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
97     real             *ewtab;
98     __m128d          dummy_mask,cutoff_mask;
99     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
100     __m128d          one     = _mm_set1_pd(1.0);
101     __m128d          two     = _mm_set1_pd(2.0);
102     x                = xx[0];
103     f                = ff[0];
104
105     nri              = nlist->nri;
106     iinr             = nlist->iinr;
107     jindex           = nlist->jindex;
108     jjnr             = nlist->jjnr;
109     shiftidx         = nlist->shift;
110     gid              = nlist->gid;
111     shiftvec         = fr->shift_vec[0];
112     fshift           = fr->fshift[0];
113     facel            = _mm_set1_pd(fr->epsfac);
114     charge           = mdatoms->chargeA;
115     nvdwtype         = fr->ntype;
116     vdwparam         = fr->nbfp;
117     vdwtype          = mdatoms->typeA;
118
119     sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
120     ewtab            = fr->ic->tabq_coul_FDV0;
121     ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
122     ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
123
124     /* Setup water-specific parameters */
125     inr              = nlist->iinr[0];
126     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
127     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
128     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
129     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
130
131     jq0              = _mm_set1_pd(charge[inr+0]);
132     jq1              = _mm_set1_pd(charge[inr+1]);
133     jq2              = _mm_set1_pd(charge[inr+2]);
134     vdwjidx0A        = 2*vdwtype[inr+0];
135     qq00             = _mm_mul_pd(iq0,jq0);
136     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
137     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
138     qq01             = _mm_mul_pd(iq0,jq1);
139     qq02             = _mm_mul_pd(iq0,jq2);
140     qq10             = _mm_mul_pd(iq1,jq0);
141     qq11             = _mm_mul_pd(iq1,jq1);
142     qq12             = _mm_mul_pd(iq1,jq2);
143     qq20             = _mm_mul_pd(iq2,jq0);
144     qq21             = _mm_mul_pd(iq2,jq1);
145     qq22             = _mm_mul_pd(iq2,jq2);
146
147     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
148     rcutoff_scalar   = fr->rcoulomb;
149     rcutoff          = _mm_set1_pd(rcutoff_scalar);
150     rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
151
152     sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
153     rvdw             = _mm_set1_pd(fr->rvdw);
154
155     /* Avoid stupid compiler warnings */
156     jnrA = jnrB = 0;
157     j_coord_offsetA = 0;
158     j_coord_offsetB = 0;
159
160     outeriter        = 0;
161     inneriter        = 0;
162
163     /* Start outer loop over neighborlists */
164     for(iidx=0; iidx<nri; iidx++)
165     {
166         /* Load shift vector for this list */
167         i_shift_offset   = DIM*shiftidx[iidx];
168
169         /* Load limits for loop over neighbors */
170         j_index_start    = jindex[iidx];
171         j_index_end      = jindex[iidx+1];
172
173         /* Get outer coordinate index */
174         inr              = iinr[iidx];
175         i_coord_offset   = DIM*inr;
176
177         /* Load i particle coords and add shift vector */
178         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
179                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
180
181         fix0             = _mm_setzero_pd();
182         fiy0             = _mm_setzero_pd();
183         fiz0             = _mm_setzero_pd();
184         fix1             = _mm_setzero_pd();
185         fiy1             = _mm_setzero_pd();
186         fiz1             = _mm_setzero_pd();
187         fix2             = _mm_setzero_pd();
188         fiy2             = _mm_setzero_pd();
189         fiz2             = _mm_setzero_pd();
190
191         /* Reset potential sums */
192         velecsum         = _mm_setzero_pd();
193         vvdwsum          = _mm_setzero_pd();
194
195         /* Start inner kernel loop */
196         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
197         {
198
199             /* Get j neighbor index, and coordinate index */
200             jnrA             = jjnr[jidx];
201             jnrB             = jjnr[jidx+1];
202             j_coord_offsetA  = DIM*jnrA;
203             j_coord_offsetB  = DIM*jnrB;
204             
205             /* load j atom coordinates */
206             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
207                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
208             
209             /* Calculate displacement vector */
210             dx00             = _mm_sub_pd(ix0,jx0);
211             dy00             = _mm_sub_pd(iy0,jy0);
212             dz00             = _mm_sub_pd(iz0,jz0);
213             dx01             = _mm_sub_pd(ix0,jx1);
214             dy01             = _mm_sub_pd(iy0,jy1);
215             dz01             = _mm_sub_pd(iz0,jz1);
216             dx02             = _mm_sub_pd(ix0,jx2);
217             dy02             = _mm_sub_pd(iy0,jy2);
218             dz02             = _mm_sub_pd(iz0,jz2);
219             dx10             = _mm_sub_pd(ix1,jx0);
220             dy10             = _mm_sub_pd(iy1,jy0);
221             dz10             = _mm_sub_pd(iz1,jz0);
222             dx11             = _mm_sub_pd(ix1,jx1);
223             dy11             = _mm_sub_pd(iy1,jy1);
224             dz11             = _mm_sub_pd(iz1,jz1);
225             dx12             = _mm_sub_pd(ix1,jx2);
226             dy12             = _mm_sub_pd(iy1,jy2);
227             dz12             = _mm_sub_pd(iz1,jz2);
228             dx20             = _mm_sub_pd(ix2,jx0);
229             dy20             = _mm_sub_pd(iy2,jy0);
230             dz20             = _mm_sub_pd(iz2,jz0);
231             dx21             = _mm_sub_pd(ix2,jx1);
232             dy21             = _mm_sub_pd(iy2,jy1);
233             dz21             = _mm_sub_pd(iz2,jz1);
234             dx22             = _mm_sub_pd(ix2,jx2);
235             dy22             = _mm_sub_pd(iy2,jy2);
236             dz22             = _mm_sub_pd(iz2,jz2);
237
238             /* Calculate squared distance and things based on it */
239             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
240             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
241             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
242             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
243             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
244             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
245             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
246             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
247             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
248
249             rinv00           = gmx_mm_invsqrt_pd(rsq00);
250             rinv01           = gmx_mm_invsqrt_pd(rsq01);
251             rinv02           = gmx_mm_invsqrt_pd(rsq02);
252             rinv10           = gmx_mm_invsqrt_pd(rsq10);
253             rinv11           = gmx_mm_invsqrt_pd(rsq11);
254             rinv12           = gmx_mm_invsqrt_pd(rsq12);
255             rinv20           = gmx_mm_invsqrt_pd(rsq20);
256             rinv21           = gmx_mm_invsqrt_pd(rsq21);
257             rinv22           = gmx_mm_invsqrt_pd(rsq22);
258
259             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
260             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
261             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
262             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
263             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
264             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
265             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
266             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
267             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
268
269             fjx0             = _mm_setzero_pd();
270             fjy0             = _mm_setzero_pd();
271             fjz0             = _mm_setzero_pd();
272             fjx1             = _mm_setzero_pd();
273             fjy1             = _mm_setzero_pd();
274             fjz1             = _mm_setzero_pd();
275             fjx2             = _mm_setzero_pd();
276             fjy2             = _mm_setzero_pd();
277             fjz2             = _mm_setzero_pd();
278
279             /**************************
280              * CALCULATE INTERACTIONS *
281              **************************/
282
283             if (gmx_mm_any_lt(rsq00,rcutoff2))
284             {
285
286             r00              = _mm_mul_pd(rsq00,rinv00);
287
288             /* EWALD ELECTROSTATICS */
289
290             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
291             ewrt             = _mm_mul_pd(r00,ewtabscale);
292             ewitab           = _mm_cvttpd_epi32(ewrt);
293             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
294             ewitab           = _mm_slli_epi32(ewitab,2);
295             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
296             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
297             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
298             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
299             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
300             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
301             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
302             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
303             velec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_sub_pd(rinv00,sh_ewald),velec));
304             felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
305
306             /* LENNARD-JONES DISPERSION/REPULSION */
307
308             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
309             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
310             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
311             vvdw             = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
312                                           _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
313             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
314
315             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
316
317             /* Update potential sum for this i atom from the interaction with this j atom. */
318             velec            = _mm_and_pd(velec,cutoff_mask);
319             velecsum         = _mm_add_pd(velecsum,velec);
320             vvdw             = _mm_and_pd(vvdw,cutoff_mask);
321             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
322
323             fscal            = _mm_add_pd(felec,fvdw);
324
325             fscal            = _mm_and_pd(fscal,cutoff_mask);
326
327             /* Calculate temporary vectorial force */
328             tx               = _mm_mul_pd(fscal,dx00);
329             ty               = _mm_mul_pd(fscal,dy00);
330             tz               = _mm_mul_pd(fscal,dz00);
331
332             /* Update vectorial force */
333             fix0             = _mm_add_pd(fix0,tx);
334             fiy0             = _mm_add_pd(fiy0,ty);
335             fiz0             = _mm_add_pd(fiz0,tz);
336
337             fjx0             = _mm_add_pd(fjx0,tx);
338             fjy0             = _mm_add_pd(fjy0,ty);
339             fjz0             = _mm_add_pd(fjz0,tz);
340
341             }
342
343             /**************************
344              * CALCULATE INTERACTIONS *
345              **************************/
346
347             if (gmx_mm_any_lt(rsq01,rcutoff2))
348             {
349
350             r01              = _mm_mul_pd(rsq01,rinv01);
351
352             /* EWALD ELECTROSTATICS */
353
354             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
355             ewrt             = _mm_mul_pd(r01,ewtabscale);
356             ewitab           = _mm_cvttpd_epi32(ewrt);
357             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
358             ewitab           = _mm_slli_epi32(ewitab,2);
359             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
360             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
361             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
362             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
363             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
364             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
365             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
366             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
367             velec            = _mm_mul_pd(qq01,_mm_sub_pd(_mm_sub_pd(rinv01,sh_ewald),velec));
368             felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
369
370             cutoff_mask      = _mm_cmplt_pd(rsq01,rcutoff2);
371
372             /* Update potential sum for this i atom from the interaction with this j atom. */
373             velec            = _mm_and_pd(velec,cutoff_mask);
374             velecsum         = _mm_add_pd(velecsum,velec);
375
376             fscal            = felec;
377
378             fscal            = _mm_and_pd(fscal,cutoff_mask);
379
380             /* Calculate temporary vectorial force */
381             tx               = _mm_mul_pd(fscal,dx01);
382             ty               = _mm_mul_pd(fscal,dy01);
383             tz               = _mm_mul_pd(fscal,dz01);
384
385             /* Update vectorial force */
386             fix0             = _mm_add_pd(fix0,tx);
387             fiy0             = _mm_add_pd(fiy0,ty);
388             fiz0             = _mm_add_pd(fiz0,tz);
389
390             fjx1             = _mm_add_pd(fjx1,tx);
391             fjy1             = _mm_add_pd(fjy1,ty);
392             fjz1             = _mm_add_pd(fjz1,tz);
393
394             }
395
396             /**************************
397              * CALCULATE INTERACTIONS *
398              **************************/
399
400             if (gmx_mm_any_lt(rsq02,rcutoff2))
401             {
402
403             r02              = _mm_mul_pd(rsq02,rinv02);
404
405             /* EWALD ELECTROSTATICS */
406
407             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
408             ewrt             = _mm_mul_pd(r02,ewtabscale);
409             ewitab           = _mm_cvttpd_epi32(ewrt);
410             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
411             ewitab           = _mm_slli_epi32(ewitab,2);
412             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
413             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
414             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
415             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
416             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
417             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
418             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
419             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
420             velec            = _mm_mul_pd(qq02,_mm_sub_pd(_mm_sub_pd(rinv02,sh_ewald),velec));
421             felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
422
423             cutoff_mask      = _mm_cmplt_pd(rsq02,rcutoff2);
424
425             /* Update potential sum for this i atom from the interaction with this j atom. */
426             velec            = _mm_and_pd(velec,cutoff_mask);
427             velecsum         = _mm_add_pd(velecsum,velec);
428
429             fscal            = felec;
430
431             fscal            = _mm_and_pd(fscal,cutoff_mask);
432
433             /* Calculate temporary vectorial force */
434             tx               = _mm_mul_pd(fscal,dx02);
435             ty               = _mm_mul_pd(fscal,dy02);
436             tz               = _mm_mul_pd(fscal,dz02);
437
438             /* Update vectorial force */
439             fix0             = _mm_add_pd(fix0,tx);
440             fiy0             = _mm_add_pd(fiy0,ty);
441             fiz0             = _mm_add_pd(fiz0,tz);
442
443             fjx2             = _mm_add_pd(fjx2,tx);
444             fjy2             = _mm_add_pd(fjy2,ty);
445             fjz2             = _mm_add_pd(fjz2,tz);
446
447             }
448
449             /**************************
450              * CALCULATE INTERACTIONS *
451              **************************/
452
453             if (gmx_mm_any_lt(rsq10,rcutoff2))
454             {
455
456             r10              = _mm_mul_pd(rsq10,rinv10);
457
458             /* EWALD ELECTROSTATICS */
459
460             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
461             ewrt             = _mm_mul_pd(r10,ewtabscale);
462             ewitab           = _mm_cvttpd_epi32(ewrt);
463             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
464             ewitab           = _mm_slli_epi32(ewitab,2);
465             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
466             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
467             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
468             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
469             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
470             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
471             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
472             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
473             velec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_sub_pd(rinv10,sh_ewald),velec));
474             felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
475
476             cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
477
478             /* Update potential sum for this i atom from the interaction with this j atom. */
479             velec            = _mm_and_pd(velec,cutoff_mask);
480             velecsum         = _mm_add_pd(velecsum,velec);
481
482             fscal            = felec;
483
484             fscal            = _mm_and_pd(fscal,cutoff_mask);
485
486             /* Calculate temporary vectorial force */
487             tx               = _mm_mul_pd(fscal,dx10);
488             ty               = _mm_mul_pd(fscal,dy10);
489             tz               = _mm_mul_pd(fscal,dz10);
490
491             /* Update vectorial force */
492             fix1             = _mm_add_pd(fix1,tx);
493             fiy1             = _mm_add_pd(fiy1,ty);
494             fiz1             = _mm_add_pd(fiz1,tz);
495
496             fjx0             = _mm_add_pd(fjx0,tx);
497             fjy0             = _mm_add_pd(fjy0,ty);
498             fjz0             = _mm_add_pd(fjz0,tz);
499
500             }
501
502             /**************************
503              * CALCULATE INTERACTIONS *
504              **************************/
505
506             if (gmx_mm_any_lt(rsq11,rcutoff2))
507             {
508
509             r11              = _mm_mul_pd(rsq11,rinv11);
510
511             /* EWALD ELECTROSTATICS */
512
513             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
514             ewrt             = _mm_mul_pd(r11,ewtabscale);
515             ewitab           = _mm_cvttpd_epi32(ewrt);
516             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
517             ewitab           = _mm_slli_epi32(ewitab,2);
518             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
519             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
520             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
521             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
522             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
523             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
524             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
525             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
526             velec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_sub_pd(rinv11,sh_ewald),velec));
527             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
528
529             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
530
531             /* Update potential sum for this i atom from the interaction with this j atom. */
532             velec            = _mm_and_pd(velec,cutoff_mask);
533             velecsum         = _mm_add_pd(velecsum,velec);
534
535             fscal            = felec;
536
537             fscal            = _mm_and_pd(fscal,cutoff_mask);
538
539             /* Calculate temporary vectorial force */
540             tx               = _mm_mul_pd(fscal,dx11);
541             ty               = _mm_mul_pd(fscal,dy11);
542             tz               = _mm_mul_pd(fscal,dz11);
543
544             /* Update vectorial force */
545             fix1             = _mm_add_pd(fix1,tx);
546             fiy1             = _mm_add_pd(fiy1,ty);
547             fiz1             = _mm_add_pd(fiz1,tz);
548
549             fjx1             = _mm_add_pd(fjx1,tx);
550             fjy1             = _mm_add_pd(fjy1,ty);
551             fjz1             = _mm_add_pd(fjz1,tz);
552
553             }
554
555             /**************************
556              * CALCULATE INTERACTIONS *
557              **************************/
558
559             if (gmx_mm_any_lt(rsq12,rcutoff2))
560             {
561
562             r12              = _mm_mul_pd(rsq12,rinv12);
563
564             /* EWALD ELECTROSTATICS */
565
566             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
567             ewrt             = _mm_mul_pd(r12,ewtabscale);
568             ewitab           = _mm_cvttpd_epi32(ewrt);
569             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
570             ewitab           = _mm_slli_epi32(ewitab,2);
571             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
572             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
573             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
574             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
575             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
576             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
577             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
578             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
579             velec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_sub_pd(rinv12,sh_ewald),velec));
580             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
581
582             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
583
584             /* Update potential sum for this i atom from the interaction with this j atom. */
585             velec            = _mm_and_pd(velec,cutoff_mask);
586             velecsum         = _mm_add_pd(velecsum,velec);
587
588             fscal            = felec;
589
590             fscal            = _mm_and_pd(fscal,cutoff_mask);
591
592             /* Calculate temporary vectorial force */
593             tx               = _mm_mul_pd(fscal,dx12);
594             ty               = _mm_mul_pd(fscal,dy12);
595             tz               = _mm_mul_pd(fscal,dz12);
596
597             /* Update vectorial force */
598             fix1             = _mm_add_pd(fix1,tx);
599             fiy1             = _mm_add_pd(fiy1,ty);
600             fiz1             = _mm_add_pd(fiz1,tz);
601
602             fjx2             = _mm_add_pd(fjx2,tx);
603             fjy2             = _mm_add_pd(fjy2,ty);
604             fjz2             = _mm_add_pd(fjz2,tz);
605
606             }
607
608             /**************************
609              * CALCULATE INTERACTIONS *
610              **************************/
611
612             if (gmx_mm_any_lt(rsq20,rcutoff2))
613             {
614
615             r20              = _mm_mul_pd(rsq20,rinv20);
616
617             /* EWALD ELECTROSTATICS */
618
619             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
620             ewrt             = _mm_mul_pd(r20,ewtabscale);
621             ewitab           = _mm_cvttpd_epi32(ewrt);
622             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
623             ewitab           = _mm_slli_epi32(ewitab,2);
624             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
625             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
626             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
627             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
628             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
629             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
630             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
631             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
632             velec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_sub_pd(rinv20,sh_ewald),velec));
633             felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
634
635             cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
636
637             /* Update potential sum for this i atom from the interaction with this j atom. */
638             velec            = _mm_and_pd(velec,cutoff_mask);
639             velecsum         = _mm_add_pd(velecsum,velec);
640
641             fscal            = felec;
642
643             fscal            = _mm_and_pd(fscal,cutoff_mask);
644
645             /* Calculate temporary vectorial force */
646             tx               = _mm_mul_pd(fscal,dx20);
647             ty               = _mm_mul_pd(fscal,dy20);
648             tz               = _mm_mul_pd(fscal,dz20);
649
650             /* Update vectorial force */
651             fix2             = _mm_add_pd(fix2,tx);
652             fiy2             = _mm_add_pd(fiy2,ty);
653             fiz2             = _mm_add_pd(fiz2,tz);
654
655             fjx0             = _mm_add_pd(fjx0,tx);
656             fjy0             = _mm_add_pd(fjy0,ty);
657             fjz0             = _mm_add_pd(fjz0,tz);
658
659             }
660
661             /**************************
662              * CALCULATE INTERACTIONS *
663              **************************/
664
665             if (gmx_mm_any_lt(rsq21,rcutoff2))
666             {
667
668             r21              = _mm_mul_pd(rsq21,rinv21);
669
670             /* EWALD ELECTROSTATICS */
671
672             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
673             ewrt             = _mm_mul_pd(r21,ewtabscale);
674             ewitab           = _mm_cvttpd_epi32(ewrt);
675             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
676             ewitab           = _mm_slli_epi32(ewitab,2);
677             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
678             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
679             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
680             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
681             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
682             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
683             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
684             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
685             velec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_sub_pd(rinv21,sh_ewald),velec));
686             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
687
688             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
689
690             /* Update potential sum for this i atom from the interaction with this j atom. */
691             velec            = _mm_and_pd(velec,cutoff_mask);
692             velecsum         = _mm_add_pd(velecsum,velec);
693
694             fscal            = felec;
695
696             fscal            = _mm_and_pd(fscal,cutoff_mask);
697
698             /* Calculate temporary vectorial force */
699             tx               = _mm_mul_pd(fscal,dx21);
700             ty               = _mm_mul_pd(fscal,dy21);
701             tz               = _mm_mul_pd(fscal,dz21);
702
703             /* Update vectorial force */
704             fix2             = _mm_add_pd(fix2,tx);
705             fiy2             = _mm_add_pd(fiy2,ty);
706             fiz2             = _mm_add_pd(fiz2,tz);
707
708             fjx1             = _mm_add_pd(fjx1,tx);
709             fjy1             = _mm_add_pd(fjy1,ty);
710             fjz1             = _mm_add_pd(fjz1,tz);
711
712             }
713
714             /**************************
715              * CALCULATE INTERACTIONS *
716              **************************/
717
718             if (gmx_mm_any_lt(rsq22,rcutoff2))
719             {
720
721             r22              = _mm_mul_pd(rsq22,rinv22);
722
723             /* EWALD ELECTROSTATICS */
724
725             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
726             ewrt             = _mm_mul_pd(r22,ewtabscale);
727             ewitab           = _mm_cvttpd_epi32(ewrt);
728             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
729             ewitab           = _mm_slli_epi32(ewitab,2);
730             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
731             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
732             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
733             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
734             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
735             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
736             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
737             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
738             velec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_sub_pd(rinv22,sh_ewald),velec));
739             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
740
741             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
742
743             /* Update potential sum for this i atom from the interaction with this j atom. */
744             velec            = _mm_and_pd(velec,cutoff_mask);
745             velecsum         = _mm_add_pd(velecsum,velec);
746
747             fscal            = felec;
748
749             fscal            = _mm_and_pd(fscal,cutoff_mask);
750
751             /* Calculate temporary vectorial force */
752             tx               = _mm_mul_pd(fscal,dx22);
753             ty               = _mm_mul_pd(fscal,dy22);
754             tz               = _mm_mul_pd(fscal,dz22);
755
756             /* Update vectorial force */
757             fix2             = _mm_add_pd(fix2,tx);
758             fiy2             = _mm_add_pd(fiy2,ty);
759             fiz2             = _mm_add_pd(fiz2,tz);
760
761             fjx2             = _mm_add_pd(fjx2,tx);
762             fjy2             = _mm_add_pd(fjy2,ty);
763             fjz2             = _mm_add_pd(fjz2,tz);
764
765             }
766
767             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
768
769             /* Inner loop uses 432 flops */
770         }
771
772         if(jidx<j_index_end)
773         {
774
775             jnrA             = jjnr[jidx];
776             j_coord_offsetA  = DIM*jnrA;
777
778             /* load j atom coordinates */
779             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
780                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
781             
782             /* Calculate displacement vector */
783             dx00             = _mm_sub_pd(ix0,jx0);
784             dy00             = _mm_sub_pd(iy0,jy0);
785             dz00             = _mm_sub_pd(iz0,jz0);
786             dx01             = _mm_sub_pd(ix0,jx1);
787             dy01             = _mm_sub_pd(iy0,jy1);
788             dz01             = _mm_sub_pd(iz0,jz1);
789             dx02             = _mm_sub_pd(ix0,jx2);
790             dy02             = _mm_sub_pd(iy0,jy2);
791             dz02             = _mm_sub_pd(iz0,jz2);
792             dx10             = _mm_sub_pd(ix1,jx0);
793             dy10             = _mm_sub_pd(iy1,jy0);
794             dz10             = _mm_sub_pd(iz1,jz0);
795             dx11             = _mm_sub_pd(ix1,jx1);
796             dy11             = _mm_sub_pd(iy1,jy1);
797             dz11             = _mm_sub_pd(iz1,jz1);
798             dx12             = _mm_sub_pd(ix1,jx2);
799             dy12             = _mm_sub_pd(iy1,jy2);
800             dz12             = _mm_sub_pd(iz1,jz2);
801             dx20             = _mm_sub_pd(ix2,jx0);
802             dy20             = _mm_sub_pd(iy2,jy0);
803             dz20             = _mm_sub_pd(iz2,jz0);
804             dx21             = _mm_sub_pd(ix2,jx1);
805             dy21             = _mm_sub_pd(iy2,jy1);
806             dz21             = _mm_sub_pd(iz2,jz1);
807             dx22             = _mm_sub_pd(ix2,jx2);
808             dy22             = _mm_sub_pd(iy2,jy2);
809             dz22             = _mm_sub_pd(iz2,jz2);
810
811             /* Calculate squared distance and things based on it */
812             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
813             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
814             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
815             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
816             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
817             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
818             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
819             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
820             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
821
822             rinv00           = gmx_mm_invsqrt_pd(rsq00);
823             rinv01           = gmx_mm_invsqrt_pd(rsq01);
824             rinv02           = gmx_mm_invsqrt_pd(rsq02);
825             rinv10           = gmx_mm_invsqrt_pd(rsq10);
826             rinv11           = gmx_mm_invsqrt_pd(rsq11);
827             rinv12           = gmx_mm_invsqrt_pd(rsq12);
828             rinv20           = gmx_mm_invsqrt_pd(rsq20);
829             rinv21           = gmx_mm_invsqrt_pd(rsq21);
830             rinv22           = gmx_mm_invsqrt_pd(rsq22);
831
832             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
833             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
834             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
835             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
836             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
837             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
838             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
839             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
840             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
841
842             fjx0             = _mm_setzero_pd();
843             fjy0             = _mm_setzero_pd();
844             fjz0             = _mm_setzero_pd();
845             fjx1             = _mm_setzero_pd();
846             fjy1             = _mm_setzero_pd();
847             fjz1             = _mm_setzero_pd();
848             fjx2             = _mm_setzero_pd();
849             fjy2             = _mm_setzero_pd();
850             fjz2             = _mm_setzero_pd();
851
852             /**************************
853              * CALCULATE INTERACTIONS *
854              **************************/
855
856             if (gmx_mm_any_lt(rsq00,rcutoff2))
857             {
858
859             r00              = _mm_mul_pd(rsq00,rinv00);
860
861             /* EWALD ELECTROSTATICS */
862
863             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
864             ewrt             = _mm_mul_pd(r00,ewtabscale);
865             ewitab           = _mm_cvttpd_epi32(ewrt);
866             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
867             ewitab           = _mm_slli_epi32(ewitab,2);
868             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
869             ewtabD           = _mm_setzero_pd();
870             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
871             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
872             ewtabFn          = _mm_setzero_pd();
873             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
874             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
875             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
876             velec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_sub_pd(rinv00,sh_ewald),velec));
877             felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
878
879             /* LENNARD-JONES DISPERSION/REPULSION */
880
881             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
882             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
883             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
884             vvdw             = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
885                                           _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
886             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
887
888             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
889
890             /* Update potential sum for this i atom from the interaction with this j atom. */
891             velec            = _mm_and_pd(velec,cutoff_mask);
892             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
893             velecsum         = _mm_add_pd(velecsum,velec);
894             vvdw             = _mm_and_pd(vvdw,cutoff_mask);
895             vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
896             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
897
898             fscal            = _mm_add_pd(felec,fvdw);
899
900             fscal            = _mm_and_pd(fscal,cutoff_mask);
901
902             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
903
904             /* Calculate temporary vectorial force */
905             tx               = _mm_mul_pd(fscal,dx00);
906             ty               = _mm_mul_pd(fscal,dy00);
907             tz               = _mm_mul_pd(fscal,dz00);
908
909             /* Update vectorial force */
910             fix0             = _mm_add_pd(fix0,tx);
911             fiy0             = _mm_add_pd(fiy0,ty);
912             fiz0             = _mm_add_pd(fiz0,tz);
913
914             fjx0             = _mm_add_pd(fjx0,tx);
915             fjy0             = _mm_add_pd(fjy0,ty);
916             fjz0             = _mm_add_pd(fjz0,tz);
917
918             }
919
920             /**************************
921              * CALCULATE INTERACTIONS *
922              **************************/
923
924             if (gmx_mm_any_lt(rsq01,rcutoff2))
925             {
926
927             r01              = _mm_mul_pd(rsq01,rinv01);
928
929             /* EWALD ELECTROSTATICS */
930
931             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
932             ewrt             = _mm_mul_pd(r01,ewtabscale);
933             ewitab           = _mm_cvttpd_epi32(ewrt);
934             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
935             ewitab           = _mm_slli_epi32(ewitab,2);
936             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
937             ewtabD           = _mm_setzero_pd();
938             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
939             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
940             ewtabFn          = _mm_setzero_pd();
941             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
942             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
943             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
944             velec            = _mm_mul_pd(qq01,_mm_sub_pd(_mm_sub_pd(rinv01,sh_ewald),velec));
945             felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
946
947             cutoff_mask      = _mm_cmplt_pd(rsq01,rcutoff2);
948
949             /* Update potential sum for this i atom from the interaction with this j atom. */
950             velec            = _mm_and_pd(velec,cutoff_mask);
951             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
952             velecsum         = _mm_add_pd(velecsum,velec);
953
954             fscal            = felec;
955
956             fscal            = _mm_and_pd(fscal,cutoff_mask);
957
958             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
959
960             /* Calculate temporary vectorial force */
961             tx               = _mm_mul_pd(fscal,dx01);
962             ty               = _mm_mul_pd(fscal,dy01);
963             tz               = _mm_mul_pd(fscal,dz01);
964
965             /* Update vectorial force */
966             fix0             = _mm_add_pd(fix0,tx);
967             fiy0             = _mm_add_pd(fiy0,ty);
968             fiz0             = _mm_add_pd(fiz0,tz);
969
970             fjx1             = _mm_add_pd(fjx1,tx);
971             fjy1             = _mm_add_pd(fjy1,ty);
972             fjz1             = _mm_add_pd(fjz1,tz);
973
974             }
975
976             /**************************
977              * CALCULATE INTERACTIONS *
978              **************************/
979
980             if (gmx_mm_any_lt(rsq02,rcutoff2))
981             {
982
983             r02              = _mm_mul_pd(rsq02,rinv02);
984
985             /* EWALD ELECTROSTATICS */
986
987             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
988             ewrt             = _mm_mul_pd(r02,ewtabscale);
989             ewitab           = _mm_cvttpd_epi32(ewrt);
990             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
991             ewitab           = _mm_slli_epi32(ewitab,2);
992             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
993             ewtabD           = _mm_setzero_pd();
994             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
995             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
996             ewtabFn          = _mm_setzero_pd();
997             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
998             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
999             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1000             velec            = _mm_mul_pd(qq02,_mm_sub_pd(_mm_sub_pd(rinv02,sh_ewald),velec));
1001             felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
1002
1003             cutoff_mask      = _mm_cmplt_pd(rsq02,rcutoff2);
1004
1005             /* Update potential sum for this i atom from the interaction with this j atom. */
1006             velec            = _mm_and_pd(velec,cutoff_mask);
1007             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1008             velecsum         = _mm_add_pd(velecsum,velec);
1009
1010             fscal            = felec;
1011
1012             fscal            = _mm_and_pd(fscal,cutoff_mask);
1013
1014             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1015
1016             /* Calculate temporary vectorial force */
1017             tx               = _mm_mul_pd(fscal,dx02);
1018             ty               = _mm_mul_pd(fscal,dy02);
1019             tz               = _mm_mul_pd(fscal,dz02);
1020
1021             /* Update vectorial force */
1022             fix0             = _mm_add_pd(fix0,tx);
1023             fiy0             = _mm_add_pd(fiy0,ty);
1024             fiz0             = _mm_add_pd(fiz0,tz);
1025
1026             fjx2             = _mm_add_pd(fjx2,tx);
1027             fjy2             = _mm_add_pd(fjy2,ty);
1028             fjz2             = _mm_add_pd(fjz2,tz);
1029
1030             }
1031
1032             /**************************
1033              * CALCULATE INTERACTIONS *
1034              **************************/
1035
1036             if (gmx_mm_any_lt(rsq10,rcutoff2))
1037             {
1038
1039             r10              = _mm_mul_pd(rsq10,rinv10);
1040
1041             /* EWALD ELECTROSTATICS */
1042
1043             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1044             ewrt             = _mm_mul_pd(r10,ewtabscale);
1045             ewitab           = _mm_cvttpd_epi32(ewrt);
1046             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1047             ewitab           = _mm_slli_epi32(ewitab,2);
1048             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1049             ewtabD           = _mm_setzero_pd();
1050             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1051             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1052             ewtabFn          = _mm_setzero_pd();
1053             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1054             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1055             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1056             velec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_sub_pd(rinv10,sh_ewald),velec));
1057             felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1058
1059             cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
1060
1061             /* Update potential sum for this i atom from the interaction with this j atom. */
1062             velec            = _mm_and_pd(velec,cutoff_mask);
1063             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1064             velecsum         = _mm_add_pd(velecsum,velec);
1065
1066             fscal            = felec;
1067
1068             fscal            = _mm_and_pd(fscal,cutoff_mask);
1069
1070             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1071
1072             /* Calculate temporary vectorial force */
1073             tx               = _mm_mul_pd(fscal,dx10);
1074             ty               = _mm_mul_pd(fscal,dy10);
1075             tz               = _mm_mul_pd(fscal,dz10);
1076
1077             /* Update vectorial force */
1078             fix1             = _mm_add_pd(fix1,tx);
1079             fiy1             = _mm_add_pd(fiy1,ty);
1080             fiz1             = _mm_add_pd(fiz1,tz);
1081
1082             fjx0             = _mm_add_pd(fjx0,tx);
1083             fjy0             = _mm_add_pd(fjy0,ty);
1084             fjz0             = _mm_add_pd(fjz0,tz);
1085
1086             }
1087
1088             /**************************
1089              * CALCULATE INTERACTIONS *
1090              **************************/
1091
1092             if (gmx_mm_any_lt(rsq11,rcutoff2))
1093             {
1094
1095             r11              = _mm_mul_pd(rsq11,rinv11);
1096
1097             /* EWALD ELECTROSTATICS */
1098
1099             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1100             ewrt             = _mm_mul_pd(r11,ewtabscale);
1101             ewitab           = _mm_cvttpd_epi32(ewrt);
1102             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1103             ewitab           = _mm_slli_epi32(ewitab,2);
1104             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1105             ewtabD           = _mm_setzero_pd();
1106             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1107             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1108             ewtabFn          = _mm_setzero_pd();
1109             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1110             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1111             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1112             velec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_sub_pd(rinv11,sh_ewald),velec));
1113             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1114
1115             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
1116
1117             /* Update potential sum for this i atom from the interaction with this j atom. */
1118             velec            = _mm_and_pd(velec,cutoff_mask);
1119             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1120             velecsum         = _mm_add_pd(velecsum,velec);
1121
1122             fscal            = felec;
1123
1124             fscal            = _mm_and_pd(fscal,cutoff_mask);
1125
1126             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1127
1128             /* Calculate temporary vectorial force */
1129             tx               = _mm_mul_pd(fscal,dx11);
1130             ty               = _mm_mul_pd(fscal,dy11);
1131             tz               = _mm_mul_pd(fscal,dz11);
1132
1133             /* Update vectorial force */
1134             fix1             = _mm_add_pd(fix1,tx);
1135             fiy1             = _mm_add_pd(fiy1,ty);
1136             fiz1             = _mm_add_pd(fiz1,tz);
1137
1138             fjx1             = _mm_add_pd(fjx1,tx);
1139             fjy1             = _mm_add_pd(fjy1,ty);
1140             fjz1             = _mm_add_pd(fjz1,tz);
1141
1142             }
1143
1144             /**************************
1145              * CALCULATE INTERACTIONS *
1146              **************************/
1147
1148             if (gmx_mm_any_lt(rsq12,rcutoff2))
1149             {
1150
1151             r12              = _mm_mul_pd(rsq12,rinv12);
1152
1153             /* EWALD ELECTROSTATICS */
1154
1155             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1156             ewrt             = _mm_mul_pd(r12,ewtabscale);
1157             ewitab           = _mm_cvttpd_epi32(ewrt);
1158             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1159             ewitab           = _mm_slli_epi32(ewitab,2);
1160             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1161             ewtabD           = _mm_setzero_pd();
1162             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1163             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1164             ewtabFn          = _mm_setzero_pd();
1165             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1166             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1167             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1168             velec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_sub_pd(rinv12,sh_ewald),velec));
1169             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1170
1171             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
1172
1173             /* Update potential sum for this i atom from the interaction with this j atom. */
1174             velec            = _mm_and_pd(velec,cutoff_mask);
1175             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1176             velecsum         = _mm_add_pd(velecsum,velec);
1177
1178             fscal            = felec;
1179
1180             fscal            = _mm_and_pd(fscal,cutoff_mask);
1181
1182             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1183
1184             /* Calculate temporary vectorial force */
1185             tx               = _mm_mul_pd(fscal,dx12);
1186             ty               = _mm_mul_pd(fscal,dy12);
1187             tz               = _mm_mul_pd(fscal,dz12);
1188
1189             /* Update vectorial force */
1190             fix1             = _mm_add_pd(fix1,tx);
1191             fiy1             = _mm_add_pd(fiy1,ty);
1192             fiz1             = _mm_add_pd(fiz1,tz);
1193
1194             fjx2             = _mm_add_pd(fjx2,tx);
1195             fjy2             = _mm_add_pd(fjy2,ty);
1196             fjz2             = _mm_add_pd(fjz2,tz);
1197
1198             }
1199
1200             /**************************
1201              * CALCULATE INTERACTIONS *
1202              **************************/
1203
1204             if (gmx_mm_any_lt(rsq20,rcutoff2))
1205             {
1206
1207             r20              = _mm_mul_pd(rsq20,rinv20);
1208
1209             /* EWALD ELECTROSTATICS */
1210
1211             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1212             ewrt             = _mm_mul_pd(r20,ewtabscale);
1213             ewitab           = _mm_cvttpd_epi32(ewrt);
1214             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1215             ewitab           = _mm_slli_epi32(ewitab,2);
1216             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1217             ewtabD           = _mm_setzero_pd();
1218             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1219             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1220             ewtabFn          = _mm_setzero_pd();
1221             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1222             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1223             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1224             velec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_sub_pd(rinv20,sh_ewald),velec));
1225             felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1226
1227             cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
1228
1229             /* Update potential sum for this i atom from the interaction with this j atom. */
1230             velec            = _mm_and_pd(velec,cutoff_mask);
1231             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1232             velecsum         = _mm_add_pd(velecsum,velec);
1233
1234             fscal            = felec;
1235
1236             fscal            = _mm_and_pd(fscal,cutoff_mask);
1237
1238             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1239
1240             /* Calculate temporary vectorial force */
1241             tx               = _mm_mul_pd(fscal,dx20);
1242             ty               = _mm_mul_pd(fscal,dy20);
1243             tz               = _mm_mul_pd(fscal,dz20);
1244
1245             /* Update vectorial force */
1246             fix2             = _mm_add_pd(fix2,tx);
1247             fiy2             = _mm_add_pd(fiy2,ty);
1248             fiz2             = _mm_add_pd(fiz2,tz);
1249
1250             fjx0             = _mm_add_pd(fjx0,tx);
1251             fjy0             = _mm_add_pd(fjy0,ty);
1252             fjz0             = _mm_add_pd(fjz0,tz);
1253
1254             }
1255
1256             /**************************
1257              * CALCULATE INTERACTIONS *
1258              **************************/
1259
1260             if (gmx_mm_any_lt(rsq21,rcutoff2))
1261             {
1262
1263             r21              = _mm_mul_pd(rsq21,rinv21);
1264
1265             /* EWALD ELECTROSTATICS */
1266
1267             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1268             ewrt             = _mm_mul_pd(r21,ewtabscale);
1269             ewitab           = _mm_cvttpd_epi32(ewrt);
1270             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1271             ewitab           = _mm_slli_epi32(ewitab,2);
1272             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1273             ewtabD           = _mm_setzero_pd();
1274             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1275             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1276             ewtabFn          = _mm_setzero_pd();
1277             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1278             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1279             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1280             velec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_sub_pd(rinv21,sh_ewald),velec));
1281             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1282
1283             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
1284
1285             /* Update potential sum for this i atom from the interaction with this j atom. */
1286             velec            = _mm_and_pd(velec,cutoff_mask);
1287             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1288             velecsum         = _mm_add_pd(velecsum,velec);
1289
1290             fscal            = felec;
1291
1292             fscal            = _mm_and_pd(fscal,cutoff_mask);
1293
1294             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1295
1296             /* Calculate temporary vectorial force */
1297             tx               = _mm_mul_pd(fscal,dx21);
1298             ty               = _mm_mul_pd(fscal,dy21);
1299             tz               = _mm_mul_pd(fscal,dz21);
1300
1301             /* Update vectorial force */
1302             fix2             = _mm_add_pd(fix2,tx);
1303             fiy2             = _mm_add_pd(fiy2,ty);
1304             fiz2             = _mm_add_pd(fiz2,tz);
1305
1306             fjx1             = _mm_add_pd(fjx1,tx);
1307             fjy1             = _mm_add_pd(fjy1,ty);
1308             fjz1             = _mm_add_pd(fjz1,tz);
1309
1310             }
1311
1312             /**************************
1313              * CALCULATE INTERACTIONS *
1314              **************************/
1315
1316             if (gmx_mm_any_lt(rsq22,rcutoff2))
1317             {
1318
1319             r22              = _mm_mul_pd(rsq22,rinv22);
1320
1321             /* EWALD ELECTROSTATICS */
1322
1323             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1324             ewrt             = _mm_mul_pd(r22,ewtabscale);
1325             ewitab           = _mm_cvttpd_epi32(ewrt);
1326             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1327             ewitab           = _mm_slli_epi32(ewitab,2);
1328             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1329             ewtabD           = _mm_setzero_pd();
1330             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1331             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1332             ewtabFn          = _mm_setzero_pd();
1333             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1334             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1335             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1336             velec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_sub_pd(rinv22,sh_ewald),velec));
1337             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1338
1339             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
1340
1341             /* Update potential sum for this i atom from the interaction with this j atom. */
1342             velec            = _mm_and_pd(velec,cutoff_mask);
1343             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1344             velecsum         = _mm_add_pd(velecsum,velec);
1345
1346             fscal            = felec;
1347
1348             fscal            = _mm_and_pd(fscal,cutoff_mask);
1349
1350             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1351
1352             /* Calculate temporary vectorial force */
1353             tx               = _mm_mul_pd(fscal,dx22);
1354             ty               = _mm_mul_pd(fscal,dy22);
1355             tz               = _mm_mul_pd(fscal,dz22);
1356
1357             /* Update vectorial force */
1358             fix2             = _mm_add_pd(fix2,tx);
1359             fiy2             = _mm_add_pd(fiy2,ty);
1360             fiz2             = _mm_add_pd(fiz2,tz);
1361
1362             fjx2             = _mm_add_pd(fjx2,tx);
1363             fjy2             = _mm_add_pd(fjy2,ty);
1364             fjz2             = _mm_add_pd(fjz2,tz);
1365
1366             }
1367
1368             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1369
1370             /* Inner loop uses 432 flops */
1371         }
1372
1373         /* End of innermost loop */
1374
1375         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1376                                               f+i_coord_offset,fshift+i_shift_offset);
1377
1378         ggid                        = gid[iidx];
1379         /* Update potential energies */
1380         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1381         gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1382
1383         /* Increment number of inner iterations */
1384         inneriter                  += j_index_end - j_index_start;
1385
1386         /* Outer loop uses 20 flops */
1387     }
1388
1389     /* Increment number of outer iterations */
1390     outeriter        += nri;
1391
1392     /* Update outer/inner flops */
1393
1394     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*432);
1395 }
1396 /*
1397  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse2_double
1398  * Electrostatics interaction: Ewald
1399  * VdW interaction:            LennardJones
1400  * Geometry:                   Water3-Water3
1401  * Calculate force/pot:        Force
1402  */
1403 void
1404 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse2_double
1405                     (t_nblist * gmx_restrict                nlist,
1406                      rvec * gmx_restrict                    xx,
1407                      rvec * gmx_restrict                    ff,
1408                      t_forcerec * gmx_restrict              fr,
1409                      t_mdatoms * gmx_restrict               mdatoms,
1410                      nb_kernel_data_t * gmx_restrict        kernel_data,
1411                      t_nrnb * gmx_restrict                  nrnb)
1412 {
1413     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1414      * just 0 for non-waters.
1415      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1416      * jnr indices corresponding to data put in the four positions in the SIMD register.
1417      */
1418     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1419     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1420     int              jnrA,jnrB;
1421     int              j_coord_offsetA,j_coord_offsetB;
1422     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1423     real             rcutoff_scalar;
1424     real             *shiftvec,*fshift,*x,*f;
1425     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1426     int              vdwioffset0;
1427     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1428     int              vdwioffset1;
1429     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1430     int              vdwioffset2;
1431     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1432     int              vdwjidx0A,vdwjidx0B;
1433     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1434     int              vdwjidx1A,vdwjidx1B;
1435     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1436     int              vdwjidx2A,vdwjidx2B;
1437     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1438     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1439     __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1440     __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1441     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1442     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1443     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1444     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1445     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1446     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1447     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
1448     real             *charge;
1449     int              nvdwtype;
1450     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1451     int              *vdwtype;
1452     real             *vdwparam;
1453     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
1454     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
1455     __m128i          ewitab;
1456     __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1457     real             *ewtab;
1458     __m128d          dummy_mask,cutoff_mask;
1459     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1460     __m128d          one     = _mm_set1_pd(1.0);
1461     __m128d          two     = _mm_set1_pd(2.0);
1462     x                = xx[0];
1463     f                = ff[0];
1464
1465     nri              = nlist->nri;
1466     iinr             = nlist->iinr;
1467     jindex           = nlist->jindex;
1468     jjnr             = nlist->jjnr;
1469     shiftidx         = nlist->shift;
1470     gid              = nlist->gid;
1471     shiftvec         = fr->shift_vec[0];
1472     fshift           = fr->fshift[0];
1473     facel            = _mm_set1_pd(fr->epsfac);
1474     charge           = mdatoms->chargeA;
1475     nvdwtype         = fr->ntype;
1476     vdwparam         = fr->nbfp;
1477     vdwtype          = mdatoms->typeA;
1478
1479     sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
1480     ewtab            = fr->ic->tabq_coul_F;
1481     ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
1482     ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1483
1484     /* Setup water-specific parameters */
1485     inr              = nlist->iinr[0];
1486     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1487     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1488     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1489     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1490
1491     jq0              = _mm_set1_pd(charge[inr+0]);
1492     jq1              = _mm_set1_pd(charge[inr+1]);
1493     jq2              = _mm_set1_pd(charge[inr+2]);
1494     vdwjidx0A        = 2*vdwtype[inr+0];
1495     qq00             = _mm_mul_pd(iq0,jq0);
1496     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1497     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1498     qq01             = _mm_mul_pd(iq0,jq1);
1499     qq02             = _mm_mul_pd(iq0,jq2);
1500     qq10             = _mm_mul_pd(iq1,jq0);
1501     qq11             = _mm_mul_pd(iq1,jq1);
1502     qq12             = _mm_mul_pd(iq1,jq2);
1503     qq20             = _mm_mul_pd(iq2,jq0);
1504     qq21             = _mm_mul_pd(iq2,jq1);
1505     qq22             = _mm_mul_pd(iq2,jq2);
1506
1507     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1508     rcutoff_scalar   = fr->rcoulomb;
1509     rcutoff          = _mm_set1_pd(rcutoff_scalar);
1510     rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
1511
1512     sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
1513     rvdw             = _mm_set1_pd(fr->rvdw);
1514
1515     /* Avoid stupid compiler warnings */
1516     jnrA = jnrB = 0;
1517     j_coord_offsetA = 0;
1518     j_coord_offsetB = 0;
1519
1520     outeriter        = 0;
1521     inneriter        = 0;
1522
1523     /* Start outer loop over neighborlists */
1524     for(iidx=0; iidx<nri; iidx++)
1525     {
1526         /* Load shift vector for this list */
1527         i_shift_offset   = DIM*shiftidx[iidx];
1528
1529         /* Load limits for loop over neighbors */
1530         j_index_start    = jindex[iidx];
1531         j_index_end      = jindex[iidx+1];
1532
1533         /* Get outer coordinate index */
1534         inr              = iinr[iidx];
1535         i_coord_offset   = DIM*inr;
1536
1537         /* Load i particle coords and add shift vector */
1538         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1539                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1540
1541         fix0             = _mm_setzero_pd();
1542         fiy0             = _mm_setzero_pd();
1543         fiz0             = _mm_setzero_pd();
1544         fix1             = _mm_setzero_pd();
1545         fiy1             = _mm_setzero_pd();
1546         fiz1             = _mm_setzero_pd();
1547         fix2             = _mm_setzero_pd();
1548         fiy2             = _mm_setzero_pd();
1549         fiz2             = _mm_setzero_pd();
1550
1551         /* Start inner kernel loop */
1552         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1553         {
1554
1555             /* Get j neighbor index, and coordinate index */
1556             jnrA             = jjnr[jidx];
1557             jnrB             = jjnr[jidx+1];
1558             j_coord_offsetA  = DIM*jnrA;
1559             j_coord_offsetB  = DIM*jnrB;
1560             
1561             /* load j atom coordinates */
1562             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1563                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1564             
1565             /* Calculate displacement vector */
1566             dx00             = _mm_sub_pd(ix0,jx0);
1567             dy00             = _mm_sub_pd(iy0,jy0);
1568             dz00             = _mm_sub_pd(iz0,jz0);
1569             dx01             = _mm_sub_pd(ix0,jx1);
1570             dy01             = _mm_sub_pd(iy0,jy1);
1571             dz01             = _mm_sub_pd(iz0,jz1);
1572             dx02             = _mm_sub_pd(ix0,jx2);
1573             dy02             = _mm_sub_pd(iy0,jy2);
1574             dz02             = _mm_sub_pd(iz0,jz2);
1575             dx10             = _mm_sub_pd(ix1,jx0);
1576             dy10             = _mm_sub_pd(iy1,jy0);
1577             dz10             = _mm_sub_pd(iz1,jz0);
1578             dx11             = _mm_sub_pd(ix1,jx1);
1579             dy11             = _mm_sub_pd(iy1,jy1);
1580             dz11             = _mm_sub_pd(iz1,jz1);
1581             dx12             = _mm_sub_pd(ix1,jx2);
1582             dy12             = _mm_sub_pd(iy1,jy2);
1583             dz12             = _mm_sub_pd(iz1,jz2);
1584             dx20             = _mm_sub_pd(ix2,jx0);
1585             dy20             = _mm_sub_pd(iy2,jy0);
1586             dz20             = _mm_sub_pd(iz2,jz0);
1587             dx21             = _mm_sub_pd(ix2,jx1);
1588             dy21             = _mm_sub_pd(iy2,jy1);
1589             dz21             = _mm_sub_pd(iz2,jz1);
1590             dx22             = _mm_sub_pd(ix2,jx2);
1591             dy22             = _mm_sub_pd(iy2,jy2);
1592             dz22             = _mm_sub_pd(iz2,jz2);
1593
1594             /* Calculate squared distance and things based on it */
1595             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1596             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1597             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1598             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1599             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1600             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1601             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1602             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1603             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1604
1605             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1606             rinv01           = gmx_mm_invsqrt_pd(rsq01);
1607             rinv02           = gmx_mm_invsqrt_pd(rsq02);
1608             rinv10           = gmx_mm_invsqrt_pd(rsq10);
1609             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1610             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1611             rinv20           = gmx_mm_invsqrt_pd(rsq20);
1612             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1613             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1614
1615             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
1616             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
1617             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
1618             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
1619             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1620             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1621             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
1622             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1623             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1624
1625             fjx0             = _mm_setzero_pd();
1626             fjy0             = _mm_setzero_pd();
1627             fjz0             = _mm_setzero_pd();
1628             fjx1             = _mm_setzero_pd();
1629             fjy1             = _mm_setzero_pd();
1630             fjz1             = _mm_setzero_pd();
1631             fjx2             = _mm_setzero_pd();
1632             fjy2             = _mm_setzero_pd();
1633             fjz2             = _mm_setzero_pd();
1634
1635             /**************************
1636              * CALCULATE INTERACTIONS *
1637              **************************/
1638
1639             if (gmx_mm_any_lt(rsq00,rcutoff2))
1640             {
1641
1642             r00              = _mm_mul_pd(rsq00,rinv00);
1643
1644             /* EWALD ELECTROSTATICS */
1645
1646             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1647             ewrt             = _mm_mul_pd(r00,ewtabscale);
1648             ewitab           = _mm_cvttpd_epi32(ewrt);
1649             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1650             gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1651                                          &ewtabF,&ewtabFn);
1652             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1653             felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
1654
1655             /* LENNARD-JONES DISPERSION/REPULSION */
1656
1657             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1658             fvdw             = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1659
1660             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
1661
1662             fscal            = _mm_add_pd(felec,fvdw);
1663
1664             fscal            = _mm_and_pd(fscal,cutoff_mask);
1665
1666             /* Calculate temporary vectorial force */
1667             tx               = _mm_mul_pd(fscal,dx00);
1668             ty               = _mm_mul_pd(fscal,dy00);
1669             tz               = _mm_mul_pd(fscal,dz00);
1670
1671             /* Update vectorial force */
1672             fix0             = _mm_add_pd(fix0,tx);
1673             fiy0             = _mm_add_pd(fiy0,ty);
1674             fiz0             = _mm_add_pd(fiz0,tz);
1675
1676             fjx0             = _mm_add_pd(fjx0,tx);
1677             fjy0             = _mm_add_pd(fjy0,ty);
1678             fjz0             = _mm_add_pd(fjz0,tz);
1679
1680             }
1681
1682             /**************************
1683              * CALCULATE INTERACTIONS *
1684              **************************/
1685
1686             if (gmx_mm_any_lt(rsq01,rcutoff2))
1687             {
1688
1689             r01              = _mm_mul_pd(rsq01,rinv01);
1690
1691             /* EWALD ELECTROSTATICS */
1692
1693             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1694             ewrt             = _mm_mul_pd(r01,ewtabscale);
1695             ewitab           = _mm_cvttpd_epi32(ewrt);
1696             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1697             gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1698                                          &ewtabF,&ewtabFn);
1699             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1700             felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
1701
1702             cutoff_mask      = _mm_cmplt_pd(rsq01,rcutoff2);
1703
1704             fscal            = felec;
1705
1706             fscal            = _mm_and_pd(fscal,cutoff_mask);
1707
1708             /* Calculate temporary vectorial force */
1709             tx               = _mm_mul_pd(fscal,dx01);
1710             ty               = _mm_mul_pd(fscal,dy01);
1711             tz               = _mm_mul_pd(fscal,dz01);
1712
1713             /* Update vectorial force */
1714             fix0             = _mm_add_pd(fix0,tx);
1715             fiy0             = _mm_add_pd(fiy0,ty);
1716             fiz0             = _mm_add_pd(fiz0,tz);
1717
1718             fjx1             = _mm_add_pd(fjx1,tx);
1719             fjy1             = _mm_add_pd(fjy1,ty);
1720             fjz1             = _mm_add_pd(fjz1,tz);
1721
1722             }
1723
1724             /**************************
1725              * CALCULATE INTERACTIONS *
1726              **************************/
1727
1728             if (gmx_mm_any_lt(rsq02,rcutoff2))
1729             {
1730
1731             r02              = _mm_mul_pd(rsq02,rinv02);
1732
1733             /* EWALD ELECTROSTATICS */
1734
1735             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1736             ewrt             = _mm_mul_pd(r02,ewtabscale);
1737             ewitab           = _mm_cvttpd_epi32(ewrt);
1738             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1739             gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1740                                          &ewtabF,&ewtabFn);
1741             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1742             felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
1743
1744             cutoff_mask      = _mm_cmplt_pd(rsq02,rcutoff2);
1745
1746             fscal            = felec;
1747
1748             fscal            = _mm_and_pd(fscal,cutoff_mask);
1749
1750             /* Calculate temporary vectorial force */
1751             tx               = _mm_mul_pd(fscal,dx02);
1752             ty               = _mm_mul_pd(fscal,dy02);
1753             tz               = _mm_mul_pd(fscal,dz02);
1754
1755             /* Update vectorial force */
1756             fix0             = _mm_add_pd(fix0,tx);
1757             fiy0             = _mm_add_pd(fiy0,ty);
1758             fiz0             = _mm_add_pd(fiz0,tz);
1759
1760             fjx2             = _mm_add_pd(fjx2,tx);
1761             fjy2             = _mm_add_pd(fjy2,ty);
1762             fjz2             = _mm_add_pd(fjz2,tz);
1763
1764             }
1765
1766             /**************************
1767              * CALCULATE INTERACTIONS *
1768              **************************/
1769
1770             if (gmx_mm_any_lt(rsq10,rcutoff2))
1771             {
1772
1773             r10              = _mm_mul_pd(rsq10,rinv10);
1774
1775             /* EWALD ELECTROSTATICS */
1776
1777             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1778             ewrt             = _mm_mul_pd(r10,ewtabscale);
1779             ewitab           = _mm_cvttpd_epi32(ewrt);
1780             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1781             gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1782                                          &ewtabF,&ewtabFn);
1783             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1784             felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1785
1786             cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
1787
1788             fscal            = felec;
1789
1790             fscal            = _mm_and_pd(fscal,cutoff_mask);
1791
1792             /* Calculate temporary vectorial force */
1793             tx               = _mm_mul_pd(fscal,dx10);
1794             ty               = _mm_mul_pd(fscal,dy10);
1795             tz               = _mm_mul_pd(fscal,dz10);
1796
1797             /* Update vectorial force */
1798             fix1             = _mm_add_pd(fix1,tx);
1799             fiy1             = _mm_add_pd(fiy1,ty);
1800             fiz1             = _mm_add_pd(fiz1,tz);
1801
1802             fjx0             = _mm_add_pd(fjx0,tx);
1803             fjy0             = _mm_add_pd(fjy0,ty);
1804             fjz0             = _mm_add_pd(fjz0,tz);
1805
1806             }
1807
1808             /**************************
1809              * CALCULATE INTERACTIONS *
1810              **************************/
1811
1812             if (gmx_mm_any_lt(rsq11,rcutoff2))
1813             {
1814
1815             r11              = _mm_mul_pd(rsq11,rinv11);
1816
1817             /* EWALD ELECTROSTATICS */
1818
1819             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1820             ewrt             = _mm_mul_pd(r11,ewtabscale);
1821             ewitab           = _mm_cvttpd_epi32(ewrt);
1822             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1823             gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1824                                          &ewtabF,&ewtabFn);
1825             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1826             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1827
1828             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
1829
1830             fscal            = felec;
1831
1832             fscal            = _mm_and_pd(fscal,cutoff_mask);
1833
1834             /* Calculate temporary vectorial force */
1835             tx               = _mm_mul_pd(fscal,dx11);
1836             ty               = _mm_mul_pd(fscal,dy11);
1837             tz               = _mm_mul_pd(fscal,dz11);
1838
1839             /* Update vectorial force */
1840             fix1             = _mm_add_pd(fix1,tx);
1841             fiy1             = _mm_add_pd(fiy1,ty);
1842             fiz1             = _mm_add_pd(fiz1,tz);
1843
1844             fjx1             = _mm_add_pd(fjx1,tx);
1845             fjy1             = _mm_add_pd(fjy1,ty);
1846             fjz1             = _mm_add_pd(fjz1,tz);
1847
1848             }
1849
1850             /**************************
1851              * CALCULATE INTERACTIONS *
1852              **************************/
1853
1854             if (gmx_mm_any_lt(rsq12,rcutoff2))
1855             {
1856
1857             r12              = _mm_mul_pd(rsq12,rinv12);
1858
1859             /* EWALD ELECTROSTATICS */
1860
1861             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1862             ewrt             = _mm_mul_pd(r12,ewtabscale);
1863             ewitab           = _mm_cvttpd_epi32(ewrt);
1864             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1865             gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1866                                          &ewtabF,&ewtabFn);
1867             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1868             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1869
1870             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
1871
1872             fscal            = felec;
1873
1874             fscal            = _mm_and_pd(fscal,cutoff_mask);
1875
1876             /* Calculate temporary vectorial force */
1877             tx               = _mm_mul_pd(fscal,dx12);
1878             ty               = _mm_mul_pd(fscal,dy12);
1879             tz               = _mm_mul_pd(fscal,dz12);
1880
1881             /* Update vectorial force */
1882             fix1             = _mm_add_pd(fix1,tx);
1883             fiy1             = _mm_add_pd(fiy1,ty);
1884             fiz1             = _mm_add_pd(fiz1,tz);
1885
1886             fjx2             = _mm_add_pd(fjx2,tx);
1887             fjy2             = _mm_add_pd(fjy2,ty);
1888             fjz2             = _mm_add_pd(fjz2,tz);
1889
1890             }
1891
1892             /**************************
1893              * CALCULATE INTERACTIONS *
1894              **************************/
1895
1896             if (gmx_mm_any_lt(rsq20,rcutoff2))
1897             {
1898
1899             r20              = _mm_mul_pd(rsq20,rinv20);
1900
1901             /* EWALD ELECTROSTATICS */
1902
1903             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1904             ewrt             = _mm_mul_pd(r20,ewtabscale);
1905             ewitab           = _mm_cvttpd_epi32(ewrt);
1906             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1907             gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1908                                          &ewtabF,&ewtabFn);
1909             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1910             felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1911
1912             cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
1913
1914             fscal            = felec;
1915
1916             fscal            = _mm_and_pd(fscal,cutoff_mask);
1917
1918             /* Calculate temporary vectorial force */
1919             tx               = _mm_mul_pd(fscal,dx20);
1920             ty               = _mm_mul_pd(fscal,dy20);
1921             tz               = _mm_mul_pd(fscal,dz20);
1922
1923             /* Update vectorial force */
1924             fix2             = _mm_add_pd(fix2,tx);
1925             fiy2             = _mm_add_pd(fiy2,ty);
1926             fiz2             = _mm_add_pd(fiz2,tz);
1927
1928             fjx0             = _mm_add_pd(fjx0,tx);
1929             fjy0             = _mm_add_pd(fjy0,ty);
1930             fjz0             = _mm_add_pd(fjz0,tz);
1931
1932             }
1933
1934             /**************************
1935              * CALCULATE INTERACTIONS *
1936              **************************/
1937
1938             if (gmx_mm_any_lt(rsq21,rcutoff2))
1939             {
1940
1941             r21              = _mm_mul_pd(rsq21,rinv21);
1942
1943             /* EWALD ELECTROSTATICS */
1944
1945             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1946             ewrt             = _mm_mul_pd(r21,ewtabscale);
1947             ewitab           = _mm_cvttpd_epi32(ewrt);
1948             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1949             gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1950                                          &ewtabF,&ewtabFn);
1951             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1952             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1953
1954             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
1955
1956             fscal            = felec;
1957
1958             fscal            = _mm_and_pd(fscal,cutoff_mask);
1959
1960             /* Calculate temporary vectorial force */
1961             tx               = _mm_mul_pd(fscal,dx21);
1962             ty               = _mm_mul_pd(fscal,dy21);
1963             tz               = _mm_mul_pd(fscal,dz21);
1964
1965             /* Update vectorial force */
1966             fix2             = _mm_add_pd(fix2,tx);
1967             fiy2             = _mm_add_pd(fiy2,ty);
1968             fiz2             = _mm_add_pd(fiz2,tz);
1969
1970             fjx1             = _mm_add_pd(fjx1,tx);
1971             fjy1             = _mm_add_pd(fjy1,ty);
1972             fjz1             = _mm_add_pd(fjz1,tz);
1973
1974             }
1975
1976             /**************************
1977              * CALCULATE INTERACTIONS *
1978              **************************/
1979
1980             if (gmx_mm_any_lt(rsq22,rcutoff2))
1981             {
1982
1983             r22              = _mm_mul_pd(rsq22,rinv22);
1984
1985             /* EWALD ELECTROSTATICS */
1986
1987             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1988             ewrt             = _mm_mul_pd(r22,ewtabscale);
1989             ewitab           = _mm_cvttpd_epi32(ewrt);
1990             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1991             gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1992                                          &ewtabF,&ewtabFn);
1993             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1994             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1995
1996             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
1997
1998             fscal            = felec;
1999
2000             fscal            = _mm_and_pd(fscal,cutoff_mask);
2001
2002             /* Calculate temporary vectorial force */
2003             tx               = _mm_mul_pd(fscal,dx22);
2004             ty               = _mm_mul_pd(fscal,dy22);
2005             tz               = _mm_mul_pd(fscal,dz22);
2006
2007             /* Update vectorial force */
2008             fix2             = _mm_add_pd(fix2,tx);
2009             fiy2             = _mm_add_pd(fiy2,ty);
2010             fiz2             = _mm_add_pd(fiz2,tz);
2011
2012             fjx2             = _mm_add_pd(fjx2,tx);
2013             fjy2             = _mm_add_pd(fjy2,ty);
2014             fjz2             = _mm_add_pd(fjz2,tz);
2015
2016             }
2017
2018             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2019
2020             /* Inner loop uses 358 flops */
2021         }
2022
2023         if(jidx<j_index_end)
2024         {
2025
2026             jnrA             = jjnr[jidx];
2027             j_coord_offsetA  = DIM*jnrA;
2028
2029             /* load j atom coordinates */
2030             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
2031                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2032             
2033             /* Calculate displacement vector */
2034             dx00             = _mm_sub_pd(ix0,jx0);
2035             dy00             = _mm_sub_pd(iy0,jy0);
2036             dz00             = _mm_sub_pd(iz0,jz0);
2037             dx01             = _mm_sub_pd(ix0,jx1);
2038             dy01             = _mm_sub_pd(iy0,jy1);
2039             dz01             = _mm_sub_pd(iz0,jz1);
2040             dx02             = _mm_sub_pd(ix0,jx2);
2041             dy02             = _mm_sub_pd(iy0,jy2);
2042             dz02             = _mm_sub_pd(iz0,jz2);
2043             dx10             = _mm_sub_pd(ix1,jx0);
2044             dy10             = _mm_sub_pd(iy1,jy0);
2045             dz10             = _mm_sub_pd(iz1,jz0);
2046             dx11             = _mm_sub_pd(ix1,jx1);
2047             dy11             = _mm_sub_pd(iy1,jy1);
2048             dz11             = _mm_sub_pd(iz1,jz1);
2049             dx12             = _mm_sub_pd(ix1,jx2);
2050             dy12             = _mm_sub_pd(iy1,jy2);
2051             dz12             = _mm_sub_pd(iz1,jz2);
2052             dx20             = _mm_sub_pd(ix2,jx0);
2053             dy20             = _mm_sub_pd(iy2,jy0);
2054             dz20             = _mm_sub_pd(iz2,jz0);
2055             dx21             = _mm_sub_pd(ix2,jx1);
2056             dy21             = _mm_sub_pd(iy2,jy1);
2057             dz21             = _mm_sub_pd(iz2,jz1);
2058             dx22             = _mm_sub_pd(ix2,jx2);
2059             dy22             = _mm_sub_pd(iy2,jy2);
2060             dz22             = _mm_sub_pd(iz2,jz2);
2061
2062             /* Calculate squared distance and things based on it */
2063             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
2064             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
2065             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
2066             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
2067             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
2068             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
2069             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
2070             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
2071             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
2072
2073             rinv00           = gmx_mm_invsqrt_pd(rsq00);
2074             rinv01           = gmx_mm_invsqrt_pd(rsq01);
2075             rinv02           = gmx_mm_invsqrt_pd(rsq02);
2076             rinv10           = gmx_mm_invsqrt_pd(rsq10);
2077             rinv11           = gmx_mm_invsqrt_pd(rsq11);
2078             rinv12           = gmx_mm_invsqrt_pd(rsq12);
2079             rinv20           = gmx_mm_invsqrt_pd(rsq20);
2080             rinv21           = gmx_mm_invsqrt_pd(rsq21);
2081             rinv22           = gmx_mm_invsqrt_pd(rsq22);
2082
2083             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
2084             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
2085             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
2086             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
2087             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
2088             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
2089             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
2090             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
2091             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
2092
2093             fjx0             = _mm_setzero_pd();
2094             fjy0             = _mm_setzero_pd();
2095             fjz0             = _mm_setzero_pd();
2096             fjx1             = _mm_setzero_pd();
2097             fjy1             = _mm_setzero_pd();
2098             fjz1             = _mm_setzero_pd();
2099             fjx2             = _mm_setzero_pd();
2100             fjy2             = _mm_setzero_pd();
2101             fjz2             = _mm_setzero_pd();
2102
2103             /**************************
2104              * CALCULATE INTERACTIONS *
2105              **************************/
2106
2107             if (gmx_mm_any_lt(rsq00,rcutoff2))
2108             {
2109
2110             r00              = _mm_mul_pd(rsq00,rinv00);
2111
2112             /* EWALD ELECTROSTATICS */
2113
2114             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2115             ewrt             = _mm_mul_pd(r00,ewtabscale);
2116             ewitab           = _mm_cvttpd_epi32(ewrt);
2117             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2118             gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2119             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2120             felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
2121
2122             /* LENNARD-JONES DISPERSION/REPULSION */
2123
2124             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2125             fvdw             = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
2126
2127             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
2128
2129             fscal            = _mm_add_pd(felec,fvdw);
2130
2131             fscal            = _mm_and_pd(fscal,cutoff_mask);
2132
2133             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2134
2135             /* Calculate temporary vectorial force */
2136             tx               = _mm_mul_pd(fscal,dx00);
2137             ty               = _mm_mul_pd(fscal,dy00);
2138             tz               = _mm_mul_pd(fscal,dz00);
2139
2140             /* Update vectorial force */
2141             fix0             = _mm_add_pd(fix0,tx);
2142             fiy0             = _mm_add_pd(fiy0,ty);
2143             fiz0             = _mm_add_pd(fiz0,tz);
2144
2145             fjx0             = _mm_add_pd(fjx0,tx);
2146             fjy0             = _mm_add_pd(fjy0,ty);
2147             fjz0             = _mm_add_pd(fjz0,tz);
2148
2149             }
2150
2151             /**************************
2152              * CALCULATE INTERACTIONS *
2153              **************************/
2154
2155             if (gmx_mm_any_lt(rsq01,rcutoff2))
2156             {
2157
2158             r01              = _mm_mul_pd(rsq01,rinv01);
2159
2160             /* EWALD ELECTROSTATICS */
2161
2162             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2163             ewrt             = _mm_mul_pd(r01,ewtabscale);
2164             ewitab           = _mm_cvttpd_epi32(ewrt);
2165             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2166             gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2167             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2168             felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
2169
2170             cutoff_mask      = _mm_cmplt_pd(rsq01,rcutoff2);
2171
2172             fscal            = felec;
2173
2174             fscal            = _mm_and_pd(fscal,cutoff_mask);
2175
2176             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2177
2178             /* Calculate temporary vectorial force */
2179             tx               = _mm_mul_pd(fscal,dx01);
2180             ty               = _mm_mul_pd(fscal,dy01);
2181             tz               = _mm_mul_pd(fscal,dz01);
2182
2183             /* Update vectorial force */
2184             fix0             = _mm_add_pd(fix0,tx);
2185             fiy0             = _mm_add_pd(fiy0,ty);
2186             fiz0             = _mm_add_pd(fiz0,tz);
2187
2188             fjx1             = _mm_add_pd(fjx1,tx);
2189             fjy1             = _mm_add_pd(fjy1,ty);
2190             fjz1             = _mm_add_pd(fjz1,tz);
2191
2192             }
2193
2194             /**************************
2195              * CALCULATE INTERACTIONS *
2196              **************************/
2197
2198             if (gmx_mm_any_lt(rsq02,rcutoff2))
2199             {
2200
2201             r02              = _mm_mul_pd(rsq02,rinv02);
2202
2203             /* EWALD ELECTROSTATICS */
2204
2205             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2206             ewrt             = _mm_mul_pd(r02,ewtabscale);
2207             ewitab           = _mm_cvttpd_epi32(ewrt);
2208             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2209             gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2210             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2211             felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
2212
2213             cutoff_mask      = _mm_cmplt_pd(rsq02,rcutoff2);
2214
2215             fscal            = felec;
2216
2217             fscal            = _mm_and_pd(fscal,cutoff_mask);
2218
2219             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2220
2221             /* Calculate temporary vectorial force */
2222             tx               = _mm_mul_pd(fscal,dx02);
2223             ty               = _mm_mul_pd(fscal,dy02);
2224             tz               = _mm_mul_pd(fscal,dz02);
2225
2226             /* Update vectorial force */
2227             fix0             = _mm_add_pd(fix0,tx);
2228             fiy0             = _mm_add_pd(fiy0,ty);
2229             fiz0             = _mm_add_pd(fiz0,tz);
2230
2231             fjx2             = _mm_add_pd(fjx2,tx);
2232             fjy2             = _mm_add_pd(fjy2,ty);
2233             fjz2             = _mm_add_pd(fjz2,tz);
2234
2235             }
2236
2237             /**************************
2238              * CALCULATE INTERACTIONS *
2239              **************************/
2240
2241             if (gmx_mm_any_lt(rsq10,rcutoff2))
2242             {
2243
2244             r10              = _mm_mul_pd(rsq10,rinv10);
2245
2246             /* EWALD ELECTROSTATICS */
2247
2248             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2249             ewrt             = _mm_mul_pd(r10,ewtabscale);
2250             ewitab           = _mm_cvttpd_epi32(ewrt);
2251             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2252             gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2253             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2254             felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
2255
2256             cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
2257
2258             fscal            = felec;
2259
2260             fscal            = _mm_and_pd(fscal,cutoff_mask);
2261
2262             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2263
2264             /* Calculate temporary vectorial force */
2265             tx               = _mm_mul_pd(fscal,dx10);
2266             ty               = _mm_mul_pd(fscal,dy10);
2267             tz               = _mm_mul_pd(fscal,dz10);
2268
2269             /* Update vectorial force */
2270             fix1             = _mm_add_pd(fix1,tx);
2271             fiy1             = _mm_add_pd(fiy1,ty);
2272             fiz1             = _mm_add_pd(fiz1,tz);
2273
2274             fjx0             = _mm_add_pd(fjx0,tx);
2275             fjy0             = _mm_add_pd(fjy0,ty);
2276             fjz0             = _mm_add_pd(fjz0,tz);
2277
2278             }
2279
2280             /**************************
2281              * CALCULATE INTERACTIONS *
2282              **************************/
2283
2284             if (gmx_mm_any_lt(rsq11,rcutoff2))
2285             {
2286
2287             r11              = _mm_mul_pd(rsq11,rinv11);
2288
2289             /* EWALD ELECTROSTATICS */
2290
2291             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2292             ewrt             = _mm_mul_pd(r11,ewtabscale);
2293             ewitab           = _mm_cvttpd_epi32(ewrt);
2294             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2295             gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2296             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2297             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2298
2299             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
2300
2301             fscal            = felec;
2302
2303             fscal            = _mm_and_pd(fscal,cutoff_mask);
2304
2305             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2306
2307             /* Calculate temporary vectorial force */
2308             tx               = _mm_mul_pd(fscal,dx11);
2309             ty               = _mm_mul_pd(fscal,dy11);
2310             tz               = _mm_mul_pd(fscal,dz11);
2311
2312             /* Update vectorial force */
2313             fix1             = _mm_add_pd(fix1,tx);
2314             fiy1             = _mm_add_pd(fiy1,ty);
2315             fiz1             = _mm_add_pd(fiz1,tz);
2316
2317             fjx1             = _mm_add_pd(fjx1,tx);
2318             fjy1             = _mm_add_pd(fjy1,ty);
2319             fjz1             = _mm_add_pd(fjz1,tz);
2320
2321             }
2322
2323             /**************************
2324              * CALCULATE INTERACTIONS *
2325              **************************/
2326
2327             if (gmx_mm_any_lt(rsq12,rcutoff2))
2328             {
2329
2330             r12              = _mm_mul_pd(rsq12,rinv12);
2331
2332             /* EWALD ELECTROSTATICS */
2333
2334             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2335             ewrt             = _mm_mul_pd(r12,ewtabscale);
2336             ewitab           = _mm_cvttpd_epi32(ewrt);
2337             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2338             gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2339             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2340             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2341
2342             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
2343
2344             fscal            = felec;
2345
2346             fscal            = _mm_and_pd(fscal,cutoff_mask);
2347
2348             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2349
2350             /* Calculate temporary vectorial force */
2351             tx               = _mm_mul_pd(fscal,dx12);
2352             ty               = _mm_mul_pd(fscal,dy12);
2353             tz               = _mm_mul_pd(fscal,dz12);
2354
2355             /* Update vectorial force */
2356             fix1             = _mm_add_pd(fix1,tx);
2357             fiy1             = _mm_add_pd(fiy1,ty);
2358             fiz1             = _mm_add_pd(fiz1,tz);
2359
2360             fjx2             = _mm_add_pd(fjx2,tx);
2361             fjy2             = _mm_add_pd(fjy2,ty);
2362             fjz2             = _mm_add_pd(fjz2,tz);
2363
2364             }
2365
2366             /**************************
2367              * CALCULATE INTERACTIONS *
2368              **************************/
2369
2370             if (gmx_mm_any_lt(rsq20,rcutoff2))
2371             {
2372
2373             r20              = _mm_mul_pd(rsq20,rinv20);
2374
2375             /* EWALD ELECTROSTATICS */
2376
2377             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2378             ewrt             = _mm_mul_pd(r20,ewtabscale);
2379             ewitab           = _mm_cvttpd_epi32(ewrt);
2380             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2381             gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2382             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2383             felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
2384
2385             cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
2386
2387             fscal            = felec;
2388
2389             fscal            = _mm_and_pd(fscal,cutoff_mask);
2390
2391             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2392
2393             /* Calculate temporary vectorial force */
2394             tx               = _mm_mul_pd(fscal,dx20);
2395             ty               = _mm_mul_pd(fscal,dy20);
2396             tz               = _mm_mul_pd(fscal,dz20);
2397
2398             /* Update vectorial force */
2399             fix2             = _mm_add_pd(fix2,tx);
2400             fiy2             = _mm_add_pd(fiy2,ty);
2401             fiz2             = _mm_add_pd(fiz2,tz);
2402
2403             fjx0             = _mm_add_pd(fjx0,tx);
2404             fjy0             = _mm_add_pd(fjy0,ty);
2405             fjz0             = _mm_add_pd(fjz0,tz);
2406
2407             }
2408
2409             /**************************
2410              * CALCULATE INTERACTIONS *
2411              **************************/
2412
2413             if (gmx_mm_any_lt(rsq21,rcutoff2))
2414             {
2415
2416             r21              = _mm_mul_pd(rsq21,rinv21);
2417
2418             /* EWALD ELECTROSTATICS */
2419
2420             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2421             ewrt             = _mm_mul_pd(r21,ewtabscale);
2422             ewitab           = _mm_cvttpd_epi32(ewrt);
2423             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2424             gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2425             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2426             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2427
2428             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
2429
2430             fscal            = felec;
2431
2432             fscal            = _mm_and_pd(fscal,cutoff_mask);
2433
2434             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2435
2436             /* Calculate temporary vectorial force */
2437             tx               = _mm_mul_pd(fscal,dx21);
2438             ty               = _mm_mul_pd(fscal,dy21);
2439             tz               = _mm_mul_pd(fscal,dz21);
2440
2441             /* Update vectorial force */
2442             fix2             = _mm_add_pd(fix2,tx);
2443             fiy2             = _mm_add_pd(fiy2,ty);
2444             fiz2             = _mm_add_pd(fiz2,tz);
2445
2446             fjx1             = _mm_add_pd(fjx1,tx);
2447             fjy1             = _mm_add_pd(fjy1,ty);
2448             fjz1             = _mm_add_pd(fjz1,tz);
2449
2450             }
2451
2452             /**************************
2453              * CALCULATE INTERACTIONS *
2454              **************************/
2455
2456             if (gmx_mm_any_lt(rsq22,rcutoff2))
2457             {
2458
2459             r22              = _mm_mul_pd(rsq22,rinv22);
2460
2461             /* EWALD ELECTROSTATICS */
2462
2463             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2464             ewrt             = _mm_mul_pd(r22,ewtabscale);
2465             ewitab           = _mm_cvttpd_epi32(ewrt);
2466             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2467             gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2468             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2469             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2470
2471             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
2472
2473             fscal            = felec;
2474
2475             fscal            = _mm_and_pd(fscal,cutoff_mask);
2476
2477             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2478
2479             /* Calculate temporary vectorial force */
2480             tx               = _mm_mul_pd(fscal,dx22);
2481             ty               = _mm_mul_pd(fscal,dy22);
2482             tz               = _mm_mul_pd(fscal,dz22);
2483
2484             /* Update vectorial force */
2485             fix2             = _mm_add_pd(fix2,tx);
2486             fiy2             = _mm_add_pd(fiy2,ty);
2487             fiz2             = _mm_add_pd(fiz2,tz);
2488
2489             fjx2             = _mm_add_pd(fjx2,tx);
2490             fjy2             = _mm_add_pd(fjy2,ty);
2491             fjz2             = _mm_add_pd(fjz2,tz);
2492
2493             }
2494
2495             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2496
2497             /* Inner loop uses 358 flops */
2498         }
2499
2500         /* End of innermost loop */
2501
2502         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2503                                               f+i_coord_offset,fshift+i_shift_offset);
2504
2505         /* Increment number of inner iterations */
2506         inneriter                  += j_index_end - j_index_start;
2507
2508         /* Outer loop uses 18 flops */
2509     }
2510
2511     /* Increment number of outer iterations */
2512     outeriter        += nri;
2513
2514     /* Update outer/inner flops */
2515
2516     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*358);
2517 }