Compile nonbonded kernels as C++
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_double / nb_kernel_ElecEwSw_VdwNone_GeomW3W3_sse2_double.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse2_double kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_sse2_double.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sse2_double
51  * Electrostatics interaction: Ewald
52  * VdW interaction:            None
53  * Geometry:                   Water3-Water3
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sse2_double
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67      * just 0 for non-waters.
68      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB;
74     int              j_coord_offsetA,j_coord_offsetB;
75     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
76     real             rcutoff_scalar;
77     real             *shiftvec,*fshift,*x,*f;
78     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
79     int              vdwioffset0;
80     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
81     int              vdwioffset1;
82     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
83     int              vdwioffset2;
84     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
85     int              vdwjidx0A,vdwjidx0B;
86     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
87     int              vdwjidx1A,vdwjidx1B;
88     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
89     int              vdwjidx2A,vdwjidx2B;
90     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
91     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
92     __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
93     __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
94     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
95     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
96     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
97     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
98     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
99     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
100     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
101     real             *charge;
102     __m128i          ewitab;
103     __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
104     real             *ewtab;
105     __m128d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
106     real             rswitch_scalar,d_scalar;
107     __m128d          dummy_mask,cutoff_mask;
108     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
109     __m128d          one     = _mm_set1_pd(1.0);
110     __m128d          two     = _mm_set1_pd(2.0);
111     x                = xx[0];
112     f                = ff[0];
113
114     nri              = nlist->nri;
115     iinr             = nlist->iinr;
116     jindex           = nlist->jindex;
117     jjnr             = nlist->jjnr;
118     shiftidx         = nlist->shift;
119     gid              = nlist->gid;
120     shiftvec         = fr->shift_vec[0];
121     fshift           = fr->fshift[0];
122     facel            = _mm_set1_pd(fr->ic->epsfac);
123     charge           = mdatoms->chargeA;
124
125     sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
126     ewtab            = fr->ic->tabq_coul_FDV0;
127     ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
128     ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
129
130     /* Setup water-specific parameters */
131     inr              = nlist->iinr[0];
132     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
133     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
134     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
135
136     jq0              = _mm_set1_pd(charge[inr+0]);
137     jq1              = _mm_set1_pd(charge[inr+1]);
138     jq2              = _mm_set1_pd(charge[inr+2]);
139     qq00             = _mm_mul_pd(iq0,jq0);
140     qq01             = _mm_mul_pd(iq0,jq1);
141     qq02             = _mm_mul_pd(iq0,jq2);
142     qq10             = _mm_mul_pd(iq1,jq0);
143     qq11             = _mm_mul_pd(iq1,jq1);
144     qq12             = _mm_mul_pd(iq1,jq2);
145     qq20             = _mm_mul_pd(iq2,jq0);
146     qq21             = _mm_mul_pd(iq2,jq1);
147     qq22             = _mm_mul_pd(iq2,jq2);
148
149     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
150     rcutoff_scalar   = fr->ic->rcoulomb;
151     rcutoff          = _mm_set1_pd(rcutoff_scalar);
152     rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
153
154     rswitch_scalar   = fr->ic->rcoulomb_switch;
155     rswitch          = _mm_set1_pd(rswitch_scalar);
156     /* Setup switch parameters */
157     d_scalar         = rcutoff_scalar-rswitch_scalar;
158     d                = _mm_set1_pd(d_scalar);
159     swV3             = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
160     swV4             = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
161     swV5             = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
162     swF2             = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
163     swF3             = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
164     swF4             = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
165
166     /* Avoid stupid compiler warnings */
167     jnrA = jnrB = 0;
168     j_coord_offsetA = 0;
169     j_coord_offsetB = 0;
170
171     outeriter        = 0;
172     inneriter        = 0;
173
174     /* Start outer loop over neighborlists */
175     for(iidx=0; iidx<nri; iidx++)
176     {
177         /* Load shift vector for this list */
178         i_shift_offset   = DIM*shiftidx[iidx];
179
180         /* Load limits for loop over neighbors */
181         j_index_start    = jindex[iidx];
182         j_index_end      = jindex[iidx+1];
183
184         /* Get outer coordinate index */
185         inr              = iinr[iidx];
186         i_coord_offset   = DIM*inr;
187
188         /* Load i particle coords and add shift vector */
189         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
190                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
191
192         fix0             = _mm_setzero_pd();
193         fiy0             = _mm_setzero_pd();
194         fiz0             = _mm_setzero_pd();
195         fix1             = _mm_setzero_pd();
196         fiy1             = _mm_setzero_pd();
197         fiz1             = _mm_setzero_pd();
198         fix2             = _mm_setzero_pd();
199         fiy2             = _mm_setzero_pd();
200         fiz2             = _mm_setzero_pd();
201
202         /* Reset potential sums */
203         velecsum         = _mm_setzero_pd();
204
205         /* Start inner kernel loop */
206         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
207         {
208
209             /* Get j neighbor index, and coordinate index */
210             jnrA             = jjnr[jidx];
211             jnrB             = jjnr[jidx+1];
212             j_coord_offsetA  = DIM*jnrA;
213             j_coord_offsetB  = DIM*jnrB;
214
215             /* load j atom coordinates */
216             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
217                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
218
219             /* Calculate displacement vector */
220             dx00             = _mm_sub_pd(ix0,jx0);
221             dy00             = _mm_sub_pd(iy0,jy0);
222             dz00             = _mm_sub_pd(iz0,jz0);
223             dx01             = _mm_sub_pd(ix0,jx1);
224             dy01             = _mm_sub_pd(iy0,jy1);
225             dz01             = _mm_sub_pd(iz0,jz1);
226             dx02             = _mm_sub_pd(ix0,jx2);
227             dy02             = _mm_sub_pd(iy0,jy2);
228             dz02             = _mm_sub_pd(iz0,jz2);
229             dx10             = _mm_sub_pd(ix1,jx0);
230             dy10             = _mm_sub_pd(iy1,jy0);
231             dz10             = _mm_sub_pd(iz1,jz0);
232             dx11             = _mm_sub_pd(ix1,jx1);
233             dy11             = _mm_sub_pd(iy1,jy1);
234             dz11             = _mm_sub_pd(iz1,jz1);
235             dx12             = _mm_sub_pd(ix1,jx2);
236             dy12             = _mm_sub_pd(iy1,jy2);
237             dz12             = _mm_sub_pd(iz1,jz2);
238             dx20             = _mm_sub_pd(ix2,jx0);
239             dy20             = _mm_sub_pd(iy2,jy0);
240             dz20             = _mm_sub_pd(iz2,jz0);
241             dx21             = _mm_sub_pd(ix2,jx1);
242             dy21             = _mm_sub_pd(iy2,jy1);
243             dz21             = _mm_sub_pd(iz2,jz1);
244             dx22             = _mm_sub_pd(ix2,jx2);
245             dy22             = _mm_sub_pd(iy2,jy2);
246             dz22             = _mm_sub_pd(iz2,jz2);
247
248             /* Calculate squared distance and things based on it */
249             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
250             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
251             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
252             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
253             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
254             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
255             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
256             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
257             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
258
259             rinv00           = sse2_invsqrt_d(rsq00);
260             rinv01           = sse2_invsqrt_d(rsq01);
261             rinv02           = sse2_invsqrt_d(rsq02);
262             rinv10           = sse2_invsqrt_d(rsq10);
263             rinv11           = sse2_invsqrt_d(rsq11);
264             rinv12           = sse2_invsqrt_d(rsq12);
265             rinv20           = sse2_invsqrt_d(rsq20);
266             rinv21           = sse2_invsqrt_d(rsq21);
267             rinv22           = sse2_invsqrt_d(rsq22);
268
269             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
270             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
271             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
272             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
273             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
274             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
275             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
276             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
277             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
278
279             fjx0             = _mm_setzero_pd();
280             fjy0             = _mm_setzero_pd();
281             fjz0             = _mm_setzero_pd();
282             fjx1             = _mm_setzero_pd();
283             fjy1             = _mm_setzero_pd();
284             fjz1             = _mm_setzero_pd();
285             fjx2             = _mm_setzero_pd();
286             fjy2             = _mm_setzero_pd();
287             fjz2             = _mm_setzero_pd();
288
289             /**************************
290              * CALCULATE INTERACTIONS *
291              **************************/
292
293             if (gmx_mm_any_lt(rsq00,rcutoff2))
294             {
295
296             r00              = _mm_mul_pd(rsq00,rinv00);
297
298             /* EWALD ELECTROSTATICS */
299
300             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
301             ewrt             = _mm_mul_pd(r00,ewtabscale);
302             ewitab           = _mm_cvttpd_epi32(ewrt);
303             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
304             ewitab           = _mm_slli_epi32(ewitab,2);
305             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
306             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
307             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
308             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
309             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
310             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
311             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
312             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
313             velec            = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
314             felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
315
316             d                = _mm_sub_pd(r00,rswitch);
317             d                = _mm_max_pd(d,_mm_setzero_pd());
318             d2               = _mm_mul_pd(d,d);
319             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
320
321             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
322
323             /* Evaluate switch function */
324             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
325             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(velec,dsw)) );
326             velec            = _mm_mul_pd(velec,sw);
327             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
328
329             /* Update potential sum for this i atom from the interaction with this j atom. */
330             velec            = _mm_and_pd(velec,cutoff_mask);
331             velecsum         = _mm_add_pd(velecsum,velec);
332
333             fscal            = felec;
334
335             fscal            = _mm_and_pd(fscal,cutoff_mask);
336
337             /* Calculate temporary vectorial force */
338             tx               = _mm_mul_pd(fscal,dx00);
339             ty               = _mm_mul_pd(fscal,dy00);
340             tz               = _mm_mul_pd(fscal,dz00);
341
342             /* Update vectorial force */
343             fix0             = _mm_add_pd(fix0,tx);
344             fiy0             = _mm_add_pd(fiy0,ty);
345             fiz0             = _mm_add_pd(fiz0,tz);
346
347             fjx0             = _mm_add_pd(fjx0,tx);
348             fjy0             = _mm_add_pd(fjy0,ty);
349             fjz0             = _mm_add_pd(fjz0,tz);
350
351             }
352
353             /**************************
354              * CALCULATE INTERACTIONS *
355              **************************/
356
357             if (gmx_mm_any_lt(rsq01,rcutoff2))
358             {
359
360             r01              = _mm_mul_pd(rsq01,rinv01);
361
362             /* EWALD ELECTROSTATICS */
363
364             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
365             ewrt             = _mm_mul_pd(r01,ewtabscale);
366             ewitab           = _mm_cvttpd_epi32(ewrt);
367             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
368             ewitab           = _mm_slli_epi32(ewitab,2);
369             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
370             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
371             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
372             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
373             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
374             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
375             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
376             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
377             velec            = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
378             felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
379
380             d                = _mm_sub_pd(r01,rswitch);
381             d                = _mm_max_pd(d,_mm_setzero_pd());
382             d2               = _mm_mul_pd(d,d);
383             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
384
385             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
386
387             /* Evaluate switch function */
388             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
389             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv01,_mm_mul_pd(velec,dsw)) );
390             velec            = _mm_mul_pd(velec,sw);
391             cutoff_mask      = _mm_cmplt_pd(rsq01,rcutoff2);
392
393             /* Update potential sum for this i atom from the interaction with this j atom. */
394             velec            = _mm_and_pd(velec,cutoff_mask);
395             velecsum         = _mm_add_pd(velecsum,velec);
396
397             fscal            = felec;
398
399             fscal            = _mm_and_pd(fscal,cutoff_mask);
400
401             /* Calculate temporary vectorial force */
402             tx               = _mm_mul_pd(fscal,dx01);
403             ty               = _mm_mul_pd(fscal,dy01);
404             tz               = _mm_mul_pd(fscal,dz01);
405
406             /* Update vectorial force */
407             fix0             = _mm_add_pd(fix0,tx);
408             fiy0             = _mm_add_pd(fiy0,ty);
409             fiz0             = _mm_add_pd(fiz0,tz);
410
411             fjx1             = _mm_add_pd(fjx1,tx);
412             fjy1             = _mm_add_pd(fjy1,ty);
413             fjz1             = _mm_add_pd(fjz1,tz);
414
415             }
416
417             /**************************
418              * CALCULATE INTERACTIONS *
419              **************************/
420
421             if (gmx_mm_any_lt(rsq02,rcutoff2))
422             {
423
424             r02              = _mm_mul_pd(rsq02,rinv02);
425
426             /* EWALD ELECTROSTATICS */
427
428             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
429             ewrt             = _mm_mul_pd(r02,ewtabscale);
430             ewitab           = _mm_cvttpd_epi32(ewrt);
431             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
432             ewitab           = _mm_slli_epi32(ewitab,2);
433             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
434             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
435             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
436             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
437             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
438             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
439             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
440             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
441             velec            = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
442             felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
443
444             d                = _mm_sub_pd(r02,rswitch);
445             d                = _mm_max_pd(d,_mm_setzero_pd());
446             d2               = _mm_mul_pd(d,d);
447             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
448
449             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
450
451             /* Evaluate switch function */
452             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
453             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv02,_mm_mul_pd(velec,dsw)) );
454             velec            = _mm_mul_pd(velec,sw);
455             cutoff_mask      = _mm_cmplt_pd(rsq02,rcutoff2);
456
457             /* Update potential sum for this i atom from the interaction with this j atom. */
458             velec            = _mm_and_pd(velec,cutoff_mask);
459             velecsum         = _mm_add_pd(velecsum,velec);
460
461             fscal            = felec;
462
463             fscal            = _mm_and_pd(fscal,cutoff_mask);
464
465             /* Calculate temporary vectorial force */
466             tx               = _mm_mul_pd(fscal,dx02);
467             ty               = _mm_mul_pd(fscal,dy02);
468             tz               = _mm_mul_pd(fscal,dz02);
469
470             /* Update vectorial force */
471             fix0             = _mm_add_pd(fix0,tx);
472             fiy0             = _mm_add_pd(fiy0,ty);
473             fiz0             = _mm_add_pd(fiz0,tz);
474
475             fjx2             = _mm_add_pd(fjx2,tx);
476             fjy2             = _mm_add_pd(fjy2,ty);
477             fjz2             = _mm_add_pd(fjz2,tz);
478
479             }
480
481             /**************************
482              * CALCULATE INTERACTIONS *
483              **************************/
484
485             if (gmx_mm_any_lt(rsq10,rcutoff2))
486             {
487
488             r10              = _mm_mul_pd(rsq10,rinv10);
489
490             /* EWALD ELECTROSTATICS */
491
492             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
493             ewrt             = _mm_mul_pd(r10,ewtabscale);
494             ewitab           = _mm_cvttpd_epi32(ewrt);
495             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
496             ewitab           = _mm_slli_epi32(ewitab,2);
497             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
498             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
499             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
500             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
501             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
502             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
503             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
504             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
505             velec            = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
506             felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
507
508             d                = _mm_sub_pd(r10,rswitch);
509             d                = _mm_max_pd(d,_mm_setzero_pd());
510             d2               = _mm_mul_pd(d,d);
511             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
512
513             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
514
515             /* Evaluate switch function */
516             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
517             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
518             velec            = _mm_mul_pd(velec,sw);
519             cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
520
521             /* Update potential sum for this i atom from the interaction with this j atom. */
522             velec            = _mm_and_pd(velec,cutoff_mask);
523             velecsum         = _mm_add_pd(velecsum,velec);
524
525             fscal            = felec;
526
527             fscal            = _mm_and_pd(fscal,cutoff_mask);
528
529             /* Calculate temporary vectorial force */
530             tx               = _mm_mul_pd(fscal,dx10);
531             ty               = _mm_mul_pd(fscal,dy10);
532             tz               = _mm_mul_pd(fscal,dz10);
533
534             /* Update vectorial force */
535             fix1             = _mm_add_pd(fix1,tx);
536             fiy1             = _mm_add_pd(fiy1,ty);
537             fiz1             = _mm_add_pd(fiz1,tz);
538
539             fjx0             = _mm_add_pd(fjx0,tx);
540             fjy0             = _mm_add_pd(fjy0,ty);
541             fjz0             = _mm_add_pd(fjz0,tz);
542
543             }
544
545             /**************************
546              * CALCULATE INTERACTIONS *
547              **************************/
548
549             if (gmx_mm_any_lt(rsq11,rcutoff2))
550             {
551
552             r11              = _mm_mul_pd(rsq11,rinv11);
553
554             /* EWALD ELECTROSTATICS */
555
556             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
557             ewrt             = _mm_mul_pd(r11,ewtabscale);
558             ewitab           = _mm_cvttpd_epi32(ewrt);
559             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
560             ewitab           = _mm_slli_epi32(ewitab,2);
561             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
562             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
563             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
564             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
565             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
566             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
567             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
568             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
569             velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
570             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
571
572             d                = _mm_sub_pd(r11,rswitch);
573             d                = _mm_max_pd(d,_mm_setzero_pd());
574             d2               = _mm_mul_pd(d,d);
575             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
576
577             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
578
579             /* Evaluate switch function */
580             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
581             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
582             velec            = _mm_mul_pd(velec,sw);
583             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
584
585             /* Update potential sum for this i atom from the interaction with this j atom. */
586             velec            = _mm_and_pd(velec,cutoff_mask);
587             velecsum         = _mm_add_pd(velecsum,velec);
588
589             fscal            = felec;
590
591             fscal            = _mm_and_pd(fscal,cutoff_mask);
592
593             /* Calculate temporary vectorial force */
594             tx               = _mm_mul_pd(fscal,dx11);
595             ty               = _mm_mul_pd(fscal,dy11);
596             tz               = _mm_mul_pd(fscal,dz11);
597
598             /* Update vectorial force */
599             fix1             = _mm_add_pd(fix1,tx);
600             fiy1             = _mm_add_pd(fiy1,ty);
601             fiz1             = _mm_add_pd(fiz1,tz);
602
603             fjx1             = _mm_add_pd(fjx1,tx);
604             fjy1             = _mm_add_pd(fjy1,ty);
605             fjz1             = _mm_add_pd(fjz1,tz);
606
607             }
608
609             /**************************
610              * CALCULATE INTERACTIONS *
611              **************************/
612
613             if (gmx_mm_any_lt(rsq12,rcutoff2))
614             {
615
616             r12              = _mm_mul_pd(rsq12,rinv12);
617
618             /* EWALD ELECTROSTATICS */
619
620             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
621             ewrt             = _mm_mul_pd(r12,ewtabscale);
622             ewitab           = _mm_cvttpd_epi32(ewrt);
623             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
624             ewitab           = _mm_slli_epi32(ewitab,2);
625             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
626             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
627             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
628             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
629             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
630             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
631             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
632             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
633             velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
634             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
635
636             d                = _mm_sub_pd(r12,rswitch);
637             d                = _mm_max_pd(d,_mm_setzero_pd());
638             d2               = _mm_mul_pd(d,d);
639             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
640
641             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
642
643             /* Evaluate switch function */
644             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
645             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
646             velec            = _mm_mul_pd(velec,sw);
647             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
648
649             /* Update potential sum for this i atom from the interaction with this j atom. */
650             velec            = _mm_and_pd(velec,cutoff_mask);
651             velecsum         = _mm_add_pd(velecsum,velec);
652
653             fscal            = felec;
654
655             fscal            = _mm_and_pd(fscal,cutoff_mask);
656
657             /* Calculate temporary vectorial force */
658             tx               = _mm_mul_pd(fscal,dx12);
659             ty               = _mm_mul_pd(fscal,dy12);
660             tz               = _mm_mul_pd(fscal,dz12);
661
662             /* Update vectorial force */
663             fix1             = _mm_add_pd(fix1,tx);
664             fiy1             = _mm_add_pd(fiy1,ty);
665             fiz1             = _mm_add_pd(fiz1,tz);
666
667             fjx2             = _mm_add_pd(fjx2,tx);
668             fjy2             = _mm_add_pd(fjy2,ty);
669             fjz2             = _mm_add_pd(fjz2,tz);
670
671             }
672
673             /**************************
674              * CALCULATE INTERACTIONS *
675              **************************/
676
677             if (gmx_mm_any_lt(rsq20,rcutoff2))
678             {
679
680             r20              = _mm_mul_pd(rsq20,rinv20);
681
682             /* EWALD ELECTROSTATICS */
683
684             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
685             ewrt             = _mm_mul_pd(r20,ewtabscale);
686             ewitab           = _mm_cvttpd_epi32(ewrt);
687             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
688             ewitab           = _mm_slli_epi32(ewitab,2);
689             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
690             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
691             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
692             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
693             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
694             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
695             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
696             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
697             velec            = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
698             felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
699
700             d                = _mm_sub_pd(r20,rswitch);
701             d                = _mm_max_pd(d,_mm_setzero_pd());
702             d2               = _mm_mul_pd(d,d);
703             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
704
705             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
706
707             /* Evaluate switch function */
708             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
709             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
710             velec            = _mm_mul_pd(velec,sw);
711             cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
712
713             /* Update potential sum for this i atom from the interaction with this j atom. */
714             velec            = _mm_and_pd(velec,cutoff_mask);
715             velecsum         = _mm_add_pd(velecsum,velec);
716
717             fscal            = felec;
718
719             fscal            = _mm_and_pd(fscal,cutoff_mask);
720
721             /* Calculate temporary vectorial force */
722             tx               = _mm_mul_pd(fscal,dx20);
723             ty               = _mm_mul_pd(fscal,dy20);
724             tz               = _mm_mul_pd(fscal,dz20);
725
726             /* Update vectorial force */
727             fix2             = _mm_add_pd(fix2,tx);
728             fiy2             = _mm_add_pd(fiy2,ty);
729             fiz2             = _mm_add_pd(fiz2,tz);
730
731             fjx0             = _mm_add_pd(fjx0,tx);
732             fjy0             = _mm_add_pd(fjy0,ty);
733             fjz0             = _mm_add_pd(fjz0,tz);
734
735             }
736
737             /**************************
738              * CALCULATE INTERACTIONS *
739              **************************/
740
741             if (gmx_mm_any_lt(rsq21,rcutoff2))
742             {
743
744             r21              = _mm_mul_pd(rsq21,rinv21);
745
746             /* EWALD ELECTROSTATICS */
747
748             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
749             ewrt             = _mm_mul_pd(r21,ewtabscale);
750             ewitab           = _mm_cvttpd_epi32(ewrt);
751             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
752             ewitab           = _mm_slli_epi32(ewitab,2);
753             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
754             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
755             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
756             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
757             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
758             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
759             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
760             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
761             velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
762             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
763
764             d                = _mm_sub_pd(r21,rswitch);
765             d                = _mm_max_pd(d,_mm_setzero_pd());
766             d2               = _mm_mul_pd(d,d);
767             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
768
769             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
770
771             /* Evaluate switch function */
772             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
773             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
774             velec            = _mm_mul_pd(velec,sw);
775             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
776
777             /* Update potential sum for this i atom from the interaction with this j atom. */
778             velec            = _mm_and_pd(velec,cutoff_mask);
779             velecsum         = _mm_add_pd(velecsum,velec);
780
781             fscal            = felec;
782
783             fscal            = _mm_and_pd(fscal,cutoff_mask);
784
785             /* Calculate temporary vectorial force */
786             tx               = _mm_mul_pd(fscal,dx21);
787             ty               = _mm_mul_pd(fscal,dy21);
788             tz               = _mm_mul_pd(fscal,dz21);
789
790             /* Update vectorial force */
791             fix2             = _mm_add_pd(fix2,tx);
792             fiy2             = _mm_add_pd(fiy2,ty);
793             fiz2             = _mm_add_pd(fiz2,tz);
794
795             fjx1             = _mm_add_pd(fjx1,tx);
796             fjy1             = _mm_add_pd(fjy1,ty);
797             fjz1             = _mm_add_pd(fjz1,tz);
798
799             }
800
801             /**************************
802              * CALCULATE INTERACTIONS *
803              **************************/
804
805             if (gmx_mm_any_lt(rsq22,rcutoff2))
806             {
807
808             r22              = _mm_mul_pd(rsq22,rinv22);
809
810             /* EWALD ELECTROSTATICS */
811
812             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
813             ewrt             = _mm_mul_pd(r22,ewtabscale);
814             ewitab           = _mm_cvttpd_epi32(ewrt);
815             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
816             ewitab           = _mm_slli_epi32(ewitab,2);
817             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
818             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
819             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
820             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
821             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
822             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
823             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
824             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
825             velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
826             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
827
828             d                = _mm_sub_pd(r22,rswitch);
829             d                = _mm_max_pd(d,_mm_setzero_pd());
830             d2               = _mm_mul_pd(d,d);
831             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
832
833             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
834
835             /* Evaluate switch function */
836             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
837             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
838             velec            = _mm_mul_pd(velec,sw);
839             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
840
841             /* Update potential sum for this i atom from the interaction with this j atom. */
842             velec            = _mm_and_pd(velec,cutoff_mask);
843             velecsum         = _mm_add_pd(velecsum,velec);
844
845             fscal            = felec;
846
847             fscal            = _mm_and_pd(fscal,cutoff_mask);
848
849             /* Calculate temporary vectorial force */
850             tx               = _mm_mul_pd(fscal,dx22);
851             ty               = _mm_mul_pd(fscal,dy22);
852             tz               = _mm_mul_pd(fscal,dz22);
853
854             /* Update vectorial force */
855             fix2             = _mm_add_pd(fix2,tx);
856             fiy2             = _mm_add_pd(fiy2,ty);
857             fiz2             = _mm_add_pd(fiz2,tz);
858
859             fjx2             = _mm_add_pd(fjx2,tx);
860             fjy2             = _mm_add_pd(fjy2,ty);
861             fjz2             = _mm_add_pd(fjz2,tz);
862
863             }
864
865             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
866
867             /* Inner loop uses 585 flops */
868         }
869
870         if(jidx<j_index_end)
871         {
872
873             jnrA             = jjnr[jidx];
874             j_coord_offsetA  = DIM*jnrA;
875
876             /* load j atom coordinates */
877             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
878                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
879
880             /* Calculate displacement vector */
881             dx00             = _mm_sub_pd(ix0,jx0);
882             dy00             = _mm_sub_pd(iy0,jy0);
883             dz00             = _mm_sub_pd(iz0,jz0);
884             dx01             = _mm_sub_pd(ix0,jx1);
885             dy01             = _mm_sub_pd(iy0,jy1);
886             dz01             = _mm_sub_pd(iz0,jz1);
887             dx02             = _mm_sub_pd(ix0,jx2);
888             dy02             = _mm_sub_pd(iy0,jy2);
889             dz02             = _mm_sub_pd(iz0,jz2);
890             dx10             = _mm_sub_pd(ix1,jx0);
891             dy10             = _mm_sub_pd(iy1,jy0);
892             dz10             = _mm_sub_pd(iz1,jz0);
893             dx11             = _mm_sub_pd(ix1,jx1);
894             dy11             = _mm_sub_pd(iy1,jy1);
895             dz11             = _mm_sub_pd(iz1,jz1);
896             dx12             = _mm_sub_pd(ix1,jx2);
897             dy12             = _mm_sub_pd(iy1,jy2);
898             dz12             = _mm_sub_pd(iz1,jz2);
899             dx20             = _mm_sub_pd(ix2,jx0);
900             dy20             = _mm_sub_pd(iy2,jy0);
901             dz20             = _mm_sub_pd(iz2,jz0);
902             dx21             = _mm_sub_pd(ix2,jx1);
903             dy21             = _mm_sub_pd(iy2,jy1);
904             dz21             = _mm_sub_pd(iz2,jz1);
905             dx22             = _mm_sub_pd(ix2,jx2);
906             dy22             = _mm_sub_pd(iy2,jy2);
907             dz22             = _mm_sub_pd(iz2,jz2);
908
909             /* Calculate squared distance and things based on it */
910             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
911             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
912             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
913             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
914             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
915             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
916             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
917             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
918             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
919
920             rinv00           = sse2_invsqrt_d(rsq00);
921             rinv01           = sse2_invsqrt_d(rsq01);
922             rinv02           = sse2_invsqrt_d(rsq02);
923             rinv10           = sse2_invsqrt_d(rsq10);
924             rinv11           = sse2_invsqrt_d(rsq11);
925             rinv12           = sse2_invsqrt_d(rsq12);
926             rinv20           = sse2_invsqrt_d(rsq20);
927             rinv21           = sse2_invsqrt_d(rsq21);
928             rinv22           = sse2_invsqrt_d(rsq22);
929
930             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
931             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
932             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
933             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
934             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
935             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
936             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
937             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
938             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
939
940             fjx0             = _mm_setzero_pd();
941             fjy0             = _mm_setzero_pd();
942             fjz0             = _mm_setzero_pd();
943             fjx1             = _mm_setzero_pd();
944             fjy1             = _mm_setzero_pd();
945             fjz1             = _mm_setzero_pd();
946             fjx2             = _mm_setzero_pd();
947             fjy2             = _mm_setzero_pd();
948             fjz2             = _mm_setzero_pd();
949
950             /**************************
951              * CALCULATE INTERACTIONS *
952              **************************/
953
954             if (gmx_mm_any_lt(rsq00,rcutoff2))
955             {
956
957             r00              = _mm_mul_pd(rsq00,rinv00);
958
959             /* EWALD ELECTROSTATICS */
960
961             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
962             ewrt             = _mm_mul_pd(r00,ewtabscale);
963             ewitab           = _mm_cvttpd_epi32(ewrt);
964             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
965             ewitab           = _mm_slli_epi32(ewitab,2);
966             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
967             ewtabD           = _mm_setzero_pd();
968             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
969             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
970             ewtabFn          = _mm_setzero_pd();
971             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
972             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
973             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
974             velec            = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
975             felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
976
977             d                = _mm_sub_pd(r00,rswitch);
978             d                = _mm_max_pd(d,_mm_setzero_pd());
979             d2               = _mm_mul_pd(d,d);
980             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
981
982             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
983
984             /* Evaluate switch function */
985             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
986             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(velec,dsw)) );
987             velec            = _mm_mul_pd(velec,sw);
988             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
989
990             /* Update potential sum for this i atom from the interaction with this j atom. */
991             velec            = _mm_and_pd(velec,cutoff_mask);
992             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
993             velecsum         = _mm_add_pd(velecsum,velec);
994
995             fscal            = felec;
996
997             fscal            = _mm_and_pd(fscal,cutoff_mask);
998
999             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1000
1001             /* Calculate temporary vectorial force */
1002             tx               = _mm_mul_pd(fscal,dx00);
1003             ty               = _mm_mul_pd(fscal,dy00);
1004             tz               = _mm_mul_pd(fscal,dz00);
1005
1006             /* Update vectorial force */
1007             fix0             = _mm_add_pd(fix0,tx);
1008             fiy0             = _mm_add_pd(fiy0,ty);
1009             fiz0             = _mm_add_pd(fiz0,tz);
1010
1011             fjx0             = _mm_add_pd(fjx0,tx);
1012             fjy0             = _mm_add_pd(fjy0,ty);
1013             fjz0             = _mm_add_pd(fjz0,tz);
1014
1015             }
1016
1017             /**************************
1018              * CALCULATE INTERACTIONS *
1019              **************************/
1020
1021             if (gmx_mm_any_lt(rsq01,rcutoff2))
1022             {
1023
1024             r01              = _mm_mul_pd(rsq01,rinv01);
1025
1026             /* EWALD ELECTROSTATICS */
1027
1028             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1029             ewrt             = _mm_mul_pd(r01,ewtabscale);
1030             ewitab           = _mm_cvttpd_epi32(ewrt);
1031             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1032             ewitab           = _mm_slli_epi32(ewitab,2);
1033             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1034             ewtabD           = _mm_setzero_pd();
1035             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1036             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1037             ewtabFn          = _mm_setzero_pd();
1038             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1039             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1040             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1041             velec            = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
1042             felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
1043
1044             d                = _mm_sub_pd(r01,rswitch);
1045             d                = _mm_max_pd(d,_mm_setzero_pd());
1046             d2               = _mm_mul_pd(d,d);
1047             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1048
1049             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1050
1051             /* Evaluate switch function */
1052             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1053             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv01,_mm_mul_pd(velec,dsw)) );
1054             velec            = _mm_mul_pd(velec,sw);
1055             cutoff_mask      = _mm_cmplt_pd(rsq01,rcutoff2);
1056
1057             /* Update potential sum for this i atom from the interaction with this j atom. */
1058             velec            = _mm_and_pd(velec,cutoff_mask);
1059             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1060             velecsum         = _mm_add_pd(velecsum,velec);
1061
1062             fscal            = felec;
1063
1064             fscal            = _mm_and_pd(fscal,cutoff_mask);
1065
1066             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1067
1068             /* Calculate temporary vectorial force */
1069             tx               = _mm_mul_pd(fscal,dx01);
1070             ty               = _mm_mul_pd(fscal,dy01);
1071             tz               = _mm_mul_pd(fscal,dz01);
1072
1073             /* Update vectorial force */
1074             fix0             = _mm_add_pd(fix0,tx);
1075             fiy0             = _mm_add_pd(fiy0,ty);
1076             fiz0             = _mm_add_pd(fiz0,tz);
1077
1078             fjx1             = _mm_add_pd(fjx1,tx);
1079             fjy1             = _mm_add_pd(fjy1,ty);
1080             fjz1             = _mm_add_pd(fjz1,tz);
1081
1082             }
1083
1084             /**************************
1085              * CALCULATE INTERACTIONS *
1086              **************************/
1087
1088             if (gmx_mm_any_lt(rsq02,rcutoff2))
1089             {
1090
1091             r02              = _mm_mul_pd(rsq02,rinv02);
1092
1093             /* EWALD ELECTROSTATICS */
1094
1095             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1096             ewrt             = _mm_mul_pd(r02,ewtabscale);
1097             ewitab           = _mm_cvttpd_epi32(ewrt);
1098             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1099             ewitab           = _mm_slli_epi32(ewitab,2);
1100             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1101             ewtabD           = _mm_setzero_pd();
1102             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1103             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1104             ewtabFn          = _mm_setzero_pd();
1105             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1106             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1107             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1108             velec            = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
1109             felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
1110
1111             d                = _mm_sub_pd(r02,rswitch);
1112             d                = _mm_max_pd(d,_mm_setzero_pd());
1113             d2               = _mm_mul_pd(d,d);
1114             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1115
1116             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1117
1118             /* Evaluate switch function */
1119             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1120             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv02,_mm_mul_pd(velec,dsw)) );
1121             velec            = _mm_mul_pd(velec,sw);
1122             cutoff_mask      = _mm_cmplt_pd(rsq02,rcutoff2);
1123
1124             /* Update potential sum for this i atom from the interaction with this j atom. */
1125             velec            = _mm_and_pd(velec,cutoff_mask);
1126             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1127             velecsum         = _mm_add_pd(velecsum,velec);
1128
1129             fscal            = felec;
1130
1131             fscal            = _mm_and_pd(fscal,cutoff_mask);
1132
1133             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1134
1135             /* Calculate temporary vectorial force */
1136             tx               = _mm_mul_pd(fscal,dx02);
1137             ty               = _mm_mul_pd(fscal,dy02);
1138             tz               = _mm_mul_pd(fscal,dz02);
1139
1140             /* Update vectorial force */
1141             fix0             = _mm_add_pd(fix0,tx);
1142             fiy0             = _mm_add_pd(fiy0,ty);
1143             fiz0             = _mm_add_pd(fiz0,tz);
1144
1145             fjx2             = _mm_add_pd(fjx2,tx);
1146             fjy2             = _mm_add_pd(fjy2,ty);
1147             fjz2             = _mm_add_pd(fjz2,tz);
1148
1149             }
1150
1151             /**************************
1152              * CALCULATE INTERACTIONS *
1153              **************************/
1154
1155             if (gmx_mm_any_lt(rsq10,rcutoff2))
1156             {
1157
1158             r10              = _mm_mul_pd(rsq10,rinv10);
1159
1160             /* EWALD ELECTROSTATICS */
1161
1162             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1163             ewrt             = _mm_mul_pd(r10,ewtabscale);
1164             ewitab           = _mm_cvttpd_epi32(ewrt);
1165             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1166             ewitab           = _mm_slli_epi32(ewitab,2);
1167             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1168             ewtabD           = _mm_setzero_pd();
1169             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1170             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1171             ewtabFn          = _mm_setzero_pd();
1172             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1173             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1174             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1175             velec            = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
1176             felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1177
1178             d                = _mm_sub_pd(r10,rswitch);
1179             d                = _mm_max_pd(d,_mm_setzero_pd());
1180             d2               = _mm_mul_pd(d,d);
1181             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1182
1183             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1184
1185             /* Evaluate switch function */
1186             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1187             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
1188             velec            = _mm_mul_pd(velec,sw);
1189             cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
1190
1191             /* Update potential sum for this i atom from the interaction with this j atom. */
1192             velec            = _mm_and_pd(velec,cutoff_mask);
1193             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1194             velecsum         = _mm_add_pd(velecsum,velec);
1195
1196             fscal            = felec;
1197
1198             fscal            = _mm_and_pd(fscal,cutoff_mask);
1199
1200             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1201
1202             /* Calculate temporary vectorial force */
1203             tx               = _mm_mul_pd(fscal,dx10);
1204             ty               = _mm_mul_pd(fscal,dy10);
1205             tz               = _mm_mul_pd(fscal,dz10);
1206
1207             /* Update vectorial force */
1208             fix1             = _mm_add_pd(fix1,tx);
1209             fiy1             = _mm_add_pd(fiy1,ty);
1210             fiz1             = _mm_add_pd(fiz1,tz);
1211
1212             fjx0             = _mm_add_pd(fjx0,tx);
1213             fjy0             = _mm_add_pd(fjy0,ty);
1214             fjz0             = _mm_add_pd(fjz0,tz);
1215
1216             }
1217
1218             /**************************
1219              * CALCULATE INTERACTIONS *
1220              **************************/
1221
1222             if (gmx_mm_any_lt(rsq11,rcutoff2))
1223             {
1224
1225             r11              = _mm_mul_pd(rsq11,rinv11);
1226
1227             /* EWALD ELECTROSTATICS */
1228
1229             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1230             ewrt             = _mm_mul_pd(r11,ewtabscale);
1231             ewitab           = _mm_cvttpd_epi32(ewrt);
1232             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1233             ewitab           = _mm_slli_epi32(ewitab,2);
1234             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1235             ewtabD           = _mm_setzero_pd();
1236             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1237             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1238             ewtabFn          = _mm_setzero_pd();
1239             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1240             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1241             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1242             velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
1243             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1244
1245             d                = _mm_sub_pd(r11,rswitch);
1246             d                = _mm_max_pd(d,_mm_setzero_pd());
1247             d2               = _mm_mul_pd(d,d);
1248             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1249
1250             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1251
1252             /* Evaluate switch function */
1253             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1254             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
1255             velec            = _mm_mul_pd(velec,sw);
1256             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
1257
1258             /* Update potential sum for this i atom from the interaction with this j atom. */
1259             velec            = _mm_and_pd(velec,cutoff_mask);
1260             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1261             velecsum         = _mm_add_pd(velecsum,velec);
1262
1263             fscal            = felec;
1264
1265             fscal            = _mm_and_pd(fscal,cutoff_mask);
1266
1267             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1268
1269             /* Calculate temporary vectorial force */
1270             tx               = _mm_mul_pd(fscal,dx11);
1271             ty               = _mm_mul_pd(fscal,dy11);
1272             tz               = _mm_mul_pd(fscal,dz11);
1273
1274             /* Update vectorial force */
1275             fix1             = _mm_add_pd(fix1,tx);
1276             fiy1             = _mm_add_pd(fiy1,ty);
1277             fiz1             = _mm_add_pd(fiz1,tz);
1278
1279             fjx1             = _mm_add_pd(fjx1,tx);
1280             fjy1             = _mm_add_pd(fjy1,ty);
1281             fjz1             = _mm_add_pd(fjz1,tz);
1282
1283             }
1284
1285             /**************************
1286              * CALCULATE INTERACTIONS *
1287              **************************/
1288
1289             if (gmx_mm_any_lt(rsq12,rcutoff2))
1290             {
1291
1292             r12              = _mm_mul_pd(rsq12,rinv12);
1293
1294             /* EWALD ELECTROSTATICS */
1295
1296             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1297             ewrt             = _mm_mul_pd(r12,ewtabscale);
1298             ewitab           = _mm_cvttpd_epi32(ewrt);
1299             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1300             ewitab           = _mm_slli_epi32(ewitab,2);
1301             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1302             ewtabD           = _mm_setzero_pd();
1303             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1304             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1305             ewtabFn          = _mm_setzero_pd();
1306             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1307             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1308             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1309             velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
1310             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1311
1312             d                = _mm_sub_pd(r12,rswitch);
1313             d                = _mm_max_pd(d,_mm_setzero_pd());
1314             d2               = _mm_mul_pd(d,d);
1315             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1316
1317             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1318
1319             /* Evaluate switch function */
1320             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1321             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
1322             velec            = _mm_mul_pd(velec,sw);
1323             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
1324
1325             /* Update potential sum for this i atom from the interaction with this j atom. */
1326             velec            = _mm_and_pd(velec,cutoff_mask);
1327             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1328             velecsum         = _mm_add_pd(velecsum,velec);
1329
1330             fscal            = felec;
1331
1332             fscal            = _mm_and_pd(fscal,cutoff_mask);
1333
1334             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1335
1336             /* Calculate temporary vectorial force */
1337             tx               = _mm_mul_pd(fscal,dx12);
1338             ty               = _mm_mul_pd(fscal,dy12);
1339             tz               = _mm_mul_pd(fscal,dz12);
1340
1341             /* Update vectorial force */
1342             fix1             = _mm_add_pd(fix1,tx);
1343             fiy1             = _mm_add_pd(fiy1,ty);
1344             fiz1             = _mm_add_pd(fiz1,tz);
1345
1346             fjx2             = _mm_add_pd(fjx2,tx);
1347             fjy2             = _mm_add_pd(fjy2,ty);
1348             fjz2             = _mm_add_pd(fjz2,tz);
1349
1350             }
1351
1352             /**************************
1353              * CALCULATE INTERACTIONS *
1354              **************************/
1355
1356             if (gmx_mm_any_lt(rsq20,rcutoff2))
1357             {
1358
1359             r20              = _mm_mul_pd(rsq20,rinv20);
1360
1361             /* EWALD ELECTROSTATICS */
1362
1363             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1364             ewrt             = _mm_mul_pd(r20,ewtabscale);
1365             ewitab           = _mm_cvttpd_epi32(ewrt);
1366             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1367             ewitab           = _mm_slli_epi32(ewitab,2);
1368             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1369             ewtabD           = _mm_setzero_pd();
1370             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1371             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1372             ewtabFn          = _mm_setzero_pd();
1373             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1374             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1375             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1376             velec            = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
1377             felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1378
1379             d                = _mm_sub_pd(r20,rswitch);
1380             d                = _mm_max_pd(d,_mm_setzero_pd());
1381             d2               = _mm_mul_pd(d,d);
1382             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1383
1384             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1385
1386             /* Evaluate switch function */
1387             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1388             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
1389             velec            = _mm_mul_pd(velec,sw);
1390             cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
1391
1392             /* Update potential sum for this i atom from the interaction with this j atom. */
1393             velec            = _mm_and_pd(velec,cutoff_mask);
1394             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1395             velecsum         = _mm_add_pd(velecsum,velec);
1396
1397             fscal            = felec;
1398
1399             fscal            = _mm_and_pd(fscal,cutoff_mask);
1400
1401             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1402
1403             /* Calculate temporary vectorial force */
1404             tx               = _mm_mul_pd(fscal,dx20);
1405             ty               = _mm_mul_pd(fscal,dy20);
1406             tz               = _mm_mul_pd(fscal,dz20);
1407
1408             /* Update vectorial force */
1409             fix2             = _mm_add_pd(fix2,tx);
1410             fiy2             = _mm_add_pd(fiy2,ty);
1411             fiz2             = _mm_add_pd(fiz2,tz);
1412
1413             fjx0             = _mm_add_pd(fjx0,tx);
1414             fjy0             = _mm_add_pd(fjy0,ty);
1415             fjz0             = _mm_add_pd(fjz0,tz);
1416
1417             }
1418
1419             /**************************
1420              * CALCULATE INTERACTIONS *
1421              **************************/
1422
1423             if (gmx_mm_any_lt(rsq21,rcutoff2))
1424             {
1425
1426             r21              = _mm_mul_pd(rsq21,rinv21);
1427
1428             /* EWALD ELECTROSTATICS */
1429
1430             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1431             ewrt             = _mm_mul_pd(r21,ewtabscale);
1432             ewitab           = _mm_cvttpd_epi32(ewrt);
1433             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1434             ewitab           = _mm_slli_epi32(ewitab,2);
1435             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1436             ewtabD           = _mm_setzero_pd();
1437             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1438             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1439             ewtabFn          = _mm_setzero_pd();
1440             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1441             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1442             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1443             velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
1444             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1445
1446             d                = _mm_sub_pd(r21,rswitch);
1447             d                = _mm_max_pd(d,_mm_setzero_pd());
1448             d2               = _mm_mul_pd(d,d);
1449             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1450
1451             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1452
1453             /* Evaluate switch function */
1454             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1455             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
1456             velec            = _mm_mul_pd(velec,sw);
1457             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
1458
1459             /* Update potential sum for this i atom from the interaction with this j atom. */
1460             velec            = _mm_and_pd(velec,cutoff_mask);
1461             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1462             velecsum         = _mm_add_pd(velecsum,velec);
1463
1464             fscal            = felec;
1465
1466             fscal            = _mm_and_pd(fscal,cutoff_mask);
1467
1468             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1469
1470             /* Calculate temporary vectorial force */
1471             tx               = _mm_mul_pd(fscal,dx21);
1472             ty               = _mm_mul_pd(fscal,dy21);
1473             tz               = _mm_mul_pd(fscal,dz21);
1474
1475             /* Update vectorial force */
1476             fix2             = _mm_add_pd(fix2,tx);
1477             fiy2             = _mm_add_pd(fiy2,ty);
1478             fiz2             = _mm_add_pd(fiz2,tz);
1479
1480             fjx1             = _mm_add_pd(fjx1,tx);
1481             fjy1             = _mm_add_pd(fjy1,ty);
1482             fjz1             = _mm_add_pd(fjz1,tz);
1483
1484             }
1485
1486             /**************************
1487              * CALCULATE INTERACTIONS *
1488              **************************/
1489
1490             if (gmx_mm_any_lt(rsq22,rcutoff2))
1491             {
1492
1493             r22              = _mm_mul_pd(rsq22,rinv22);
1494
1495             /* EWALD ELECTROSTATICS */
1496
1497             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1498             ewrt             = _mm_mul_pd(r22,ewtabscale);
1499             ewitab           = _mm_cvttpd_epi32(ewrt);
1500             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1501             ewitab           = _mm_slli_epi32(ewitab,2);
1502             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1503             ewtabD           = _mm_setzero_pd();
1504             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1505             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1506             ewtabFn          = _mm_setzero_pd();
1507             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1508             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1509             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1510             velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
1511             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1512
1513             d                = _mm_sub_pd(r22,rswitch);
1514             d                = _mm_max_pd(d,_mm_setzero_pd());
1515             d2               = _mm_mul_pd(d,d);
1516             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1517
1518             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1519
1520             /* Evaluate switch function */
1521             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1522             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
1523             velec            = _mm_mul_pd(velec,sw);
1524             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
1525
1526             /* Update potential sum for this i atom from the interaction with this j atom. */
1527             velec            = _mm_and_pd(velec,cutoff_mask);
1528             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1529             velecsum         = _mm_add_pd(velecsum,velec);
1530
1531             fscal            = felec;
1532
1533             fscal            = _mm_and_pd(fscal,cutoff_mask);
1534
1535             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1536
1537             /* Calculate temporary vectorial force */
1538             tx               = _mm_mul_pd(fscal,dx22);
1539             ty               = _mm_mul_pd(fscal,dy22);
1540             tz               = _mm_mul_pd(fscal,dz22);
1541
1542             /* Update vectorial force */
1543             fix2             = _mm_add_pd(fix2,tx);
1544             fiy2             = _mm_add_pd(fiy2,ty);
1545             fiz2             = _mm_add_pd(fiz2,tz);
1546
1547             fjx2             = _mm_add_pd(fjx2,tx);
1548             fjy2             = _mm_add_pd(fjy2,ty);
1549             fjz2             = _mm_add_pd(fjz2,tz);
1550
1551             }
1552
1553             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1554
1555             /* Inner loop uses 585 flops */
1556         }
1557
1558         /* End of innermost loop */
1559
1560         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1561                                               f+i_coord_offset,fshift+i_shift_offset);
1562
1563         ggid                        = gid[iidx];
1564         /* Update potential energies */
1565         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1566
1567         /* Increment number of inner iterations */
1568         inneriter                  += j_index_end - j_index_start;
1569
1570         /* Outer loop uses 19 flops */
1571     }
1572
1573     /* Increment number of outer iterations */
1574     outeriter        += nri;
1575
1576     /* Update outer/inner flops */
1577
1578     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*585);
1579 }
1580 /*
1581  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sse2_double
1582  * Electrostatics interaction: Ewald
1583  * VdW interaction:            None
1584  * Geometry:                   Water3-Water3
1585  * Calculate force/pot:        Force
1586  */
1587 void
1588 nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sse2_double
1589                     (t_nblist                    * gmx_restrict       nlist,
1590                      rvec                        * gmx_restrict          xx,
1591                      rvec                        * gmx_restrict          ff,
1592                      struct t_forcerec           * gmx_restrict          fr,
1593                      t_mdatoms                   * gmx_restrict     mdatoms,
1594                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1595                      t_nrnb                      * gmx_restrict        nrnb)
1596 {
1597     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1598      * just 0 for non-waters.
1599      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1600      * jnr indices corresponding to data put in the four positions in the SIMD register.
1601      */
1602     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1603     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1604     int              jnrA,jnrB;
1605     int              j_coord_offsetA,j_coord_offsetB;
1606     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1607     real             rcutoff_scalar;
1608     real             *shiftvec,*fshift,*x,*f;
1609     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1610     int              vdwioffset0;
1611     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1612     int              vdwioffset1;
1613     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1614     int              vdwioffset2;
1615     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1616     int              vdwjidx0A,vdwjidx0B;
1617     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1618     int              vdwjidx1A,vdwjidx1B;
1619     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1620     int              vdwjidx2A,vdwjidx2B;
1621     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1622     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1623     __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1624     __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1625     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1626     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1627     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1628     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1629     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1630     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1631     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
1632     real             *charge;
1633     __m128i          ewitab;
1634     __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1635     real             *ewtab;
1636     __m128d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1637     real             rswitch_scalar,d_scalar;
1638     __m128d          dummy_mask,cutoff_mask;
1639     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1640     __m128d          one     = _mm_set1_pd(1.0);
1641     __m128d          two     = _mm_set1_pd(2.0);
1642     x                = xx[0];
1643     f                = ff[0];
1644
1645     nri              = nlist->nri;
1646     iinr             = nlist->iinr;
1647     jindex           = nlist->jindex;
1648     jjnr             = nlist->jjnr;
1649     shiftidx         = nlist->shift;
1650     gid              = nlist->gid;
1651     shiftvec         = fr->shift_vec[0];
1652     fshift           = fr->fshift[0];
1653     facel            = _mm_set1_pd(fr->ic->epsfac);
1654     charge           = mdatoms->chargeA;
1655
1656     sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
1657     ewtab            = fr->ic->tabq_coul_FDV0;
1658     ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
1659     ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1660
1661     /* Setup water-specific parameters */
1662     inr              = nlist->iinr[0];
1663     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1664     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1665     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1666
1667     jq0              = _mm_set1_pd(charge[inr+0]);
1668     jq1              = _mm_set1_pd(charge[inr+1]);
1669     jq2              = _mm_set1_pd(charge[inr+2]);
1670     qq00             = _mm_mul_pd(iq0,jq0);
1671     qq01             = _mm_mul_pd(iq0,jq1);
1672     qq02             = _mm_mul_pd(iq0,jq2);
1673     qq10             = _mm_mul_pd(iq1,jq0);
1674     qq11             = _mm_mul_pd(iq1,jq1);
1675     qq12             = _mm_mul_pd(iq1,jq2);
1676     qq20             = _mm_mul_pd(iq2,jq0);
1677     qq21             = _mm_mul_pd(iq2,jq1);
1678     qq22             = _mm_mul_pd(iq2,jq2);
1679
1680     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1681     rcutoff_scalar   = fr->ic->rcoulomb;
1682     rcutoff          = _mm_set1_pd(rcutoff_scalar);
1683     rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
1684
1685     rswitch_scalar   = fr->ic->rcoulomb_switch;
1686     rswitch          = _mm_set1_pd(rswitch_scalar);
1687     /* Setup switch parameters */
1688     d_scalar         = rcutoff_scalar-rswitch_scalar;
1689     d                = _mm_set1_pd(d_scalar);
1690     swV3             = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1691     swV4             = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1692     swV5             = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1693     swF2             = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1694     swF3             = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1695     swF4             = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1696
1697     /* Avoid stupid compiler warnings */
1698     jnrA = jnrB = 0;
1699     j_coord_offsetA = 0;
1700     j_coord_offsetB = 0;
1701
1702     outeriter        = 0;
1703     inneriter        = 0;
1704
1705     /* Start outer loop over neighborlists */
1706     for(iidx=0; iidx<nri; iidx++)
1707     {
1708         /* Load shift vector for this list */
1709         i_shift_offset   = DIM*shiftidx[iidx];
1710
1711         /* Load limits for loop over neighbors */
1712         j_index_start    = jindex[iidx];
1713         j_index_end      = jindex[iidx+1];
1714
1715         /* Get outer coordinate index */
1716         inr              = iinr[iidx];
1717         i_coord_offset   = DIM*inr;
1718
1719         /* Load i particle coords and add shift vector */
1720         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1721                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1722
1723         fix0             = _mm_setzero_pd();
1724         fiy0             = _mm_setzero_pd();
1725         fiz0             = _mm_setzero_pd();
1726         fix1             = _mm_setzero_pd();
1727         fiy1             = _mm_setzero_pd();
1728         fiz1             = _mm_setzero_pd();
1729         fix2             = _mm_setzero_pd();
1730         fiy2             = _mm_setzero_pd();
1731         fiz2             = _mm_setzero_pd();
1732
1733         /* Start inner kernel loop */
1734         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1735         {
1736
1737             /* Get j neighbor index, and coordinate index */
1738             jnrA             = jjnr[jidx];
1739             jnrB             = jjnr[jidx+1];
1740             j_coord_offsetA  = DIM*jnrA;
1741             j_coord_offsetB  = DIM*jnrB;
1742
1743             /* load j atom coordinates */
1744             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1745                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1746
1747             /* Calculate displacement vector */
1748             dx00             = _mm_sub_pd(ix0,jx0);
1749             dy00             = _mm_sub_pd(iy0,jy0);
1750             dz00             = _mm_sub_pd(iz0,jz0);
1751             dx01             = _mm_sub_pd(ix0,jx1);
1752             dy01             = _mm_sub_pd(iy0,jy1);
1753             dz01             = _mm_sub_pd(iz0,jz1);
1754             dx02             = _mm_sub_pd(ix0,jx2);
1755             dy02             = _mm_sub_pd(iy0,jy2);
1756             dz02             = _mm_sub_pd(iz0,jz2);
1757             dx10             = _mm_sub_pd(ix1,jx0);
1758             dy10             = _mm_sub_pd(iy1,jy0);
1759             dz10             = _mm_sub_pd(iz1,jz0);
1760             dx11             = _mm_sub_pd(ix1,jx1);
1761             dy11             = _mm_sub_pd(iy1,jy1);
1762             dz11             = _mm_sub_pd(iz1,jz1);
1763             dx12             = _mm_sub_pd(ix1,jx2);
1764             dy12             = _mm_sub_pd(iy1,jy2);
1765             dz12             = _mm_sub_pd(iz1,jz2);
1766             dx20             = _mm_sub_pd(ix2,jx0);
1767             dy20             = _mm_sub_pd(iy2,jy0);
1768             dz20             = _mm_sub_pd(iz2,jz0);
1769             dx21             = _mm_sub_pd(ix2,jx1);
1770             dy21             = _mm_sub_pd(iy2,jy1);
1771             dz21             = _mm_sub_pd(iz2,jz1);
1772             dx22             = _mm_sub_pd(ix2,jx2);
1773             dy22             = _mm_sub_pd(iy2,jy2);
1774             dz22             = _mm_sub_pd(iz2,jz2);
1775
1776             /* Calculate squared distance and things based on it */
1777             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1778             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1779             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1780             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1781             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1782             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1783             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1784             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1785             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1786
1787             rinv00           = sse2_invsqrt_d(rsq00);
1788             rinv01           = sse2_invsqrt_d(rsq01);
1789             rinv02           = sse2_invsqrt_d(rsq02);
1790             rinv10           = sse2_invsqrt_d(rsq10);
1791             rinv11           = sse2_invsqrt_d(rsq11);
1792             rinv12           = sse2_invsqrt_d(rsq12);
1793             rinv20           = sse2_invsqrt_d(rsq20);
1794             rinv21           = sse2_invsqrt_d(rsq21);
1795             rinv22           = sse2_invsqrt_d(rsq22);
1796
1797             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
1798             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
1799             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
1800             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
1801             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1802             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1803             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
1804             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1805             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1806
1807             fjx0             = _mm_setzero_pd();
1808             fjy0             = _mm_setzero_pd();
1809             fjz0             = _mm_setzero_pd();
1810             fjx1             = _mm_setzero_pd();
1811             fjy1             = _mm_setzero_pd();
1812             fjz1             = _mm_setzero_pd();
1813             fjx2             = _mm_setzero_pd();
1814             fjy2             = _mm_setzero_pd();
1815             fjz2             = _mm_setzero_pd();
1816
1817             /**************************
1818              * CALCULATE INTERACTIONS *
1819              **************************/
1820
1821             if (gmx_mm_any_lt(rsq00,rcutoff2))
1822             {
1823
1824             r00              = _mm_mul_pd(rsq00,rinv00);
1825
1826             /* EWALD ELECTROSTATICS */
1827
1828             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1829             ewrt             = _mm_mul_pd(r00,ewtabscale);
1830             ewitab           = _mm_cvttpd_epi32(ewrt);
1831             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1832             ewitab           = _mm_slli_epi32(ewitab,2);
1833             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1834             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1835             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1836             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1837             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
1838             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1839             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1840             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1841             velec            = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
1842             felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
1843
1844             d                = _mm_sub_pd(r00,rswitch);
1845             d                = _mm_max_pd(d,_mm_setzero_pd());
1846             d2               = _mm_mul_pd(d,d);
1847             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1848
1849             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1850
1851             /* Evaluate switch function */
1852             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1853             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(velec,dsw)) );
1854             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
1855
1856             fscal            = felec;
1857
1858             fscal            = _mm_and_pd(fscal,cutoff_mask);
1859
1860             /* Calculate temporary vectorial force */
1861             tx               = _mm_mul_pd(fscal,dx00);
1862             ty               = _mm_mul_pd(fscal,dy00);
1863             tz               = _mm_mul_pd(fscal,dz00);
1864
1865             /* Update vectorial force */
1866             fix0             = _mm_add_pd(fix0,tx);
1867             fiy0             = _mm_add_pd(fiy0,ty);
1868             fiz0             = _mm_add_pd(fiz0,tz);
1869
1870             fjx0             = _mm_add_pd(fjx0,tx);
1871             fjy0             = _mm_add_pd(fjy0,ty);
1872             fjz0             = _mm_add_pd(fjz0,tz);
1873
1874             }
1875
1876             /**************************
1877              * CALCULATE INTERACTIONS *
1878              **************************/
1879
1880             if (gmx_mm_any_lt(rsq01,rcutoff2))
1881             {
1882
1883             r01              = _mm_mul_pd(rsq01,rinv01);
1884
1885             /* EWALD ELECTROSTATICS */
1886
1887             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1888             ewrt             = _mm_mul_pd(r01,ewtabscale);
1889             ewitab           = _mm_cvttpd_epi32(ewrt);
1890             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1891             ewitab           = _mm_slli_epi32(ewitab,2);
1892             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1893             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1894             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1895             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1896             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
1897             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1898             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1899             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1900             velec            = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
1901             felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
1902
1903             d                = _mm_sub_pd(r01,rswitch);
1904             d                = _mm_max_pd(d,_mm_setzero_pd());
1905             d2               = _mm_mul_pd(d,d);
1906             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1907
1908             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1909
1910             /* Evaluate switch function */
1911             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1912             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv01,_mm_mul_pd(velec,dsw)) );
1913             cutoff_mask      = _mm_cmplt_pd(rsq01,rcutoff2);
1914
1915             fscal            = felec;
1916
1917             fscal            = _mm_and_pd(fscal,cutoff_mask);
1918
1919             /* Calculate temporary vectorial force */
1920             tx               = _mm_mul_pd(fscal,dx01);
1921             ty               = _mm_mul_pd(fscal,dy01);
1922             tz               = _mm_mul_pd(fscal,dz01);
1923
1924             /* Update vectorial force */
1925             fix0             = _mm_add_pd(fix0,tx);
1926             fiy0             = _mm_add_pd(fiy0,ty);
1927             fiz0             = _mm_add_pd(fiz0,tz);
1928
1929             fjx1             = _mm_add_pd(fjx1,tx);
1930             fjy1             = _mm_add_pd(fjy1,ty);
1931             fjz1             = _mm_add_pd(fjz1,tz);
1932
1933             }
1934
1935             /**************************
1936              * CALCULATE INTERACTIONS *
1937              **************************/
1938
1939             if (gmx_mm_any_lt(rsq02,rcutoff2))
1940             {
1941
1942             r02              = _mm_mul_pd(rsq02,rinv02);
1943
1944             /* EWALD ELECTROSTATICS */
1945
1946             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1947             ewrt             = _mm_mul_pd(r02,ewtabscale);
1948             ewitab           = _mm_cvttpd_epi32(ewrt);
1949             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1950             ewitab           = _mm_slli_epi32(ewitab,2);
1951             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1952             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1953             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1954             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1955             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
1956             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1957             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1958             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1959             velec            = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
1960             felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
1961
1962             d                = _mm_sub_pd(r02,rswitch);
1963             d                = _mm_max_pd(d,_mm_setzero_pd());
1964             d2               = _mm_mul_pd(d,d);
1965             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1966
1967             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1968
1969             /* Evaluate switch function */
1970             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1971             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv02,_mm_mul_pd(velec,dsw)) );
1972             cutoff_mask      = _mm_cmplt_pd(rsq02,rcutoff2);
1973
1974             fscal            = felec;
1975
1976             fscal            = _mm_and_pd(fscal,cutoff_mask);
1977
1978             /* Calculate temporary vectorial force */
1979             tx               = _mm_mul_pd(fscal,dx02);
1980             ty               = _mm_mul_pd(fscal,dy02);
1981             tz               = _mm_mul_pd(fscal,dz02);
1982
1983             /* Update vectorial force */
1984             fix0             = _mm_add_pd(fix0,tx);
1985             fiy0             = _mm_add_pd(fiy0,ty);
1986             fiz0             = _mm_add_pd(fiz0,tz);
1987
1988             fjx2             = _mm_add_pd(fjx2,tx);
1989             fjy2             = _mm_add_pd(fjy2,ty);
1990             fjz2             = _mm_add_pd(fjz2,tz);
1991
1992             }
1993
1994             /**************************
1995              * CALCULATE INTERACTIONS *
1996              **************************/
1997
1998             if (gmx_mm_any_lt(rsq10,rcutoff2))
1999             {
2000
2001             r10              = _mm_mul_pd(rsq10,rinv10);
2002
2003             /* EWALD ELECTROSTATICS */
2004
2005             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2006             ewrt             = _mm_mul_pd(r10,ewtabscale);
2007             ewitab           = _mm_cvttpd_epi32(ewrt);
2008             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2009             ewitab           = _mm_slli_epi32(ewitab,2);
2010             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2011             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2012             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2013             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2014             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2015             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2016             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2017             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2018             velec            = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
2019             felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
2020
2021             d                = _mm_sub_pd(r10,rswitch);
2022             d                = _mm_max_pd(d,_mm_setzero_pd());
2023             d2               = _mm_mul_pd(d,d);
2024             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2025
2026             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2027
2028             /* Evaluate switch function */
2029             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2030             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
2031             cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
2032
2033             fscal            = felec;
2034
2035             fscal            = _mm_and_pd(fscal,cutoff_mask);
2036
2037             /* Calculate temporary vectorial force */
2038             tx               = _mm_mul_pd(fscal,dx10);
2039             ty               = _mm_mul_pd(fscal,dy10);
2040             tz               = _mm_mul_pd(fscal,dz10);
2041
2042             /* Update vectorial force */
2043             fix1             = _mm_add_pd(fix1,tx);
2044             fiy1             = _mm_add_pd(fiy1,ty);
2045             fiz1             = _mm_add_pd(fiz1,tz);
2046
2047             fjx0             = _mm_add_pd(fjx0,tx);
2048             fjy0             = _mm_add_pd(fjy0,ty);
2049             fjz0             = _mm_add_pd(fjz0,tz);
2050
2051             }
2052
2053             /**************************
2054              * CALCULATE INTERACTIONS *
2055              **************************/
2056
2057             if (gmx_mm_any_lt(rsq11,rcutoff2))
2058             {
2059
2060             r11              = _mm_mul_pd(rsq11,rinv11);
2061
2062             /* EWALD ELECTROSTATICS */
2063
2064             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2065             ewrt             = _mm_mul_pd(r11,ewtabscale);
2066             ewitab           = _mm_cvttpd_epi32(ewrt);
2067             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2068             ewitab           = _mm_slli_epi32(ewitab,2);
2069             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2070             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2071             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2072             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2073             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2074             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2075             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2076             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2077             velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
2078             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2079
2080             d                = _mm_sub_pd(r11,rswitch);
2081             d                = _mm_max_pd(d,_mm_setzero_pd());
2082             d2               = _mm_mul_pd(d,d);
2083             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2084
2085             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2086
2087             /* Evaluate switch function */
2088             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2089             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
2090             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
2091
2092             fscal            = felec;
2093
2094             fscal            = _mm_and_pd(fscal,cutoff_mask);
2095
2096             /* Calculate temporary vectorial force */
2097             tx               = _mm_mul_pd(fscal,dx11);
2098             ty               = _mm_mul_pd(fscal,dy11);
2099             tz               = _mm_mul_pd(fscal,dz11);
2100
2101             /* Update vectorial force */
2102             fix1             = _mm_add_pd(fix1,tx);
2103             fiy1             = _mm_add_pd(fiy1,ty);
2104             fiz1             = _mm_add_pd(fiz1,tz);
2105
2106             fjx1             = _mm_add_pd(fjx1,tx);
2107             fjy1             = _mm_add_pd(fjy1,ty);
2108             fjz1             = _mm_add_pd(fjz1,tz);
2109
2110             }
2111
2112             /**************************
2113              * CALCULATE INTERACTIONS *
2114              **************************/
2115
2116             if (gmx_mm_any_lt(rsq12,rcutoff2))
2117             {
2118
2119             r12              = _mm_mul_pd(rsq12,rinv12);
2120
2121             /* EWALD ELECTROSTATICS */
2122
2123             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2124             ewrt             = _mm_mul_pd(r12,ewtabscale);
2125             ewitab           = _mm_cvttpd_epi32(ewrt);
2126             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2127             ewitab           = _mm_slli_epi32(ewitab,2);
2128             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2129             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2130             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2131             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2132             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2133             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2134             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2135             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2136             velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
2137             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2138
2139             d                = _mm_sub_pd(r12,rswitch);
2140             d                = _mm_max_pd(d,_mm_setzero_pd());
2141             d2               = _mm_mul_pd(d,d);
2142             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2143
2144             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2145
2146             /* Evaluate switch function */
2147             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2148             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
2149             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
2150
2151             fscal            = felec;
2152
2153             fscal            = _mm_and_pd(fscal,cutoff_mask);
2154
2155             /* Calculate temporary vectorial force */
2156             tx               = _mm_mul_pd(fscal,dx12);
2157             ty               = _mm_mul_pd(fscal,dy12);
2158             tz               = _mm_mul_pd(fscal,dz12);
2159
2160             /* Update vectorial force */
2161             fix1             = _mm_add_pd(fix1,tx);
2162             fiy1             = _mm_add_pd(fiy1,ty);
2163             fiz1             = _mm_add_pd(fiz1,tz);
2164
2165             fjx2             = _mm_add_pd(fjx2,tx);
2166             fjy2             = _mm_add_pd(fjy2,ty);
2167             fjz2             = _mm_add_pd(fjz2,tz);
2168
2169             }
2170
2171             /**************************
2172              * CALCULATE INTERACTIONS *
2173              **************************/
2174
2175             if (gmx_mm_any_lt(rsq20,rcutoff2))
2176             {
2177
2178             r20              = _mm_mul_pd(rsq20,rinv20);
2179
2180             /* EWALD ELECTROSTATICS */
2181
2182             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2183             ewrt             = _mm_mul_pd(r20,ewtabscale);
2184             ewitab           = _mm_cvttpd_epi32(ewrt);
2185             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2186             ewitab           = _mm_slli_epi32(ewitab,2);
2187             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2188             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2189             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2190             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2191             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2192             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2193             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2194             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2195             velec            = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
2196             felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
2197
2198             d                = _mm_sub_pd(r20,rswitch);
2199             d                = _mm_max_pd(d,_mm_setzero_pd());
2200             d2               = _mm_mul_pd(d,d);
2201             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2202
2203             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2204
2205             /* Evaluate switch function */
2206             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2207             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
2208             cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
2209
2210             fscal            = felec;
2211
2212             fscal            = _mm_and_pd(fscal,cutoff_mask);
2213
2214             /* Calculate temporary vectorial force */
2215             tx               = _mm_mul_pd(fscal,dx20);
2216             ty               = _mm_mul_pd(fscal,dy20);
2217             tz               = _mm_mul_pd(fscal,dz20);
2218
2219             /* Update vectorial force */
2220             fix2             = _mm_add_pd(fix2,tx);
2221             fiy2             = _mm_add_pd(fiy2,ty);
2222             fiz2             = _mm_add_pd(fiz2,tz);
2223
2224             fjx0             = _mm_add_pd(fjx0,tx);
2225             fjy0             = _mm_add_pd(fjy0,ty);
2226             fjz0             = _mm_add_pd(fjz0,tz);
2227
2228             }
2229
2230             /**************************
2231              * CALCULATE INTERACTIONS *
2232              **************************/
2233
2234             if (gmx_mm_any_lt(rsq21,rcutoff2))
2235             {
2236
2237             r21              = _mm_mul_pd(rsq21,rinv21);
2238
2239             /* EWALD ELECTROSTATICS */
2240
2241             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2242             ewrt             = _mm_mul_pd(r21,ewtabscale);
2243             ewitab           = _mm_cvttpd_epi32(ewrt);
2244             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2245             ewitab           = _mm_slli_epi32(ewitab,2);
2246             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2247             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2248             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2249             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2250             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2251             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2252             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2253             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2254             velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2255             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2256
2257             d                = _mm_sub_pd(r21,rswitch);
2258             d                = _mm_max_pd(d,_mm_setzero_pd());
2259             d2               = _mm_mul_pd(d,d);
2260             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2261
2262             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2263
2264             /* Evaluate switch function */
2265             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2266             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2267             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
2268
2269             fscal            = felec;
2270
2271             fscal            = _mm_and_pd(fscal,cutoff_mask);
2272
2273             /* Calculate temporary vectorial force */
2274             tx               = _mm_mul_pd(fscal,dx21);
2275             ty               = _mm_mul_pd(fscal,dy21);
2276             tz               = _mm_mul_pd(fscal,dz21);
2277
2278             /* Update vectorial force */
2279             fix2             = _mm_add_pd(fix2,tx);
2280             fiy2             = _mm_add_pd(fiy2,ty);
2281             fiz2             = _mm_add_pd(fiz2,tz);
2282
2283             fjx1             = _mm_add_pd(fjx1,tx);
2284             fjy1             = _mm_add_pd(fjy1,ty);
2285             fjz1             = _mm_add_pd(fjz1,tz);
2286
2287             }
2288
2289             /**************************
2290              * CALCULATE INTERACTIONS *
2291              **************************/
2292
2293             if (gmx_mm_any_lt(rsq22,rcutoff2))
2294             {
2295
2296             r22              = _mm_mul_pd(rsq22,rinv22);
2297
2298             /* EWALD ELECTROSTATICS */
2299
2300             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2301             ewrt             = _mm_mul_pd(r22,ewtabscale);
2302             ewitab           = _mm_cvttpd_epi32(ewrt);
2303             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2304             ewitab           = _mm_slli_epi32(ewitab,2);
2305             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2306             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2307             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2308             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2309             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2310             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2311             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2312             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2313             velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
2314             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2315
2316             d                = _mm_sub_pd(r22,rswitch);
2317             d                = _mm_max_pd(d,_mm_setzero_pd());
2318             d2               = _mm_mul_pd(d,d);
2319             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2320
2321             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2322
2323             /* Evaluate switch function */
2324             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2325             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
2326             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
2327
2328             fscal            = felec;
2329
2330             fscal            = _mm_and_pd(fscal,cutoff_mask);
2331
2332             /* Calculate temporary vectorial force */
2333             tx               = _mm_mul_pd(fscal,dx22);
2334             ty               = _mm_mul_pd(fscal,dy22);
2335             tz               = _mm_mul_pd(fscal,dz22);
2336
2337             /* Update vectorial force */
2338             fix2             = _mm_add_pd(fix2,tx);
2339             fiy2             = _mm_add_pd(fiy2,ty);
2340             fiz2             = _mm_add_pd(fiz2,tz);
2341
2342             fjx2             = _mm_add_pd(fjx2,tx);
2343             fjy2             = _mm_add_pd(fjy2,ty);
2344             fjz2             = _mm_add_pd(fjz2,tz);
2345
2346             }
2347
2348             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2349
2350             /* Inner loop uses 558 flops */
2351         }
2352
2353         if(jidx<j_index_end)
2354         {
2355
2356             jnrA             = jjnr[jidx];
2357             j_coord_offsetA  = DIM*jnrA;
2358
2359             /* load j atom coordinates */
2360             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
2361                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2362
2363             /* Calculate displacement vector */
2364             dx00             = _mm_sub_pd(ix0,jx0);
2365             dy00             = _mm_sub_pd(iy0,jy0);
2366             dz00             = _mm_sub_pd(iz0,jz0);
2367             dx01             = _mm_sub_pd(ix0,jx1);
2368             dy01             = _mm_sub_pd(iy0,jy1);
2369             dz01             = _mm_sub_pd(iz0,jz1);
2370             dx02             = _mm_sub_pd(ix0,jx2);
2371             dy02             = _mm_sub_pd(iy0,jy2);
2372             dz02             = _mm_sub_pd(iz0,jz2);
2373             dx10             = _mm_sub_pd(ix1,jx0);
2374             dy10             = _mm_sub_pd(iy1,jy0);
2375             dz10             = _mm_sub_pd(iz1,jz0);
2376             dx11             = _mm_sub_pd(ix1,jx1);
2377             dy11             = _mm_sub_pd(iy1,jy1);
2378             dz11             = _mm_sub_pd(iz1,jz1);
2379             dx12             = _mm_sub_pd(ix1,jx2);
2380             dy12             = _mm_sub_pd(iy1,jy2);
2381             dz12             = _mm_sub_pd(iz1,jz2);
2382             dx20             = _mm_sub_pd(ix2,jx0);
2383             dy20             = _mm_sub_pd(iy2,jy0);
2384             dz20             = _mm_sub_pd(iz2,jz0);
2385             dx21             = _mm_sub_pd(ix2,jx1);
2386             dy21             = _mm_sub_pd(iy2,jy1);
2387             dz21             = _mm_sub_pd(iz2,jz1);
2388             dx22             = _mm_sub_pd(ix2,jx2);
2389             dy22             = _mm_sub_pd(iy2,jy2);
2390             dz22             = _mm_sub_pd(iz2,jz2);
2391
2392             /* Calculate squared distance and things based on it */
2393             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
2394             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
2395             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
2396             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
2397             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
2398             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
2399             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
2400             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
2401             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
2402
2403             rinv00           = sse2_invsqrt_d(rsq00);
2404             rinv01           = sse2_invsqrt_d(rsq01);
2405             rinv02           = sse2_invsqrt_d(rsq02);
2406             rinv10           = sse2_invsqrt_d(rsq10);
2407             rinv11           = sse2_invsqrt_d(rsq11);
2408             rinv12           = sse2_invsqrt_d(rsq12);
2409             rinv20           = sse2_invsqrt_d(rsq20);
2410             rinv21           = sse2_invsqrt_d(rsq21);
2411             rinv22           = sse2_invsqrt_d(rsq22);
2412
2413             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
2414             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
2415             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
2416             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
2417             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
2418             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
2419             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
2420             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
2421             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
2422
2423             fjx0             = _mm_setzero_pd();
2424             fjy0             = _mm_setzero_pd();
2425             fjz0             = _mm_setzero_pd();
2426             fjx1             = _mm_setzero_pd();
2427             fjy1             = _mm_setzero_pd();
2428             fjz1             = _mm_setzero_pd();
2429             fjx2             = _mm_setzero_pd();
2430             fjy2             = _mm_setzero_pd();
2431             fjz2             = _mm_setzero_pd();
2432
2433             /**************************
2434              * CALCULATE INTERACTIONS *
2435              **************************/
2436
2437             if (gmx_mm_any_lt(rsq00,rcutoff2))
2438             {
2439
2440             r00              = _mm_mul_pd(rsq00,rinv00);
2441
2442             /* EWALD ELECTROSTATICS */
2443
2444             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2445             ewrt             = _mm_mul_pd(r00,ewtabscale);
2446             ewitab           = _mm_cvttpd_epi32(ewrt);
2447             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2448             ewitab           = _mm_slli_epi32(ewitab,2);
2449             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2450             ewtabD           = _mm_setzero_pd();
2451             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2452             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2453             ewtabFn          = _mm_setzero_pd();
2454             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2455             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2456             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2457             velec            = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
2458             felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
2459
2460             d                = _mm_sub_pd(r00,rswitch);
2461             d                = _mm_max_pd(d,_mm_setzero_pd());
2462             d2               = _mm_mul_pd(d,d);
2463             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2464
2465             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2466
2467             /* Evaluate switch function */
2468             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2469             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(velec,dsw)) );
2470             cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
2471
2472             fscal            = felec;
2473
2474             fscal            = _mm_and_pd(fscal,cutoff_mask);
2475
2476             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2477
2478             /* Calculate temporary vectorial force */
2479             tx               = _mm_mul_pd(fscal,dx00);
2480             ty               = _mm_mul_pd(fscal,dy00);
2481             tz               = _mm_mul_pd(fscal,dz00);
2482
2483             /* Update vectorial force */
2484             fix0             = _mm_add_pd(fix0,tx);
2485             fiy0             = _mm_add_pd(fiy0,ty);
2486             fiz0             = _mm_add_pd(fiz0,tz);
2487
2488             fjx0             = _mm_add_pd(fjx0,tx);
2489             fjy0             = _mm_add_pd(fjy0,ty);
2490             fjz0             = _mm_add_pd(fjz0,tz);
2491
2492             }
2493
2494             /**************************
2495              * CALCULATE INTERACTIONS *
2496              **************************/
2497
2498             if (gmx_mm_any_lt(rsq01,rcutoff2))
2499             {
2500
2501             r01              = _mm_mul_pd(rsq01,rinv01);
2502
2503             /* EWALD ELECTROSTATICS */
2504
2505             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2506             ewrt             = _mm_mul_pd(r01,ewtabscale);
2507             ewitab           = _mm_cvttpd_epi32(ewrt);
2508             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2509             ewitab           = _mm_slli_epi32(ewitab,2);
2510             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2511             ewtabD           = _mm_setzero_pd();
2512             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2513             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2514             ewtabFn          = _mm_setzero_pd();
2515             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2516             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2517             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2518             velec            = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
2519             felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
2520
2521             d                = _mm_sub_pd(r01,rswitch);
2522             d                = _mm_max_pd(d,_mm_setzero_pd());
2523             d2               = _mm_mul_pd(d,d);
2524             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2525
2526             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2527
2528             /* Evaluate switch function */
2529             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2530             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv01,_mm_mul_pd(velec,dsw)) );
2531             cutoff_mask      = _mm_cmplt_pd(rsq01,rcutoff2);
2532
2533             fscal            = felec;
2534
2535             fscal            = _mm_and_pd(fscal,cutoff_mask);
2536
2537             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2538
2539             /* Calculate temporary vectorial force */
2540             tx               = _mm_mul_pd(fscal,dx01);
2541             ty               = _mm_mul_pd(fscal,dy01);
2542             tz               = _mm_mul_pd(fscal,dz01);
2543
2544             /* Update vectorial force */
2545             fix0             = _mm_add_pd(fix0,tx);
2546             fiy0             = _mm_add_pd(fiy0,ty);
2547             fiz0             = _mm_add_pd(fiz0,tz);
2548
2549             fjx1             = _mm_add_pd(fjx1,tx);
2550             fjy1             = _mm_add_pd(fjy1,ty);
2551             fjz1             = _mm_add_pd(fjz1,tz);
2552
2553             }
2554
2555             /**************************
2556              * CALCULATE INTERACTIONS *
2557              **************************/
2558
2559             if (gmx_mm_any_lt(rsq02,rcutoff2))
2560             {
2561
2562             r02              = _mm_mul_pd(rsq02,rinv02);
2563
2564             /* EWALD ELECTROSTATICS */
2565
2566             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2567             ewrt             = _mm_mul_pd(r02,ewtabscale);
2568             ewitab           = _mm_cvttpd_epi32(ewrt);
2569             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2570             ewitab           = _mm_slli_epi32(ewitab,2);
2571             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2572             ewtabD           = _mm_setzero_pd();
2573             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2574             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2575             ewtabFn          = _mm_setzero_pd();
2576             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2577             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2578             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2579             velec            = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
2580             felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
2581
2582             d                = _mm_sub_pd(r02,rswitch);
2583             d                = _mm_max_pd(d,_mm_setzero_pd());
2584             d2               = _mm_mul_pd(d,d);
2585             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2586
2587             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2588
2589             /* Evaluate switch function */
2590             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2591             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv02,_mm_mul_pd(velec,dsw)) );
2592             cutoff_mask      = _mm_cmplt_pd(rsq02,rcutoff2);
2593
2594             fscal            = felec;
2595
2596             fscal            = _mm_and_pd(fscal,cutoff_mask);
2597
2598             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2599
2600             /* Calculate temporary vectorial force */
2601             tx               = _mm_mul_pd(fscal,dx02);
2602             ty               = _mm_mul_pd(fscal,dy02);
2603             tz               = _mm_mul_pd(fscal,dz02);
2604
2605             /* Update vectorial force */
2606             fix0             = _mm_add_pd(fix0,tx);
2607             fiy0             = _mm_add_pd(fiy0,ty);
2608             fiz0             = _mm_add_pd(fiz0,tz);
2609
2610             fjx2             = _mm_add_pd(fjx2,tx);
2611             fjy2             = _mm_add_pd(fjy2,ty);
2612             fjz2             = _mm_add_pd(fjz2,tz);
2613
2614             }
2615
2616             /**************************
2617              * CALCULATE INTERACTIONS *
2618              **************************/
2619
2620             if (gmx_mm_any_lt(rsq10,rcutoff2))
2621             {
2622
2623             r10              = _mm_mul_pd(rsq10,rinv10);
2624
2625             /* EWALD ELECTROSTATICS */
2626
2627             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2628             ewrt             = _mm_mul_pd(r10,ewtabscale);
2629             ewitab           = _mm_cvttpd_epi32(ewrt);
2630             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2631             ewitab           = _mm_slli_epi32(ewitab,2);
2632             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2633             ewtabD           = _mm_setzero_pd();
2634             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2635             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2636             ewtabFn          = _mm_setzero_pd();
2637             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2638             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2639             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2640             velec            = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
2641             felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
2642
2643             d                = _mm_sub_pd(r10,rswitch);
2644             d                = _mm_max_pd(d,_mm_setzero_pd());
2645             d2               = _mm_mul_pd(d,d);
2646             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2647
2648             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2649
2650             /* Evaluate switch function */
2651             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2652             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
2653             cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
2654
2655             fscal            = felec;
2656
2657             fscal            = _mm_and_pd(fscal,cutoff_mask);
2658
2659             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2660
2661             /* Calculate temporary vectorial force */
2662             tx               = _mm_mul_pd(fscal,dx10);
2663             ty               = _mm_mul_pd(fscal,dy10);
2664             tz               = _mm_mul_pd(fscal,dz10);
2665
2666             /* Update vectorial force */
2667             fix1             = _mm_add_pd(fix1,tx);
2668             fiy1             = _mm_add_pd(fiy1,ty);
2669             fiz1             = _mm_add_pd(fiz1,tz);
2670
2671             fjx0             = _mm_add_pd(fjx0,tx);
2672             fjy0             = _mm_add_pd(fjy0,ty);
2673             fjz0             = _mm_add_pd(fjz0,tz);
2674
2675             }
2676
2677             /**************************
2678              * CALCULATE INTERACTIONS *
2679              **************************/
2680
2681             if (gmx_mm_any_lt(rsq11,rcutoff2))
2682             {
2683
2684             r11              = _mm_mul_pd(rsq11,rinv11);
2685
2686             /* EWALD ELECTROSTATICS */
2687
2688             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2689             ewrt             = _mm_mul_pd(r11,ewtabscale);
2690             ewitab           = _mm_cvttpd_epi32(ewrt);
2691             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2692             ewitab           = _mm_slli_epi32(ewitab,2);
2693             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2694             ewtabD           = _mm_setzero_pd();
2695             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2696             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2697             ewtabFn          = _mm_setzero_pd();
2698             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2699             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2700             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2701             velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
2702             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2703
2704             d                = _mm_sub_pd(r11,rswitch);
2705             d                = _mm_max_pd(d,_mm_setzero_pd());
2706             d2               = _mm_mul_pd(d,d);
2707             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2708
2709             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2710
2711             /* Evaluate switch function */
2712             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2713             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
2714             cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
2715
2716             fscal            = felec;
2717
2718             fscal            = _mm_and_pd(fscal,cutoff_mask);
2719
2720             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2721
2722             /* Calculate temporary vectorial force */
2723             tx               = _mm_mul_pd(fscal,dx11);
2724             ty               = _mm_mul_pd(fscal,dy11);
2725             tz               = _mm_mul_pd(fscal,dz11);
2726
2727             /* Update vectorial force */
2728             fix1             = _mm_add_pd(fix1,tx);
2729             fiy1             = _mm_add_pd(fiy1,ty);
2730             fiz1             = _mm_add_pd(fiz1,tz);
2731
2732             fjx1             = _mm_add_pd(fjx1,tx);
2733             fjy1             = _mm_add_pd(fjy1,ty);
2734             fjz1             = _mm_add_pd(fjz1,tz);
2735
2736             }
2737
2738             /**************************
2739              * CALCULATE INTERACTIONS *
2740              **************************/
2741
2742             if (gmx_mm_any_lt(rsq12,rcutoff2))
2743             {
2744
2745             r12              = _mm_mul_pd(rsq12,rinv12);
2746
2747             /* EWALD ELECTROSTATICS */
2748
2749             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2750             ewrt             = _mm_mul_pd(r12,ewtabscale);
2751             ewitab           = _mm_cvttpd_epi32(ewrt);
2752             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2753             ewitab           = _mm_slli_epi32(ewitab,2);
2754             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2755             ewtabD           = _mm_setzero_pd();
2756             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2757             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2758             ewtabFn          = _mm_setzero_pd();
2759             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2760             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2761             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2762             velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
2763             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2764
2765             d                = _mm_sub_pd(r12,rswitch);
2766             d                = _mm_max_pd(d,_mm_setzero_pd());
2767             d2               = _mm_mul_pd(d,d);
2768             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2769
2770             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2771
2772             /* Evaluate switch function */
2773             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2774             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
2775             cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
2776
2777             fscal            = felec;
2778
2779             fscal            = _mm_and_pd(fscal,cutoff_mask);
2780
2781             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2782
2783             /* Calculate temporary vectorial force */
2784             tx               = _mm_mul_pd(fscal,dx12);
2785             ty               = _mm_mul_pd(fscal,dy12);
2786             tz               = _mm_mul_pd(fscal,dz12);
2787
2788             /* Update vectorial force */
2789             fix1             = _mm_add_pd(fix1,tx);
2790             fiy1             = _mm_add_pd(fiy1,ty);
2791             fiz1             = _mm_add_pd(fiz1,tz);
2792
2793             fjx2             = _mm_add_pd(fjx2,tx);
2794             fjy2             = _mm_add_pd(fjy2,ty);
2795             fjz2             = _mm_add_pd(fjz2,tz);
2796
2797             }
2798
2799             /**************************
2800              * CALCULATE INTERACTIONS *
2801              **************************/
2802
2803             if (gmx_mm_any_lt(rsq20,rcutoff2))
2804             {
2805
2806             r20              = _mm_mul_pd(rsq20,rinv20);
2807
2808             /* EWALD ELECTROSTATICS */
2809
2810             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2811             ewrt             = _mm_mul_pd(r20,ewtabscale);
2812             ewitab           = _mm_cvttpd_epi32(ewrt);
2813             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2814             ewitab           = _mm_slli_epi32(ewitab,2);
2815             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2816             ewtabD           = _mm_setzero_pd();
2817             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2818             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2819             ewtabFn          = _mm_setzero_pd();
2820             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2821             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2822             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2823             velec            = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
2824             felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
2825
2826             d                = _mm_sub_pd(r20,rswitch);
2827             d                = _mm_max_pd(d,_mm_setzero_pd());
2828             d2               = _mm_mul_pd(d,d);
2829             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2830
2831             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2832
2833             /* Evaluate switch function */
2834             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2835             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
2836             cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
2837
2838             fscal            = felec;
2839
2840             fscal            = _mm_and_pd(fscal,cutoff_mask);
2841
2842             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2843
2844             /* Calculate temporary vectorial force */
2845             tx               = _mm_mul_pd(fscal,dx20);
2846             ty               = _mm_mul_pd(fscal,dy20);
2847             tz               = _mm_mul_pd(fscal,dz20);
2848
2849             /* Update vectorial force */
2850             fix2             = _mm_add_pd(fix2,tx);
2851             fiy2             = _mm_add_pd(fiy2,ty);
2852             fiz2             = _mm_add_pd(fiz2,tz);
2853
2854             fjx0             = _mm_add_pd(fjx0,tx);
2855             fjy0             = _mm_add_pd(fjy0,ty);
2856             fjz0             = _mm_add_pd(fjz0,tz);
2857
2858             }
2859
2860             /**************************
2861              * CALCULATE INTERACTIONS *
2862              **************************/
2863
2864             if (gmx_mm_any_lt(rsq21,rcutoff2))
2865             {
2866
2867             r21              = _mm_mul_pd(rsq21,rinv21);
2868
2869             /* EWALD ELECTROSTATICS */
2870
2871             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2872             ewrt             = _mm_mul_pd(r21,ewtabscale);
2873             ewitab           = _mm_cvttpd_epi32(ewrt);
2874             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2875             ewitab           = _mm_slli_epi32(ewitab,2);
2876             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2877             ewtabD           = _mm_setzero_pd();
2878             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2879             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2880             ewtabFn          = _mm_setzero_pd();
2881             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2882             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2883             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2884             velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2885             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2886
2887             d                = _mm_sub_pd(r21,rswitch);
2888             d                = _mm_max_pd(d,_mm_setzero_pd());
2889             d2               = _mm_mul_pd(d,d);
2890             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2891
2892             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2893
2894             /* Evaluate switch function */
2895             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2896             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2897             cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
2898
2899             fscal            = felec;
2900
2901             fscal            = _mm_and_pd(fscal,cutoff_mask);
2902
2903             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2904
2905             /* Calculate temporary vectorial force */
2906             tx               = _mm_mul_pd(fscal,dx21);
2907             ty               = _mm_mul_pd(fscal,dy21);
2908             tz               = _mm_mul_pd(fscal,dz21);
2909
2910             /* Update vectorial force */
2911             fix2             = _mm_add_pd(fix2,tx);
2912             fiy2             = _mm_add_pd(fiy2,ty);
2913             fiz2             = _mm_add_pd(fiz2,tz);
2914
2915             fjx1             = _mm_add_pd(fjx1,tx);
2916             fjy1             = _mm_add_pd(fjy1,ty);
2917             fjz1             = _mm_add_pd(fjz1,tz);
2918
2919             }
2920
2921             /**************************
2922              * CALCULATE INTERACTIONS *
2923              **************************/
2924
2925             if (gmx_mm_any_lt(rsq22,rcutoff2))
2926             {
2927
2928             r22              = _mm_mul_pd(rsq22,rinv22);
2929
2930             /* EWALD ELECTROSTATICS */
2931
2932             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2933             ewrt             = _mm_mul_pd(r22,ewtabscale);
2934             ewitab           = _mm_cvttpd_epi32(ewrt);
2935             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2936             ewitab           = _mm_slli_epi32(ewitab,2);
2937             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2938             ewtabD           = _mm_setzero_pd();
2939             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2940             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2941             ewtabFn          = _mm_setzero_pd();
2942             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2943             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2944             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2945             velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
2946             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2947
2948             d                = _mm_sub_pd(r22,rswitch);
2949             d                = _mm_max_pd(d,_mm_setzero_pd());
2950             d2               = _mm_mul_pd(d,d);
2951             sw               = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2952
2953             dsw              = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2954
2955             /* Evaluate switch function */
2956             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2957             felec            = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
2958             cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
2959
2960             fscal            = felec;
2961
2962             fscal            = _mm_and_pd(fscal,cutoff_mask);
2963
2964             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2965
2966             /* Calculate temporary vectorial force */
2967             tx               = _mm_mul_pd(fscal,dx22);
2968             ty               = _mm_mul_pd(fscal,dy22);
2969             tz               = _mm_mul_pd(fscal,dz22);
2970
2971             /* Update vectorial force */
2972             fix2             = _mm_add_pd(fix2,tx);
2973             fiy2             = _mm_add_pd(fiy2,ty);
2974             fiz2             = _mm_add_pd(fiz2,tz);
2975
2976             fjx2             = _mm_add_pd(fjx2,tx);
2977             fjy2             = _mm_add_pd(fjy2,ty);
2978             fjz2             = _mm_add_pd(fjz2,tz);
2979
2980             }
2981
2982             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2983
2984             /* Inner loop uses 558 flops */
2985         }
2986
2987         /* End of innermost loop */
2988
2989         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2990                                               f+i_coord_offset,fshift+i_shift_offset);
2991
2992         /* Increment number of inner iterations */
2993         inneriter                  += j_index_end - j_index_start;
2994
2995         /* Outer loop uses 18 flops */
2996     }
2997
2998     /* Increment number of outer iterations */
2999     outeriter        += nri;
3000
3001     /* Update outer/inner flops */
3002
3003     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*558);
3004 }