Version bumps after new release
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_double / nb_kernel_ElecCoul_VdwLJ_GeomW3W3_sse2_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse2_double kernel generator.
37  */
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "vec.h"
47 #include "nrnb.h"
48
49 #include "gromacs/simd/math_x86_sse2_double.h"
50 #include "kernelutil_x86_sse2_double.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sse2_double
54  * Electrostatics interaction: Coulomb
55  * VdW interaction:            LennardJones
56  * Geometry:                   Water3-Water3
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sse2_double
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70      * just 0 for non-waters.
71      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB;
77     int              j_coord_offsetA,j_coord_offsetB;
78     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
79     real             rcutoff_scalar;
80     real             *shiftvec,*fshift,*x,*f;
81     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
82     int              vdwioffset0;
83     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
84     int              vdwioffset1;
85     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
86     int              vdwioffset2;
87     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88     int              vdwjidx0A,vdwjidx0B;
89     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90     int              vdwjidx1A,vdwjidx1B;
91     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92     int              vdwjidx2A,vdwjidx2B;
93     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
95     __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
96     __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
97     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
98     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
101     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
104     real             *charge;
105     int              nvdwtype;
106     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
107     int              *vdwtype;
108     real             *vdwparam;
109     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
110     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
111     __m128d          dummy_mask,cutoff_mask;
112     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
113     __m128d          one     = _mm_set1_pd(1.0);
114     __m128d          two     = _mm_set1_pd(2.0);
115     x                = xx[0];
116     f                = ff[0];
117
118     nri              = nlist->nri;
119     iinr             = nlist->iinr;
120     jindex           = nlist->jindex;
121     jjnr             = nlist->jjnr;
122     shiftidx         = nlist->shift;
123     gid              = nlist->gid;
124     shiftvec         = fr->shift_vec[0];
125     fshift           = fr->fshift[0];
126     facel            = _mm_set1_pd(fr->epsfac);
127     charge           = mdatoms->chargeA;
128     nvdwtype         = fr->ntype;
129     vdwparam         = fr->nbfp;
130     vdwtype          = mdatoms->typeA;
131
132     /* Setup water-specific parameters */
133     inr              = nlist->iinr[0];
134     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
135     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
136     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
137     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
138
139     jq0              = _mm_set1_pd(charge[inr+0]);
140     jq1              = _mm_set1_pd(charge[inr+1]);
141     jq2              = _mm_set1_pd(charge[inr+2]);
142     vdwjidx0A        = 2*vdwtype[inr+0];
143     qq00             = _mm_mul_pd(iq0,jq0);
144     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
145     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
146     qq01             = _mm_mul_pd(iq0,jq1);
147     qq02             = _mm_mul_pd(iq0,jq2);
148     qq10             = _mm_mul_pd(iq1,jq0);
149     qq11             = _mm_mul_pd(iq1,jq1);
150     qq12             = _mm_mul_pd(iq1,jq2);
151     qq20             = _mm_mul_pd(iq2,jq0);
152     qq21             = _mm_mul_pd(iq2,jq1);
153     qq22             = _mm_mul_pd(iq2,jq2);
154
155     /* Avoid stupid compiler warnings */
156     jnrA = jnrB = 0;
157     j_coord_offsetA = 0;
158     j_coord_offsetB = 0;
159
160     outeriter        = 0;
161     inneriter        = 0;
162
163     /* Start outer loop over neighborlists */
164     for(iidx=0; iidx<nri; iidx++)
165     {
166         /* Load shift vector for this list */
167         i_shift_offset   = DIM*shiftidx[iidx];
168
169         /* Load limits for loop over neighbors */
170         j_index_start    = jindex[iidx];
171         j_index_end      = jindex[iidx+1];
172
173         /* Get outer coordinate index */
174         inr              = iinr[iidx];
175         i_coord_offset   = DIM*inr;
176
177         /* Load i particle coords and add shift vector */
178         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
179                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
180
181         fix0             = _mm_setzero_pd();
182         fiy0             = _mm_setzero_pd();
183         fiz0             = _mm_setzero_pd();
184         fix1             = _mm_setzero_pd();
185         fiy1             = _mm_setzero_pd();
186         fiz1             = _mm_setzero_pd();
187         fix2             = _mm_setzero_pd();
188         fiy2             = _mm_setzero_pd();
189         fiz2             = _mm_setzero_pd();
190
191         /* Reset potential sums */
192         velecsum         = _mm_setzero_pd();
193         vvdwsum          = _mm_setzero_pd();
194
195         /* Start inner kernel loop */
196         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
197         {
198
199             /* Get j neighbor index, and coordinate index */
200             jnrA             = jjnr[jidx];
201             jnrB             = jjnr[jidx+1];
202             j_coord_offsetA  = DIM*jnrA;
203             j_coord_offsetB  = DIM*jnrB;
204
205             /* load j atom coordinates */
206             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
207                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
208
209             /* Calculate displacement vector */
210             dx00             = _mm_sub_pd(ix0,jx0);
211             dy00             = _mm_sub_pd(iy0,jy0);
212             dz00             = _mm_sub_pd(iz0,jz0);
213             dx01             = _mm_sub_pd(ix0,jx1);
214             dy01             = _mm_sub_pd(iy0,jy1);
215             dz01             = _mm_sub_pd(iz0,jz1);
216             dx02             = _mm_sub_pd(ix0,jx2);
217             dy02             = _mm_sub_pd(iy0,jy2);
218             dz02             = _mm_sub_pd(iz0,jz2);
219             dx10             = _mm_sub_pd(ix1,jx0);
220             dy10             = _mm_sub_pd(iy1,jy0);
221             dz10             = _mm_sub_pd(iz1,jz0);
222             dx11             = _mm_sub_pd(ix1,jx1);
223             dy11             = _mm_sub_pd(iy1,jy1);
224             dz11             = _mm_sub_pd(iz1,jz1);
225             dx12             = _mm_sub_pd(ix1,jx2);
226             dy12             = _mm_sub_pd(iy1,jy2);
227             dz12             = _mm_sub_pd(iz1,jz2);
228             dx20             = _mm_sub_pd(ix2,jx0);
229             dy20             = _mm_sub_pd(iy2,jy0);
230             dz20             = _mm_sub_pd(iz2,jz0);
231             dx21             = _mm_sub_pd(ix2,jx1);
232             dy21             = _mm_sub_pd(iy2,jy1);
233             dz21             = _mm_sub_pd(iz2,jz1);
234             dx22             = _mm_sub_pd(ix2,jx2);
235             dy22             = _mm_sub_pd(iy2,jy2);
236             dz22             = _mm_sub_pd(iz2,jz2);
237
238             /* Calculate squared distance and things based on it */
239             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
240             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
241             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
242             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
243             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
244             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
245             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
246             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
247             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
248
249             rinv00           = gmx_mm_invsqrt_pd(rsq00);
250             rinv01           = gmx_mm_invsqrt_pd(rsq01);
251             rinv02           = gmx_mm_invsqrt_pd(rsq02);
252             rinv10           = gmx_mm_invsqrt_pd(rsq10);
253             rinv11           = gmx_mm_invsqrt_pd(rsq11);
254             rinv12           = gmx_mm_invsqrt_pd(rsq12);
255             rinv20           = gmx_mm_invsqrt_pd(rsq20);
256             rinv21           = gmx_mm_invsqrt_pd(rsq21);
257             rinv22           = gmx_mm_invsqrt_pd(rsq22);
258
259             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
260             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
261             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
262             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
263             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
264             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
265             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
266             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
267             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
268
269             fjx0             = _mm_setzero_pd();
270             fjy0             = _mm_setzero_pd();
271             fjz0             = _mm_setzero_pd();
272             fjx1             = _mm_setzero_pd();
273             fjy1             = _mm_setzero_pd();
274             fjz1             = _mm_setzero_pd();
275             fjx2             = _mm_setzero_pd();
276             fjy2             = _mm_setzero_pd();
277             fjz2             = _mm_setzero_pd();
278
279             /**************************
280              * CALCULATE INTERACTIONS *
281              **************************/
282
283             /* COULOMB ELECTROSTATICS */
284             velec            = _mm_mul_pd(qq00,rinv00);
285             felec            = _mm_mul_pd(velec,rinvsq00);
286
287             /* LENNARD-JONES DISPERSION/REPULSION */
288
289             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
290             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
291             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
292             vvdw             = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
293             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
294
295             /* Update potential sum for this i atom from the interaction with this j atom. */
296             velecsum         = _mm_add_pd(velecsum,velec);
297             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
298
299             fscal            = _mm_add_pd(felec,fvdw);
300
301             /* Calculate temporary vectorial force */
302             tx               = _mm_mul_pd(fscal,dx00);
303             ty               = _mm_mul_pd(fscal,dy00);
304             tz               = _mm_mul_pd(fscal,dz00);
305
306             /* Update vectorial force */
307             fix0             = _mm_add_pd(fix0,tx);
308             fiy0             = _mm_add_pd(fiy0,ty);
309             fiz0             = _mm_add_pd(fiz0,tz);
310
311             fjx0             = _mm_add_pd(fjx0,tx);
312             fjy0             = _mm_add_pd(fjy0,ty);
313             fjz0             = _mm_add_pd(fjz0,tz);
314
315             /**************************
316              * CALCULATE INTERACTIONS *
317              **************************/
318
319             /* COULOMB ELECTROSTATICS */
320             velec            = _mm_mul_pd(qq01,rinv01);
321             felec            = _mm_mul_pd(velec,rinvsq01);
322
323             /* Update potential sum for this i atom from the interaction with this j atom. */
324             velecsum         = _mm_add_pd(velecsum,velec);
325
326             fscal            = felec;
327
328             /* Calculate temporary vectorial force */
329             tx               = _mm_mul_pd(fscal,dx01);
330             ty               = _mm_mul_pd(fscal,dy01);
331             tz               = _mm_mul_pd(fscal,dz01);
332
333             /* Update vectorial force */
334             fix0             = _mm_add_pd(fix0,tx);
335             fiy0             = _mm_add_pd(fiy0,ty);
336             fiz0             = _mm_add_pd(fiz0,tz);
337
338             fjx1             = _mm_add_pd(fjx1,tx);
339             fjy1             = _mm_add_pd(fjy1,ty);
340             fjz1             = _mm_add_pd(fjz1,tz);
341
342             /**************************
343              * CALCULATE INTERACTIONS *
344              **************************/
345
346             /* COULOMB ELECTROSTATICS */
347             velec            = _mm_mul_pd(qq02,rinv02);
348             felec            = _mm_mul_pd(velec,rinvsq02);
349
350             /* Update potential sum for this i atom from the interaction with this j atom. */
351             velecsum         = _mm_add_pd(velecsum,velec);
352
353             fscal            = felec;
354
355             /* Calculate temporary vectorial force */
356             tx               = _mm_mul_pd(fscal,dx02);
357             ty               = _mm_mul_pd(fscal,dy02);
358             tz               = _mm_mul_pd(fscal,dz02);
359
360             /* Update vectorial force */
361             fix0             = _mm_add_pd(fix0,tx);
362             fiy0             = _mm_add_pd(fiy0,ty);
363             fiz0             = _mm_add_pd(fiz0,tz);
364
365             fjx2             = _mm_add_pd(fjx2,tx);
366             fjy2             = _mm_add_pd(fjy2,ty);
367             fjz2             = _mm_add_pd(fjz2,tz);
368
369             /**************************
370              * CALCULATE INTERACTIONS *
371              **************************/
372
373             /* COULOMB ELECTROSTATICS */
374             velec            = _mm_mul_pd(qq10,rinv10);
375             felec            = _mm_mul_pd(velec,rinvsq10);
376
377             /* Update potential sum for this i atom from the interaction with this j atom. */
378             velecsum         = _mm_add_pd(velecsum,velec);
379
380             fscal            = felec;
381
382             /* Calculate temporary vectorial force */
383             tx               = _mm_mul_pd(fscal,dx10);
384             ty               = _mm_mul_pd(fscal,dy10);
385             tz               = _mm_mul_pd(fscal,dz10);
386
387             /* Update vectorial force */
388             fix1             = _mm_add_pd(fix1,tx);
389             fiy1             = _mm_add_pd(fiy1,ty);
390             fiz1             = _mm_add_pd(fiz1,tz);
391
392             fjx0             = _mm_add_pd(fjx0,tx);
393             fjy0             = _mm_add_pd(fjy0,ty);
394             fjz0             = _mm_add_pd(fjz0,tz);
395
396             /**************************
397              * CALCULATE INTERACTIONS *
398              **************************/
399
400             /* COULOMB ELECTROSTATICS */
401             velec            = _mm_mul_pd(qq11,rinv11);
402             felec            = _mm_mul_pd(velec,rinvsq11);
403
404             /* Update potential sum for this i atom from the interaction with this j atom. */
405             velecsum         = _mm_add_pd(velecsum,velec);
406
407             fscal            = felec;
408
409             /* Calculate temporary vectorial force */
410             tx               = _mm_mul_pd(fscal,dx11);
411             ty               = _mm_mul_pd(fscal,dy11);
412             tz               = _mm_mul_pd(fscal,dz11);
413
414             /* Update vectorial force */
415             fix1             = _mm_add_pd(fix1,tx);
416             fiy1             = _mm_add_pd(fiy1,ty);
417             fiz1             = _mm_add_pd(fiz1,tz);
418
419             fjx1             = _mm_add_pd(fjx1,tx);
420             fjy1             = _mm_add_pd(fjy1,ty);
421             fjz1             = _mm_add_pd(fjz1,tz);
422
423             /**************************
424              * CALCULATE INTERACTIONS *
425              **************************/
426
427             /* COULOMB ELECTROSTATICS */
428             velec            = _mm_mul_pd(qq12,rinv12);
429             felec            = _mm_mul_pd(velec,rinvsq12);
430
431             /* Update potential sum for this i atom from the interaction with this j atom. */
432             velecsum         = _mm_add_pd(velecsum,velec);
433
434             fscal            = felec;
435
436             /* Calculate temporary vectorial force */
437             tx               = _mm_mul_pd(fscal,dx12);
438             ty               = _mm_mul_pd(fscal,dy12);
439             tz               = _mm_mul_pd(fscal,dz12);
440
441             /* Update vectorial force */
442             fix1             = _mm_add_pd(fix1,tx);
443             fiy1             = _mm_add_pd(fiy1,ty);
444             fiz1             = _mm_add_pd(fiz1,tz);
445
446             fjx2             = _mm_add_pd(fjx2,tx);
447             fjy2             = _mm_add_pd(fjy2,ty);
448             fjz2             = _mm_add_pd(fjz2,tz);
449
450             /**************************
451              * CALCULATE INTERACTIONS *
452              **************************/
453
454             /* COULOMB ELECTROSTATICS */
455             velec            = _mm_mul_pd(qq20,rinv20);
456             felec            = _mm_mul_pd(velec,rinvsq20);
457
458             /* Update potential sum for this i atom from the interaction with this j atom. */
459             velecsum         = _mm_add_pd(velecsum,velec);
460
461             fscal            = felec;
462
463             /* Calculate temporary vectorial force */
464             tx               = _mm_mul_pd(fscal,dx20);
465             ty               = _mm_mul_pd(fscal,dy20);
466             tz               = _mm_mul_pd(fscal,dz20);
467
468             /* Update vectorial force */
469             fix2             = _mm_add_pd(fix2,tx);
470             fiy2             = _mm_add_pd(fiy2,ty);
471             fiz2             = _mm_add_pd(fiz2,tz);
472
473             fjx0             = _mm_add_pd(fjx0,tx);
474             fjy0             = _mm_add_pd(fjy0,ty);
475             fjz0             = _mm_add_pd(fjz0,tz);
476
477             /**************************
478              * CALCULATE INTERACTIONS *
479              **************************/
480
481             /* COULOMB ELECTROSTATICS */
482             velec            = _mm_mul_pd(qq21,rinv21);
483             felec            = _mm_mul_pd(velec,rinvsq21);
484
485             /* Update potential sum for this i atom from the interaction with this j atom. */
486             velecsum         = _mm_add_pd(velecsum,velec);
487
488             fscal            = felec;
489
490             /* Calculate temporary vectorial force */
491             tx               = _mm_mul_pd(fscal,dx21);
492             ty               = _mm_mul_pd(fscal,dy21);
493             tz               = _mm_mul_pd(fscal,dz21);
494
495             /* Update vectorial force */
496             fix2             = _mm_add_pd(fix2,tx);
497             fiy2             = _mm_add_pd(fiy2,ty);
498             fiz2             = _mm_add_pd(fiz2,tz);
499
500             fjx1             = _mm_add_pd(fjx1,tx);
501             fjy1             = _mm_add_pd(fjy1,ty);
502             fjz1             = _mm_add_pd(fjz1,tz);
503
504             /**************************
505              * CALCULATE INTERACTIONS *
506              **************************/
507
508             /* COULOMB ELECTROSTATICS */
509             velec            = _mm_mul_pd(qq22,rinv22);
510             felec            = _mm_mul_pd(velec,rinvsq22);
511
512             /* Update potential sum for this i atom from the interaction with this j atom. */
513             velecsum         = _mm_add_pd(velecsum,velec);
514
515             fscal            = felec;
516
517             /* Calculate temporary vectorial force */
518             tx               = _mm_mul_pd(fscal,dx22);
519             ty               = _mm_mul_pd(fscal,dy22);
520             tz               = _mm_mul_pd(fscal,dz22);
521
522             /* Update vectorial force */
523             fix2             = _mm_add_pd(fix2,tx);
524             fiy2             = _mm_add_pd(fiy2,ty);
525             fiz2             = _mm_add_pd(fiz2,tz);
526
527             fjx2             = _mm_add_pd(fjx2,tx);
528             fjy2             = _mm_add_pd(fjy2,ty);
529             fjz2             = _mm_add_pd(fjz2,tz);
530
531             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
532
533             /* Inner loop uses 264 flops */
534         }
535
536         if(jidx<j_index_end)
537         {
538
539             jnrA             = jjnr[jidx];
540             j_coord_offsetA  = DIM*jnrA;
541
542             /* load j atom coordinates */
543             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
544                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
545
546             /* Calculate displacement vector */
547             dx00             = _mm_sub_pd(ix0,jx0);
548             dy00             = _mm_sub_pd(iy0,jy0);
549             dz00             = _mm_sub_pd(iz0,jz0);
550             dx01             = _mm_sub_pd(ix0,jx1);
551             dy01             = _mm_sub_pd(iy0,jy1);
552             dz01             = _mm_sub_pd(iz0,jz1);
553             dx02             = _mm_sub_pd(ix0,jx2);
554             dy02             = _mm_sub_pd(iy0,jy2);
555             dz02             = _mm_sub_pd(iz0,jz2);
556             dx10             = _mm_sub_pd(ix1,jx0);
557             dy10             = _mm_sub_pd(iy1,jy0);
558             dz10             = _mm_sub_pd(iz1,jz0);
559             dx11             = _mm_sub_pd(ix1,jx1);
560             dy11             = _mm_sub_pd(iy1,jy1);
561             dz11             = _mm_sub_pd(iz1,jz1);
562             dx12             = _mm_sub_pd(ix1,jx2);
563             dy12             = _mm_sub_pd(iy1,jy2);
564             dz12             = _mm_sub_pd(iz1,jz2);
565             dx20             = _mm_sub_pd(ix2,jx0);
566             dy20             = _mm_sub_pd(iy2,jy0);
567             dz20             = _mm_sub_pd(iz2,jz0);
568             dx21             = _mm_sub_pd(ix2,jx1);
569             dy21             = _mm_sub_pd(iy2,jy1);
570             dz21             = _mm_sub_pd(iz2,jz1);
571             dx22             = _mm_sub_pd(ix2,jx2);
572             dy22             = _mm_sub_pd(iy2,jy2);
573             dz22             = _mm_sub_pd(iz2,jz2);
574
575             /* Calculate squared distance and things based on it */
576             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
577             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
578             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
579             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
580             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
581             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
582             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
583             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
584             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
585
586             rinv00           = gmx_mm_invsqrt_pd(rsq00);
587             rinv01           = gmx_mm_invsqrt_pd(rsq01);
588             rinv02           = gmx_mm_invsqrt_pd(rsq02);
589             rinv10           = gmx_mm_invsqrt_pd(rsq10);
590             rinv11           = gmx_mm_invsqrt_pd(rsq11);
591             rinv12           = gmx_mm_invsqrt_pd(rsq12);
592             rinv20           = gmx_mm_invsqrt_pd(rsq20);
593             rinv21           = gmx_mm_invsqrt_pd(rsq21);
594             rinv22           = gmx_mm_invsqrt_pd(rsq22);
595
596             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
597             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
598             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
599             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
600             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
601             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
602             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
603             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
604             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
605
606             fjx0             = _mm_setzero_pd();
607             fjy0             = _mm_setzero_pd();
608             fjz0             = _mm_setzero_pd();
609             fjx1             = _mm_setzero_pd();
610             fjy1             = _mm_setzero_pd();
611             fjz1             = _mm_setzero_pd();
612             fjx2             = _mm_setzero_pd();
613             fjy2             = _mm_setzero_pd();
614             fjz2             = _mm_setzero_pd();
615
616             /**************************
617              * CALCULATE INTERACTIONS *
618              **************************/
619
620             /* COULOMB ELECTROSTATICS */
621             velec            = _mm_mul_pd(qq00,rinv00);
622             felec            = _mm_mul_pd(velec,rinvsq00);
623
624             /* LENNARD-JONES DISPERSION/REPULSION */
625
626             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
627             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
628             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
629             vvdw             = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
630             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
631
632             /* Update potential sum for this i atom from the interaction with this j atom. */
633             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
634             velecsum         = _mm_add_pd(velecsum,velec);
635             vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
636             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
637
638             fscal            = _mm_add_pd(felec,fvdw);
639
640             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
641
642             /* Calculate temporary vectorial force */
643             tx               = _mm_mul_pd(fscal,dx00);
644             ty               = _mm_mul_pd(fscal,dy00);
645             tz               = _mm_mul_pd(fscal,dz00);
646
647             /* Update vectorial force */
648             fix0             = _mm_add_pd(fix0,tx);
649             fiy0             = _mm_add_pd(fiy0,ty);
650             fiz0             = _mm_add_pd(fiz0,tz);
651
652             fjx0             = _mm_add_pd(fjx0,tx);
653             fjy0             = _mm_add_pd(fjy0,ty);
654             fjz0             = _mm_add_pd(fjz0,tz);
655
656             /**************************
657              * CALCULATE INTERACTIONS *
658              **************************/
659
660             /* COULOMB ELECTROSTATICS */
661             velec            = _mm_mul_pd(qq01,rinv01);
662             felec            = _mm_mul_pd(velec,rinvsq01);
663
664             /* Update potential sum for this i atom from the interaction with this j atom. */
665             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
666             velecsum         = _mm_add_pd(velecsum,velec);
667
668             fscal            = felec;
669
670             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
671
672             /* Calculate temporary vectorial force */
673             tx               = _mm_mul_pd(fscal,dx01);
674             ty               = _mm_mul_pd(fscal,dy01);
675             tz               = _mm_mul_pd(fscal,dz01);
676
677             /* Update vectorial force */
678             fix0             = _mm_add_pd(fix0,tx);
679             fiy0             = _mm_add_pd(fiy0,ty);
680             fiz0             = _mm_add_pd(fiz0,tz);
681
682             fjx1             = _mm_add_pd(fjx1,tx);
683             fjy1             = _mm_add_pd(fjy1,ty);
684             fjz1             = _mm_add_pd(fjz1,tz);
685
686             /**************************
687              * CALCULATE INTERACTIONS *
688              **************************/
689
690             /* COULOMB ELECTROSTATICS */
691             velec            = _mm_mul_pd(qq02,rinv02);
692             felec            = _mm_mul_pd(velec,rinvsq02);
693
694             /* Update potential sum for this i atom from the interaction with this j atom. */
695             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
696             velecsum         = _mm_add_pd(velecsum,velec);
697
698             fscal            = felec;
699
700             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
701
702             /* Calculate temporary vectorial force */
703             tx               = _mm_mul_pd(fscal,dx02);
704             ty               = _mm_mul_pd(fscal,dy02);
705             tz               = _mm_mul_pd(fscal,dz02);
706
707             /* Update vectorial force */
708             fix0             = _mm_add_pd(fix0,tx);
709             fiy0             = _mm_add_pd(fiy0,ty);
710             fiz0             = _mm_add_pd(fiz0,tz);
711
712             fjx2             = _mm_add_pd(fjx2,tx);
713             fjy2             = _mm_add_pd(fjy2,ty);
714             fjz2             = _mm_add_pd(fjz2,tz);
715
716             /**************************
717              * CALCULATE INTERACTIONS *
718              **************************/
719
720             /* COULOMB ELECTROSTATICS */
721             velec            = _mm_mul_pd(qq10,rinv10);
722             felec            = _mm_mul_pd(velec,rinvsq10);
723
724             /* Update potential sum for this i atom from the interaction with this j atom. */
725             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
726             velecsum         = _mm_add_pd(velecsum,velec);
727
728             fscal            = felec;
729
730             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
731
732             /* Calculate temporary vectorial force */
733             tx               = _mm_mul_pd(fscal,dx10);
734             ty               = _mm_mul_pd(fscal,dy10);
735             tz               = _mm_mul_pd(fscal,dz10);
736
737             /* Update vectorial force */
738             fix1             = _mm_add_pd(fix1,tx);
739             fiy1             = _mm_add_pd(fiy1,ty);
740             fiz1             = _mm_add_pd(fiz1,tz);
741
742             fjx0             = _mm_add_pd(fjx0,tx);
743             fjy0             = _mm_add_pd(fjy0,ty);
744             fjz0             = _mm_add_pd(fjz0,tz);
745
746             /**************************
747              * CALCULATE INTERACTIONS *
748              **************************/
749
750             /* COULOMB ELECTROSTATICS */
751             velec            = _mm_mul_pd(qq11,rinv11);
752             felec            = _mm_mul_pd(velec,rinvsq11);
753
754             /* Update potential sum for this i atom from the interaction with this j atom. */
755             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
756             velecsum         = _mm_add_pd(velecsum,velec);
757
758             fscal            = felec;
759
760             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
761
762             /* Calculate temporary vectorial force */
763             tx               = _mm_mul_pd(fscal,dx11);
764             ty               = _mm_mul_pd(fscal,dy11);
765             tz               = _mm_mul_pd(fscal,dz11);
766
767             /* Update vectorial force */
768             fix1             = _mm_add_pd(fix1,tx);
769             fiy1             = _mm_add_pd(fiy1,ty);
770             fiz1             = _mm_add_pd(fiz1,tz);
771
772             fjx1             = _mm_add_pd(fjx1,tx);
773             fjy1             = _mm_add_pd(fjy1,ty);
774             fjz1             = _mm_add_pd(fjz1,tz);
775
776             /**************************
777              * CALCULATE INTERACTIONS *
778              **************************/
779
780             /* COULOMB ELECTROSTATICS */
781             velec            = _mm_mul_pd(qq12,rinv12);
782             felec            = _mm_mul_pd(velec,rinvsq12);
783
784             /* Update potential sum for this i atom from the interaction with this j atom. */
785             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
786             velecsum         = _mm_add_pd(velecsum,velec);
787
788             fscal            = felec;
789
790             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
791
792             /* Calculate temporary vectorial force */
793             tx               = _mm_mul_pd(fscal,dx12);
794             ty               = _mm_mul_pd(fscal,dy12);
795             tz               = _mm_mul_pd(fscal,dz12);
796
797             /* Update vectorial force */
798             fix1             = _mm_add_pd(fix1,tx);
799             fiy1             = _mm_add_pd(fiy1,ty);
800             fiz1             = _mm_add_pd(fiz1,tz);
801
802             fjx2             = _mm_add_pd(fjx2,tx);
803             fjy2             = _mm_add_pd(fjy2,ty);
804             fjz2             = _mm_add_pd(fjz2,tz);
805
806             /**************************
807              * CALCULATE INTERACTIONS *
808              **************************/
809
810             /* COULOMB ELECTROSTATICS */
811             velec            = _mm_mul_pd(qq20,rinv20);
812             felec            = _mm_mul_pd(velec,rinvsq20);
813
814             /* Update potential sum for this i atom from the interaction with this j atom. */
815             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
816             velecsum         = _mm_add_pd(velecsum,velec);
817
818             fscal            = felec;
819
820             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
821
822             /* Calculate temporary vectorial force */
823             tx               = _mm_mul_pd(fscal,dx20);
824             ty               = _mm_mul_pd(fscal,dy20);
825             tz               = _mm_mul_pd(fscal,dz20);
826
827             /* Update vectorial force */
828             fix2             = _mm_add_pd(fix2,tx);
829             fiy2             = _mm_add_pd(fiy2,ty);
830             fiz2             = _mm_add_pd(fiz2,tz);
831
832             fjx0             = _mm_add_pd(fjx0,tx);
833             fjy0             = _mm_add_pd(fjy0,ty);
834             fjz0             = _mm_add_pd(fjz0,tz);
835
836             /**************************
837              * CALCULATE INTERACTIONS *
838              **************************/
839
840             /* COULOMB ELECTROSTATICS */
841             velec            = _mm_mul_pd(qq21,rinv21);
842             felec            = _mm_mul_pd(velec,rinvsq21);
843
844             /* Update potential sum for this i atom from the interaction with this j atom. */
845             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
846             velecsum         = _mm_add_pd(velecsum,velec);
847
848             fscal            = felec;
849
850             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
851
852             /* Calculate temporary vectorial force */
853             tx               = _mm_mul_pd(fscal,dx21);
854             ty               = _mm_mul_pd(fscal,dy21);
855             tz               = _mm_mul_pd(fscal,dz21);
856
857             /* Update vectorial force */
858             fix2             = _mm_add_pd(fix2,tx);
859             fiy2             = _mm_add_pd(fiy2,ty);
860             fiz2             = _mm_add_pd(fiz2,tz);
861
862             fjx1             = _mm_add_pd(fjx1,tx);
863             fjy1             = _mm_add_pd(fjy1,ty);
864             fjz1             = _mm_add_pd(fjz1,tz);
865
866             /**************************
867              * CALCULATE INTERACTIONS *
868              **************************/
869
870             /* COULOMB ELECTROSTATICS */
871             velec            = _mm_mul_pd(qq22,rinv22);
872             felec            = _mm_mul_pd(velec,rinvsq22);
873
874             /* Update potential sum for this i atom from the interaction with this j atom. */
875             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
876             velecsum         = _mm_add_pd(velecsum,velec);
877
878             fscal            = felec;
879
880             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
881
882             /* Calculate temporary vectorial force */
883             tx               = _mm_mul_pd(fscal,dx22);
884             ty               = _mm_mul_pd(fscal,dy22);
885             tz               = _mm_mul_pd(fscal,dz22);
886
887             /* Update vectorial force */
888             fix2             = _mm_add_pd(fix2,tx);
889             fiy2             = _mm_add_pd(fiy2,ty);
890             fiz2             = _mm_add_pd(fiz2,tz);
891
892             fjx2             = _mm_add_pd(fjx2,tx);
893             fjy2             = _mm_add_pd(fjy2,ty);
894             fjz2             = _mm_add_pd(fjz2,tz);
895
896             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
897
898             /* Inner loop uses 264 flops */
899         }
900
901         /* End of innermost loop */
902
903         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
904                                               f+i_coord_offset,fshift+i_shift_offset);
905
906         ggid                        = gid[iidx];
907         /* Update potential energies */
908         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
909         gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
910
911         /* Increment number of inner iterations */
912         inneriter                  += j_index_end - j_index_start;
913
914         /* Outer loop uses 20 flops */
915     }
916
917     /* Increment number of outer iterations */
918     outeriter        += nri;
919
920     /* Update outer/inner flops */
921
922     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*264);
923 }
924 /*
925  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sse2_double
926  * Electrostatics interaction: Coulomb
927  * VdW interaction:            LennardJones
928  * Geometry:                   Water3-Water3
929  * Calculate force/pot:        Force
930  */
931 void
932 nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sse2_double
933                     (t_nblist                    * gmx_restrict       nlist,
934                      rvec                        * gmx_restrict          xx,
935                      rvec                        * gmx_restrict          ff,
936                      t_forcerec                  * gmx_restrict          fr,
937                      t_mdatoms                   * gmx_restrict     mdatoms,
938                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
939                      t_nrnb                      * gmx_restrict        nrnb)
940 {
941     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
942      * just 0 for non-waters.
943      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
944      * jnr indices corresponding to data put in the four positions in the SIMD register.
945      */
946     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
947     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
948     int              jnrA,jnrB;
949     int              j_coord_offsetA,j_coord_offsetB;
950     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
951     real             rcutoff_scalar;
952     real             *shiftvec,*fshift,*x,*f;
953     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
954     int              vdwioffset0;
955     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
956     int              vdwioffset1;
957     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
958     int              vdwioffset2;
959     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
960     int              vdwjidx0A,vdwjidx0B;
961     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
962     int              vdwjidx1A,vdwjidx1B;
963     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
964     int              vdwjidx2A,vdwjidx2B;
965     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
966     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
967     __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
968     __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
969     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
970     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
971     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
972     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
973     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
974     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
975     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
976     real             *charge;
977     int              nvdwtype;
978     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
979     int              *vdwtype;
980     real             *vdwparam;
981     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
982     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
983     __m128d          dummy_mask,cutoff_mask;
984     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
985     __m128d          one     = _mm_set1_pd(1.0);
986     __m128d          two     = _mm_set1_pd(2.0);
987     x                = xx[0];
988     f                = ff[0];
989
990     nri              = nlist->nri;
991     iinr             = nlist->iinr;
992     jindex           = nlist->jindex;
993     jjnr             = nlist->jjnr;
994     shiftidx         = nlist->shift;
995     gid              = nlist->gid;
996     shiftvec         = fr->shift_vec[0];
997     fshift           = fr->fshift[0];
998     facel            = _mm_set1_pd(fr->epsfac);
999     charge           = mdatoms->chargeA;
1000     nvdwtype         = fr->ntype;
1001     vdwparam         = fr->nbfp;
1002     vdwtype          = mdatoms->typeA;
1003
1004     /* Setup water-specific parameters */
1005     inr              = nlist->iinr[0];
1006     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1007     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1008     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1009     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1010
1011     jq0              = _mm_set1_pd(charge[inr+0]);
1012     jq1              = _mm_set1_pd(charge[inr+1]);
1013     jq2              = _mm_set1_pd(charge[inr+2]);
1014     vdwjidx0A        = 2*vdwtype[inr+0];
1015     qq00             = _mm_mul_pd(iq0,jq0);
1016     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1017     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1018     qq01             = _mm_mul_pd(iq0,jq1);
1019     qq02             = _mm_mul_pd(iq0,jq2);
1020     qq10             = _mm_mul_pd(iq1,jq0);
1021     qq11             = _mm_mul_pd(iq1,jq1);
1022     qq12             = _mm_mul_pd(iq1,jq2);
1023     qq20             = _mm_mul_pd(iq2,jq0);
1024     qq21             = _mm_mul_pd(iq2,jq1);
1025     qq22             = _mm_mul_pd(iq2,jq2);
1026
1027     /* Avoid stupid compiler warnings */
1028     jnrA = jnrB = 0;
1029     j_coord_offsetA = 0;
1030     j_coord_offsetB = 0;
1031
1032     outeriter        = 0;
1033     inneriter        = 0;
1034
1035     /* Start outer loop over neighborlists */
1036     for(iidx=0; iidx<nri; iidx++)
1037     {
1038         /* Load shift vector for this list */
1039         i_shift_offset   = DIM*shiftidx[iidx];
1040
1041         /* Load limits for loop over neighbors */
1042         j_index_start    = jindex[iidx];
1043         j_index_end      = jindex[iidx+1];
1044
1045         /* Get outer coordinate index */
1046         inr              = iinr[iidx];
1047         i_coord_offset   = DIM*inr;
1048
1049         /* Load i particle coords and add shift vector */
1050         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1051                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1052
1053         fix0             = _mm_setzero_pd();
1054         fiy0             = _mm_setzero_pd();
1055         fiz0             = _mm_setzero_pd();
1056         fix1             = _mm_setzero_pd();
1057         fiy1             = _mm_setzero_pd();
1058         fiz1             = _mm_setzero_pd();
1059         fix2             = _mm_setzero_pd();
1060         fiy2             = _mm_setzero_pd();
1061         fiz2             = _mm_setzero_pd();
1062
1063         /* Start inner kernel loop */
1064         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1065         {
1066
1067             /* Get j neighbor index, and coordinate index */
1068             jnrA             = jjnr[jidx];
1069             jnrB             = jjnr[jidx+1];
1070             j_coord_offsetA  = DIM*jnrA;
1071             j_coord_offsetB  = DIM*jnrB;
1072
1073             /* load j atom coordinates */
1074             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1075                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1076
1077             /* Calculate displacement vector */
1078             dx00             = _mm_sub_pd(ix0,jx0);
1079             dy00             = _mm_sub_pd(iy0,jy0);
1080             dz00             = _mm_sub_pd(iz0,jz0);
1081             dx01             = _mm_sub_pd(ix0,jx1);
1082             dy01             = _mm_sub_pd(iy0,jy1);
1083             dz01             = _mm_sub_pd(iz0,jz1);
1084             dx02             = _mm_sub_pd(ix0,jx2);
1085             dy02             = _mm_sub_pd(iy0,jy2);
1086             dz02             = _mm_sub_pd(iz0,jz2);
1087             dx10             = _mm_sub_pd(ix1,jx0);
1088             dy10             = _mm_sub_pd(iy1,jy0);
1089             dz10             = _mm_sub_pd(iz1,jz0);
1090             dx11             = _mm_sub_pd(ix1,jx1);
1091             dy11             = _mm_sub_pd(iy1,jy1);
1092             dz11             = _mm_sub_pd(iz1,jz1);
1093             dx12             = _mm_sub_pd(ix1,jx2);
1094             dy12             = _mm_sub_pd(iy1,jy2);
1095             dz12             = _mm_sub_pd(iz1,jz2);
1096             dx20             = _mm_sub_pd(ix2,jx0);
1097             dy20             = _mm_sub_pd(iy2,jy0);
1098             dz20             = _mm_sub_pd(iz2,jz0);
1099             dx21             = _mm_sub_pd(ix2,jx1);
1100             dy21             = _mm_sub_pd(iy2,jy1);
1101             dz21             = _mm_sub_pd(iz2,jz1);
1102             dx22             = _mm_sub_pd(ix2,jx2);
1103             dy22             = _mm_sub_pd(iy2,jy2);
1104             dz22             = _mm_sub_pd(iz2,jz2);
1105
1106             /* Calculate squared distance and things based on it */
1107             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1108             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1109             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1110             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1111             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1112             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1113             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1114             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1115             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1116
1117             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1118             rinv01           = gmx_mm_invsqrt_pd(rsq01);
1119             rinv02           = gmx_mm_invsqrt_pd(rsq02);
1120             rinv10           = gmx_mm_invsqrt_pd(rsq10);
1121             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1122             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1123             rinv20           = gmx_mm_invsqrt_pd(rsq20);
1124             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1125             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1126
1127             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
1128             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
1129             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
1130             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
1131             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1132             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1133             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
1134             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1135             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1136
1137             fjx0             = _mm_setzero_pd();
1138             fjy0             = _mm_setzero_pd();
1139             fjz0             = _mm_setzero_pd();
1140             fjx1             = _mm_setzero_pd();
1141             fjy1             = _mm_setzero_pd();
1142             fjz1             = _mm_setzero_pd();
1143             fjx2             = _mm_setzero_pd();
1144             fjy2             = _mm_setzero_pd();
1145             fjz2             = _mm_setzero_pd();
1146
1147             /**************************
1148              * CALCULATE INTERACTIONS *
1149              **************************/
1150
1151             /* COULOMB ELECTROSTATICS */
1152             velec            = _mm_mul_pd(qq00,rinv00);
1153             felec            = _mm_mul_pd(velec,rinvsq00);
1154
1155             /* LENNARD-JONES DISPERSION/REPULSION */
1156
1157             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1158             fvdw             = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1159
1160             fscal            = _mm_add_pd(felec,fvdw);
1161
1162             /* Calculate temporary vectorial force */
1163             tx               = _mm_mul_pd(fscal,dx00);
1164             ty               = _mm_mul_pd(fscal,dy00);
1165             tz               = _mm_mul_pd(fscal,dz00);
1166
1167             /* Update vectorial force */
1168             fix0             = _mm_add_pd(fix0,tx);
1169             fiy0             = _mm_add_pd(fiy0,ty);
1170             fiz0             = _mm_add_pd(fiz0,tz);
1171
1172             fjx0             = _mm_add_pd(fjx0,tx);
1173             fjy0             = _mm_add_pd(fjy0,ty);
1174             fjz0             = _mm_add_pd(fjz0,tz);
1175
1176             /**************************
1177              * CALCULATE INTERACTIONS *
1178              **************************/
1179
1180             /* COULOMB ELECTROSTATICS */
1181             velec            = _mm_mul_pd(qq01,rinv01);
1182             felec            = _mm_mul_pd(velec,rinvsq01);
1183
1184             fscal            = felec;
1185
1186             /* Calculate temporary vectorial force */
1187             tx               = _mm_mul_pd(fscal,dx01);
1188             ty               = _mm_mul_pd(fscal,dy01);
1189             tz               = _mm_mul_pd(fscal,dz01);
1190
1191             /* Update vectorial force */
1192             fix0             = _mm_add_pd(fix0,tx);
1193             fiy0             = _mm_add_pd(fiy0,ty);
1194             fiz0             = _mm_add_pd(fiz0,tz);
1195
1196             fjx1             = _mm_add_pd(fjx1,tx);
1197             fjy1             = _mm_add_pd(fjy1,ty);
1198             fjz1             = _mm_add_pd(fjz1,tz);
1199
1200             /**************************
1201              * CALCULATE INTERACTIONS *
1202              **************************/
1203
1204             /* COULOMB ELECTROSTATICS */
1205             velec            = _mm_mul_pd(qq02,rinv02);
1206             felec            = _mm_mul_pd(velec,rinvsq02);
1207
1208             fscal            = felec;
1209
1210             /* Calculate temporary vectorial force */
1211             tx               = _mm_mul_pd(fscal,dx02);
1212             ty               = _mm_mul_pd(fscal,dy02);
1213             tz               = _mm_mul_pd(fscal,dz02);
1214
1215             /* Update vectorial force */
1216             fix0             = _mm_add_pd(fix0,tx);
1217             fiy0             = _mm_add_pd(fiy0,ty);
1218             fiz0             = _mm_add_pd(fiz0,tz);
1219
1220             fjx2             = _mm_add_pd(fjx2,tx);
1221             fjy2             = _mm_add_pd(fjy2,ty);
1222             fjz2             = _mm_add_pd(fjz2,tz);
1223
1224             /**************************
1225              * CALCULATE INTERACTIONS *
1226              **************************/
1227
1228             /* COULOMB ELECTROSTATICS */
1229             velec            = _mm_mul_pd(qq10,rinv10);
1230             felec            = _mm_mul_pd(velec,rinvsq10);
1231
1232             fscal            = felec;
1233
1234             /* Calculate temporary vectorial force */
1235             tx               = _mm_mul_pd(fscal,dx10);
1236             ty               = _mm_mul_pd(fscal,dy10);
1237             tz               = _mm_mul_pd(fscal,dz10);
1238
1239             /* Update vectorial force */
1240             fix1             = _mm_add_pd(fix1,tx);
1241             fiy1             = _mm_add_pd(fiy1,ty);
1242             fiz1             = _mm_add_pd(fiz1,tz);
1243
1244             fjx0             = _mm_add_pd(fjx0,tx);
1245             fjy0             = _mm_add_pd(fjy0,ty);
1246             fjz0             = _mm_add_pd(fjz0,tz);
1247
1248             /**************************
1249              * CALCULATE INTERACTIONS *
1250              **************************/
1251
1252             /* COULOMB ELECTROSTATICS */
1253             velec            = _mm_mul_pd(qq11,rinv11);
1254             felec            = _mm_mul_pd(velec,rinvsq11);
1255
1256             fscal            = felec;
1257
1258             /* Calculate temporary vectorial force */
1259             tx               = _mm_mul_pd(fscal,dx11);
1260             ty               = _mm_mul_pd(fscal,dy11);
1261             tz               = _mm_mul_pd(fscal,dz11);
1262
1263             /* Update vectorial force */
1264             fix1             = _mm_add_pd(fix1,tx);
1265             fiy1             = _mm_add_pd(fiy1,ty);
1266             fiz1             = _mm_add_pd(fiz1,tz);
1267
1268             fjx1             = _mm_add_pd(fjx1,tx);
1269             fjy1             = _mm_add_pd(fjy1,ty);
1270             fjz1             = _mm_add_pd(fjz1,tz);
1271
1272             /**************************
1273              * CALCULATE INTERACTIONS *
1274              **************************/
1275
1276             /* COULOMB ELECTROSTATICS */
1277             velec            = _mm_mul_pd(qq12,rinv12);
1278             felec            = _mm_mul_pd(velec,rinvsq12);
1279
1280             fscal            = felec;
1281
1282             /* Calculate temporary vectorial force */
1283             tx               = _mm_mul_pd(fscal,dx12);
1284             ty               = _mm_mul_pd(fscal,dy12);
1285             tz               = _mm_mul_pd(fscal,dz12);
1286
1287             /* Update vectorial force */
1288             fix1             = _mm_add_pd(fix1,tx);
1289             fiy1             = _mm_add_pd(fiy1,ty);
1290             fiz1             = _mm_add_pd(fiz1,tz);
1291
1292             fjx2             = _mm_add_pd(fjx2,tx);
1293             fjy2             = _mm_add_pd(fjy2,ty);
1294             fjz2             = _mm_add_pd(fjz2,tz);
1295
1296             /**************************
1297              * CALCULATE INTERACTIONS *
1298              **************************/
1299
1300             /* COULOMB ELECTROSTATICS */
1301             velec            = _mm_mul_pd(qq20,rinv20);
1302             felec            = _mm_mul_pd(velec,rinvsq20);
1303
1304             fscal            = felec;
1305
1306             /* Calculate temporary vectorial force */
1307             tx               = _mm_mul_pd(fscal,dx20);
1308             ty               = _mm_mul_pd(fscal,dy20);
1309             tz               = _mm_mul_pd(fscal,dz20);
1310
1311             /* Update vectorial force */
1312             fix2             = _mm_add_pd(fix2,tx);
1313             fiy2             = _mm_add_pd(fiy2,ty);
1314             fiz2             = _mm_add_pd(fiz2,tz);
1315
1316             fjx0             = _mm_add_pd(fjx0,tx);
1317             fjy0             = _mm_add_pd(fjy0,ty);
1318             fjz0             = _mm_add_pd(fjz0,tz);
1319
1320             /**************************
1321              * CALCULATE INTERACTIONS *
1322              **************************/
1323
1324             /* COULOMB ELECTROSTATICS */
1325             velec            = _mm_mul_pd(qq21,rinv21);
1326             felec            = _mm_mul_pd(velec,rinvsq21);
1327
1328             fscal            = felec;
1329
1330             /* Calculate temporary vectorial force */
1331             tx               = _mm_mul_pd(fscal,dx21);
1332             ty               = _mm_mul_pd(fscal,dy21);
1333             tz               = _mm_mul_pd(fscal,dz21);
1334
1335             /* Update vectorial force */
1336             fix2             = _mm_add_pd(fix2,tx);
1337             fiy2             = _mm_add_pd(fiy2,ty);
1338             fiz2             = _mm_add_pd(fiz2,tz);
1339
1340             fjx1             = _mm_add_pd(fjx1,tx);
1341             fjy1             = _mm_add_pd(fjy1,ty);
1342             fjz1             = _mm_add_pd(fjz1,tz);
1343
1344             /**************************
1345              * CALCULATE INTERACTIONS *
1346              **************************/
1347
1348             /* COULOMB ELECTROSTATICS */
1349             velec            = _mm_mul_pd(qq22,rinv22);
1350             felec            = _mm_mul_pd(velec,rinvsq22);
1351
1352             fscal            = felec;
1353
1354             /* Calculate temporary vectorial force */
1355             tx               = _mm_mul_pd(fscal,dx22);
1356             ty               = _mm_mul_pd(fscal,dy22);
1357             tz               = _mm_mul_pd(fscal,dz22);
1358
1359             /* Update vectorial force */
1360             fix2             = _mm_add_pd(fix2,tx);
1361             fiy2             = _mm_add_pd(fiy2,ty);
1362             fiz2             = _mm_add_pd(fiz2,tz);
1363
1364             fjx2             = _mm_add_pd(fjx2,tx);
1365             fjy2             = _mm_add_pd(fjy2,ty);
1366             fjz2             = _mm_add_pd(fjz2,tz);
1367
1368             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1369
1370             /* Inner loop uses 250 flops */
1371         }
1372
1373         if(jidx<j_index_end)
1374         {
1375
1376             jnrA             = jjnr[jidx];
1377             j_coord_offsetA  = DIM*jnrA;
1378
1379             /* load j atom coordinates */
1380             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1381                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1382
1383             /* Calculate displacement vector */
1384             dx00             = _mm_sub_pd(ix0,jx0);
1385             dy00             = _mm_sub_pd(iy0,jy0);
1386             dz00             = _mm_sub_pd(iz0,jz0);
1387             dx01             = _mm_sub_pd(ix0,jx1);
1388             dy01             = _mm_sub_pd(iy0,jy1);
1389             dz01             = _mm_sub_pd(iz0,jz1);
1390             dx02             = _mm_sub_pd(ix0,jx2);
1391             dy02             = _mm_sub_pd(iy0,jy2);
1392             dz02             = _mm_sub_pd(iz0,jz2);
1393             dx10             = _mm_sub_pd(ix1,jx0);
1394             dy10             = _mm_sub_pd(iy1,jy0);
1395             dz10             = _mm_sub_pd(iz1,jz0);
1396             dx11             = _mm_sub_pd(ix1,jx1);
1397             dy11             = _mm_sub_pd(iy1,jy1);
1398             dz11             = _mm_sub_pd(iz1,jz1);
1399             dx12             = _mm_sub_pd(ix1,jx2);
1400             dy12             = _mm_sub_pd(iy1,jy2);
1401             dz12             = _mm_sub_pd(iz1,jz2);
1402             dx20             = _mm_sub_pd(ix2,jx0);
1403             dy20             = _mm_sub_pd(iy2,jy0);
1404             dz20             = _mm_sub_pd(iz2,jz0);
1405             dx21             = _mm_sub_pd(ix2,jx1);
1406             dy21             = _mm_sub_pd(iy2,jy1);
1407             dz21             = _mm_sub_pd(iz2,jz1);
1408             dx22             = _mm_sub_pd(ix2,jx2);
1409             dy22             = _mm_sub_pd(iy2,jy2);
1410             dz22             = _mm_sub_pd(iz2,jz2);
1411
1412             /* Calculate squared distance and things based on it */
1413             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1414             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1415             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1416             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1417             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1418             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1419             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1420             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1421             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1422
1423             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1424             rinv01           = gmx_mm_invsqrt_pd(rsq01);
1425             rinv02           = gmx_mm_invsqrt_pd(rsq02);
1426             rinv10           = gmx_mm_invsqrt_pd(rsq10);
1427             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1428             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1429             rinv20           = gmx_mm_invsqrt_pd(rsq20);
1430             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1431             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1432
1433             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
1434             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
1435             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
1436             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
1437             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1438             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1439             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
1440             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1441             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1442
1443             fjx0             = _mm_setzero_pd();
1444             fjy0             = _mm_setzero_pd();
1445             fjz0             = _mm_setzero_pd();
1446             fjx1             = _mm_setzero_pd();
1447             fjy1             = _mm_setzero_pd();
1448             fjz1             = _mm_setzero_pd();
1449             fjx2             = _mm_setzero_pd();
1450             fjy2             = _mm_setzero_pd();
1451             fjz2             = _mm_setzero_pd();
1452
1453             /**************************
1454              * CALCULATE INTERACTIONS *
1455              **************************/
1456
1457             /* COULOMB ELECTROSTATICS */
1458             velec            = _mm_mul_pd(qq00,rinv00);
1459             felec            = _mm_mul_pd(velec,rinvsq00);
1460
1461             /* LENNARD-JONES DISPERSION/REPULSION */
1462
1463             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1464             fvdw             = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1465
1466             fscal            = _mm_add_pd(felec,fvdw);
1467
1468             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1469
1470             /* Calculate temporary vectorial force */
1471             tx               = _mm_mul_pd(fscal,dx00);
1472             ty               = _mm_mul_pd(fscal,dy00);
1473             tz               = _mm_mul_pd(fscal,dz00);
1474
1475             /* Update vectorial force */
1476             fix0             = _mm_add_pd(fix0,tx);
1477             fiy0             = _mm_add_pd(fiy0,ty);
1478             fiz0             = _mm_add_pd(fiz0,tz);
1479
1480             fjx0             = _mm_add_pd(fjx0,tx);
1481             fjy0             = _mm_add_pd(fjy0,ty);
1482             fjz0             = _mm_add_pd(fjz0,tz);
1483
1484             /**************************
1485              * CALCULATE INTERACTIONS *
1486              **************************/
1487
1488             /* COULOMB ELECTROSTATICS */
1489             velec            = _mm_mul_pd(qq01,rinv01);
1490             felec            = _mm_mul_pd(velec,rinvsq01);
1491
1492             fscal            = felec;
1493
1494             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1495
1496             /* Calculate temporary vectorial force */
1497             tx               = _mm_mul_pd(fscal,dx01);
1498             ty               = _mm_mul_pd(fscal,dy01);
1499             tz               = _mm_mul_pd(fscal,dz01);
1500
1501             /* Update vectorial force */
1502             fix0             = _mm_add_pd(fix0,tx);
1503             fiy0             = _mm_add_pd(fiy0,ty);
1504             fiz0             = _mm_add_pd(fiz0,tz);
1505
1506             fjx1             = _mm_add_pd(fjx1,tx);
1507             fjy1             = _mm_add_pd(fjy1,ty);
1508             fjz1             = _mm_add_pd(fjz1,tz);
1509
1510             /**************************
1511              * CALCULATE INTERACTIONS *
1512              **************************/
1513
1514             /* COULOMB ELECTROSTATICS */
1515             velec            = _mm_mul_pd(qq02,rinv02);
1516             felec            = _mm_mul_pd(velec,rinvsq02);
1517
1518             fscal            = felec;
1519
1520             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1521
1522             /* Calculate temporary vectorial force */
1523             tx               = _mm_mul_pd(fscal,dx02);
1524             ty               = _mm_mul_pd(fscal,dy02);
1525             tz               = _mm_mul_pd(fscal,dz02);
1526
1527             /* Update vectorial force */
1528             fix0             = _mm_add_pd(fix0,tx);
1529             fiy0             = _mm_add_pd(fiy0,ty);
1530             fiz0             = _mm_add_pd(fiz0,tz);
1531
1532             fjx2             = _mm_add_pd(fjx2,tx);
1533             fjy2             = _mm_add_pd(fjy2,ty);
1534             fjz2             = _mm_add_pd(fjz2,tz);
1535
1536             /**************************
1537              * CALCULATE INTERACTIONS *
1538              **************************/
1539
1540             /* COULOMB ELECTROSTATICS */
1541             velec            = _mm_mul_pd(qq10,rinv10);
1542             felec            = _mm_mul_pd(velec,rinvsq10);
1543
1544             fscal            = felec;
1545
1546             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1547
1548             /* Calculate temporary vectorial force */
1549             tx               = _mm_mul_pd(fscal,dx10);
1550             ty               = _mm_mul_pd(fscal,dy10);
1551             tz               = _mm_mul_pd(fscal,dz10);
1552
1553             /* Update vectorial force */
1554             fix1             = _mm_add_pd(fix1,tx);
1555             fiy1             = _mm_add_pd(fiy1,ty);
1556             fiz1             = _mm_add_pd(fiz1,tz);
1557
1558             fjx0             = _mm_add_pd(fjx0,tx);
1559             fjy0             = _mm_add_pd(fjy0,ty);
1560             fjz0             = _mm_add_pd(fjz0,tz);
1561
1562             /**************************
1563              * CALCULATE INTERACTIONS *
1564              **************************/
1565
1566             /* COULOMB ELECTROSTATICS */
1567             velec            = _mm_mul_pd(qq11,rinv11);
1568             felec            = _mm_mul_pd(velec,rinvsq11);
1569
1570             fscal            = felec;
1571
1572             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1573
1574             /* Calculate temporary vectorial force */
1575             tx               = _mm_mul_pd(fscal,dx11);
1576             ty               = _mm_mul_pd(fscal,dy11);
1577             tz               = _mm_mul_pd(fscal,dz11);
1578
1579             /* Update vectorial force */
1580             fix1             = _mm_add_pd(fix1,tx);
1581             fiy1             = _mm_add_pd(fiy1,ty);
1582             fiz1             = _mm_add_pd(fiz1,tz);
1583
1584             fjx1             = _mm_add_pd(fjx1,tx);
1585             fjy1             = _mm_add_pd(fjy1,ty);
1586             fjz1             = _mm_add_pd(fjz1,tz);
1587
1588             /**************************
1589              * CALCULATE INTERACTIONS *
1590              **************************/
1591
1592             /* COULOMB ELECTROSTATICS */
1593             velec            = _mm_mul_pd(qq12,rinv12);
1594             felec            = _mm_mul_pd(velec,rinvsq12);
1595
1596             fscal            = felec;
1597
1598             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1599
1600             /* Calculate temporary vectorial force */
1601             tx               = _mm_mul_pd(fscal,dx12);
1602             ty               = _mm_mul_pd(fscal,dy12);
1603             tz               = _mm_mul_pd(fscal,dz12);
1604
1605             /* Update vectorial force */
1606             fix1             = _mm_add_pd(fix1,tx);
1607             fiy1             = _mm_add_pd(fiy1,ty);
1608             fiz1             = _mm_add_pd(fiz1,tz);
1609
1610             fjx2             = _mm_add_pd(fjx2,tx);
1611             fjy2             = _mm_add_pd(fjy2,ty);
1612             fjz2             = _mm_add_pd(fjz2,tz);
1613
1614             /**************************
1615              * CALCULATE INTERACTIONS *
1616              **************************/
1617
1618             /* COULOMB ELECTROSTATICS */
1619             velec            = _mm_mul_pd(qq20,rinv20);
1620             felec            = _mm_mul_pd(velec,rinvsq20);
1621
1622             fscal            = felec;
1623
1624             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1625
1626             /* Calculate temporary vectorial force */
1627             tx               = _mm_mul_pd(fscal,dx20);
1628             ty               = _mm_mul_pd(fscal,dy20);
1629             tz               = _mm_mul_pd(fscal,dz20);
1630
1631             /* Update vectorial force */
1632             fix2             = _mm_add_pd(fix2,tx);
1633             fiy2             = _mm_add_pd(fiy2,ty);
1634             fiz2             = _mm_add_pd(fiz2,tz);
1635
1636             fjx0             = _mm_add_pd(fjx0,tx);
1637             fjy0             = _mm_add_pd(fjy0,ty);
1638             fjz0             = _mm_add_pd(fjz0,tz);
1639
1640             /**************************
1641              * CALCULATE INTERACTIONS *
1642              **************************/
1643
1644             /* COULOMB ELECTROSTATICS */
1645             velec            = _mm_mul_pd(qq21,rinv21);
1646             felec            = _mm_mul_pd(velec,rinvsq21);
1647
1648             fscal            = felec;
1649
1650             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1651
1652             /* Calculate temporary vectorial force */
1653             tx               = _mm_mul_pd(fscal,dx21);
1654             ty               = _mm_mul_pd(fscal,dy21);
1655             tz               = _mm_mul_pd(fscal,dz21);
1656
1657             /* Update vectorial force */
1658             fix2             = _mm_add_pd(fix2,tx);
1659             fiy2             = _mm_add_pd(fiy2,ty);
1660             fiz2             = _mm_add_pd(fiz2,tz);
1661
1662             fjx1             = _mm_add_pd(fjx1,tx);
1663             fjy1             = _mm_add_pd(fjy1,ty);
1664             fjz1             = _mm_add_pd(fjz1,tz);
1665
1666             /**************************
1667              * CALCULATE INTERACTIONS *
1668              **************************/
1669
1670             /* COULOMB ELECTROSTATICS */
1671             velec            = _mm_mul_pd(qq22,rinv22);
1672             felec            = _mm_mul_pd(velec,rinvsq22);
1673
1674             fscal            = felec;
1675
1676             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1677
1678             /* Calculate temporary vectorial force */
1679             tx               = _mm_mul_pd(fscal,dx22);
1680             ty               = _mm_mul_pd(fscal,dy22);
1681             tz               = _mm_mul_pd(fscal,dz22);
1682
1683             /* Update vectorial force */
1684             fix2             = _mm_add_pd(fix2,tx);
1685             fiy2             = _mm_add_pd(fiy2,ty);
1686             fiz2             = _mm_add_pd(fiz2,tz);
1687
1688             fjx2             = _mm_add_pd(fjx2,tx);
1689             fjy2             = _mm_add_pd(fjy2,ty);
1690             fjz2             = _mm_add_pd(fjz2,tz);
1691
1692             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1693
1694             /* Inner loop uses 250 flops */
1695         }
1696
1697         /* End of innermost loop */
1698
1699         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1700                                               f+i_coord_offset,fshift+i_shift_offset);
1701
1702         /* Increment number of inner iterations */
1703         inneriter                  += j_index_end - j_index_start;
1704
1705         /* Outer loop uses 18 flops */
1706     }
1707
1708     /* Increment number of outer iterations */
1709     outeriter        += nri;
1710
1711     /* Update outer/inner flops */
1712
1713     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*250);
1714 }