Remove all unnecessary HAVE_CONFIG_H
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_double / nb_kernel_ElecCoul_VdwLJ_GeomW4W4_sse2_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse2_double kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "nrnb.h"
46
47 #include "gromacs/simd/math_x86_sse2_double.h"
48 #include "kernelutil_x86_sse2_double.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sse2_double
52  * Electrostatics interaction: Coulomb
53  * VdW interaction:            LennardJones
54  * Geometry:                   Water4-Water4
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sse2_double
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68      * just 0 for non-waters.
69      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB;
75     int              j_coord_offsetA,j_coord_offsetB;
76     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
77     real             rcutoff_scalar;
78     real             *shiftvec,*fshift,*x,*f;
79     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
80     int              vdwioffset0;
81     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
82     int              vdwioffset1;
83     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
84     int              vdwioffset2;
85     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
86     int              vdwioffset3;
87     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
88     int              vdwjidx0A,vdwjidx0B;
89     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90     int              vdwjidx1A,vdwjidx1B;
91     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92     int              vdwjidx2A,vdwjidx2B;
93     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94     int              vdwjidx3A,vdwjidx3B;
95     __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
96     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
97     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
98     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
99     __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
100     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
101     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
102     __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
103     __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
104     __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
105     __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
106     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
107     real             *charge;
108     int              nvdwtype;
109     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
110     int              *vdwtype;
111     real             *vdwparam;
112     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
113     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
114     __m128d          dummy_mask,cutoff_mask;
115     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
116     __m128d          one     = _mm_set1_pd(1.0);
117     __m128d          two     = _mm_set1_pd(2.0);
118     x                = xx[0];
119     f                = ff[0];
120
121     nri              = nlist->nri;
122     iinr             = nlist->iinr;
123     jindex           = nlist->jindex;
124     jjnr             = nlist->jjnr;
125     shiftidx         = nlist->shift;
126     gid              = nlist->gid;
127     shiftvec         = fr->shift_vec[0];
128     fshift           = fr->fshift[0];
129     facel            = _mm_set1_pd(fr->epsfac);
130     charge           = mdatoms->chargeA;
131     nvdwtype         = fr->ntype;
132     vdwparam         = fr->nbfp;
133     vdwtype          = mdatoms->typeA;
134
135     /* Setup water-specific parameters */
136     inr              = nlist->iinr[0];
137     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
138     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
139     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
140     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
141
142     jq1              = _mm_set1_pd(charge[inr+1]);
143     jq2              = _mm_set1_pd(charge[inr+2]);
144     jq3              = _mm_set1_pd(charge[inr+3]);
145     vdwjidx0A        = 2*vdwtype[inr+0];
146     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
147     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
148     qq11             = _mm_mul_pd(iq1,jq1);
149     qq12             = _mm_mul_pd(iq1,jq2);
150     qq13             = _mm_mul_pd(iq1,jq3);
151     qq21             = _mm_mul_pd(iq2,jq1);
152     qq22             = _mm_mul_pd(iq2,jq2);
153     qq23             = _mm_mul_pd(iq2,jq3);
154     qq31             = _mm_mul_pd(iq3,jq1);
155     qq32             = _mm_mul_pd(iq3,jq2);
156     qq33             = _mm_mul_pd(iq3,jq3);
157
158     /* Avoid stupid compiler warnings */
159     jnrA = jnrB = 0;
160     j_coord_offsetA = 0;
161     j_coord_offsetB = 0;
162
163     outeriter        = 0;
164     inneriter        = 0;
165
166     /* Start outer loop over neighborlists */
167     for(iidx=0; iidx<nri; iidx++)
168     {
169         /* Load shift vector for this list */
170         i_shift_offset   = DIM*shiftidx[iidx];
171
172         /* Load limits for loop over neighbors */
173         j_index_start    = jindex[iidx];
174         j_index_end      = jindex[iidx+1];
175
176         /* Get outer coordinate index */
177         inr              = iinr[iidx];
178         i_coord_offset   = DIM*inr;
179
180         /* Load i particle coords and add shift vector */
181         gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
182                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
183
184         fix0             = _mm_setzero_pd();
185         fiy0             = _mm_setzero_pd();
186         fiz0             = _mm_setzero_pd();
187         fix1             = _mm_setzero_pd();
188         fiy1             = _mm_setzero_pd();
189         fiz1             = _mm_setzero_pd();
190         fix2             = _mm_setzero_pd();
191         fiy2             = _mm_setzero_pd();
192         fiz2             = _mm_setzero_pd();
193         fix3             = _mm_setzero_pd();
194         fiy3             = _mm_setzero_pd();
195         fiz3             = _mm_setzero_pd();
196
197         /* Reset potential sums */
198         velecsum         = _mm_setzero_pd();
199         vvdwsum          = _mm_setzero_pd();
200
201         /* Start inner kernel loop */
202         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
203         {
204
205             /* Get j neighbor index, and coordinate index */
206             jnrA             = jjnr[jidx];
207             jnrB             = jjnr[jidx+1];
208             j_coord_offsetA  = DIM*jnrA;
209             j_coord_offsetB  = DIM*jnrB;
210
211             /* load j atom coordinates */
212             gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
213                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
214                                               &jy2,&jz2,&jx3,&jy3,&jz3);
215
216             /* Calculate displacement vector */
217             dx00             = _mm_sub_pd(ix0,jx0);
218             dy00             = _mm_sub_pd(iy0,jy0);
219             dz00             = _mm_sub_pd(iz0,jz0);
220             dx11             = _mm_sub_pd(ix1,jx1);
221             dy11             = _mm_sub_pd(iy1,jy1);
222             dz11             = _mm_sub_pd(iz1,jz1);
223             dx12             = _mm_sub_pd(ix1,jx2);
224             dy12             = _mm_sub_pd(iy1,jy2);
225             dz12             = _mm_sub_pd(iz1,jz2);
226             dx13             = _mm_sub_pd(ix1,jx3);
227             dy13             = _mm_sub_pd(iy1,jy3);
228             dz13             = _mm_sub_pd(iz1,jz3);
229             dx21             = _mm_sub_pd(ix2,jx1);
230             dy21             = _mm_sub_pd(iy2,jy1);
231             dz21             = _mm_sub_pd(iz2,jz1);
232             dx22             = _mm_sub_pd(ix2,jx2);
233             dy22             = _mm_sub_pd(iy2,jy2);
234             dz22             = _mm_sub_pd(iz2,jz2);
235             dx23             = _mm_sub_pd(ix2,jx3);
236             dy23             = _mm_sub_pd(iy2,jy3);
237             dz23             = _mm_sub_pd(iz2,jz3);
238             dx31             = _mm_sub_pd(ix3,jx1);
239             dy31             = _mm_sub_pd(iy3,jy1);
240             dz31             = _mm_sub_pd(iz3,jz1);
241             dx32             = _mm_sub_pd(ix3,jx2);
242             dy32             = _mm_sub_pd(iy3,jy2);
243             dz32             = _mm_sub_pd(iz3,jz2);
244             dx33             = _mm_sub_pd(ix3,jx3);
245             dy33             = _mm_sub_pd(iy3,jy3);
246             dz33             = _mm_sub_pd(iz3,jz3);
247
248             /* Calculate squared distance and things based on it */
249             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
250             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
251             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
252             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
253             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
254             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
255             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
256             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
257             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
258             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
259
260             rinv11           = gmx_mm_invsqrt_pd(rsq11);
261             rinv12           = gmx_mm_invsqrt_pd(rsq12);
262             rinv13           = gmx_mm_invsqrt_pd(rsq13);
263             rinv21           = gmx_mm_invsqrt_pd(rsq21);
264             rinv22           = gmx_mm_invsqrt_pd(rsq22);
265             rinv23           = gmx_mm_invsqrt_pd(rsq23);
266             rinv31           = gmx_mm_invsqrt_pd(rsq31);
267             rinv32           = gmx_mm_invsqrt_pd(rsq32);
268             rinv33           = gmx_mm_invsqrt_pd(rsq33);
269
270             rinvsq00         = gmx_mm_inv_pd(rsq00);
271             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
272             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
273             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
274             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
275             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
276             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
277             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
278             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
279             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
280
281             fjx0             = _mm_setzero_pd();
282             fjy0             = _mm_setzero_pd();
283             fjz0             = _mm_setzero_pd();
284             fjx1             = _mm_setzero_pd();
285             fjy1             = _mm_setzero_pd();
286             fjz1             = _mm_setzero_pd();
287             fjx2             = _mm_setzero_pd();
288             fjy2             = _mm_setzero_pd();
289             fjz2             = _mm_setzero_pd();
290             fjx3             = _mm_setzero_pd();
291             fjy3             = _mm_setzero_pd();
292             fjz3             = _mm_setzero_pd();
293
294             /**************************
295              * CALCULATE INTERACTIONS *
296              **************************/
297
298             /* LENNARD-JONES DISPERSION/REPULSION */
299
300             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
301             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
302             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
303             vvdw             = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
304             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
305
306             /* Update potential sum for this i atom from the interaction with this j atom. */
307             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
308
309             fscal            = fvdw;
310
311             /* Calculate temporary vectorial force */
312             tx               = _mm_mul_pd(fscal,dx00);
313             ty               = _mm_mul_pd(fscal,dy00);
314             tz               = _mm_mul_pd(fscal,dz00);
315
316             /* Update vectorial force */
317             fix0             = _mm_add_pd(fix0,tx);
318             fiy0             = _mm_add_pd(fiy0,ty);
319             fiz0             = _mm_add_pd(fiz0,tz);
320
321             fjx0             = _mm_add_pd(fjx0,tx);
322             fjy0             = _mm_add_pd(fjy0,ty);
323             fjz0             = _mm_add_pd(fjz0,tz);
324
325             /**************************
326              * CALCULATE INTERACTIONS *
327              **************************/
328
329             /* COULOMB ELECTROSTATICS */
330             velec            = _mm_mul_pd(qq11,rinv11);
331             felec            = _mm_mul_pd(velec,rinvsq11);
332
333             /* Update potential sum for this i atom from the interaction with this j atom. */
334             velecsum         = _mm_add_pd(velecsum,velec);
335
336             fscal            = felec;
337
338             /* Calculate temporary vectorial force */
339             tx               = _mm_mul_pd(fscal,dx11);
340             ty               = _mm_mul_pd(fscal,dy11);
341             tz               = _mm_mul_pd(fscal,dz11);
342
343             /* Update vectorial force */
344             fix1             = _mm_add_pd(fix1,tx);
345             fiy1             = _mm_add_pd(fiy1,ty);
346             fiz1             = _mm_add_pd(fiz1,tz);
347
348             fjx1             = _mm_add_pd(fjx1,tx);
349             fjy1             = _mm_add_pd(fjy1,ty);
350             fjz1             = _mm_add_pd(fjz1,tz);
351
352             /**************************
353              * CALCULATE INTERACTIONS *
354              **************************/
355
356             /* COULOMB ELECTROSTATICS */
357             velec            = _mm_mul_pd(qq12,rinv12);
358             felec            = _mm_mul_pd(velec,rinvsq12);
359
360             /* Update potential sum for this i atom from the interaction with this j atom. */
361             velecsum         = _mm_add_pd(velecsum,velec);
362
363             fscal            = felec;
364
365             /* Calculate temporary vectorial force */
366             tx               = _mm_mul_pd(fscal,dx12);
367             ty               = _mm_mul_pd(fscal,dy12);
368             tz               = _mm_mul_pd(fscal,dz12);
369
370             /* Update vectorial force */
371             fix1             = _mm_add_pd(fix1,tx);
372             fiy1             = _mm_add_pd(fiy1,ty);
373             fiz1             = _mm_add_pd(fiz1,tz);
374
375             fjx2             = _mm_add_pd(fjx2,tx);
376             fjy2             = _mm_add_pd(fjy2,ty);
377             fjz2             = _mm_add_pd(fjz2,tz);
378
379             /**************************
380              * CALCULATE INTERACTIONS *
381              **************************/
382
383             /* COULOMB ELECTROSTATICS */
384             velec            = _mm_mul_pd(qq13,rinv13);
385             felec            = _mm_mul_pd(velec,rinvsq13);
386
387             /* Update potential sum for this i atom from the interaction with this j atom. */
388             velecsum         = _mm_add_pd(velecsum,velec);
389
390             fscal            = felec;
391
392             /* Calculate temporary vectorial force */
393             tx               = _mm_mul_pd(fscal,dx13);
394             ty               = _mm_mul_pd(fscal,dy13);
395             tz               = _mm_mul_pd(fscal,dz13);
396
397             /* Update vectorial force */
398             fix1             = _mm_add_pd(fix1,tx);
399             fiy1             = _mm_add_pd(fiy1,ty);
400             fiz1             = _mm_add_pd(fiz1,tz);
401
402             fjx3             = _mm_add_pd(fjx3,tx);
403             fjy3             = _mm_add_pd(fjy3,ty);
404             fjz3             = _mm_add_pd(fjz3,tz);
405
406             /**************************
407              * CALCULATE INTERACTIONS *
408              **************************/
409
410             /* COULOMB ELECTROSTATICS */
411             velec            = _mm_mul_pd(qq21,rinv21);
412             felec            = _mm_mul_pd(velec,rinvsq21);
413
414             /* Update potential sum for this i atom from the interaction with this j atom. */
415             velecsum         = _mm_add_pd(velecsum,velec);
416
417             fscal            = felec;
418
419             /* Calculate temporary vectorial force */
420             tx               = _mm_mul_pd(fscal,dx21);
421             ty               = _mm_mul_pd(fscal,dy21);
422             tz               = _mm_mul_pd(fscal,dz21);
423
424             /* Update vectorial force */
425             fix2             = _mm_add_pd(fix2,tx);
426             fiy2             = _mm_add_pd(fiy2,ty);
427             fiz2             = _mm_add_pd(fiz2,tz);
428
429             fjx1             = _mm_add_pd(fjx1,tx);
430             fjy1             = _mm_add_pd(fjy1,ty);
431             fjz1             = _mm_add_pd(fjz1,tz);
432
433             /**************************
434              * CALCULATE INTERACTIONS *
435              **************************/
436
437             /* COULOMB ELECTROSTATICS */
438             velec            = _mm_mul_pd(qq22,rinv22);
439             felec            = _mm_mul_pd(velec,rinvsq22);
440
441             /* Update potential sum for this i atom from the interaction with this j atom. */
442             velecsum         = _mm_add_pd(velecsum,velec);
443
444             fscal            = felec;
445
446             /* Calculate temporary vectorial force */
447             tx               = _mm_mul_pd(fscal,dx22);
448             ty               = _mm_mul_pd(fscal,dy22);
449             tz               = _mm_mul_pd(fscal,dz22);
450
451             /* Update vectorial force */
452             fix2             = _mm_add_pd(fix2,tx);
453             fiy2             = _mm_add_pd(fiy2,ty);
454             fiz2             = _mm_add_pd(fiz2,tz);
455
456             fjx2             = _mm_add_pd(fjx2,tx);
457             fjy2             = _mm_add_pd(fjy2,ty);
458             fjz2             = _mm_add_pd(fjz2,tz);
459
460             /**************************
461              * CALCULATE INTERACTIONS *
462              **************************/
463
464             /* COULOMB ELECTROSTATICS */
465             velec            = _mm_mul_pd(qq23,rinv23);
466             felec            = _mm_mul_pd(velec,rinvsq23);
467
468             /* Update potential sum for this i atom from the interaction with this j atom. */
469             velecsum         = _mm_add_pd(velecsum,velec);
470
471             fscal            = felec;
472
473             /* Calculate temporary vectorial force */
474             tx               = _mm_mul_pd(fscal,dx23);
475             ty               = _mm_mul_pd(fscal,dy23);
476             tz               = _mm_mul_pd(fscal,dz23);
477
478             /* Update vectorial force */
479             fix2             = _mm_add_pd(fix2,tx);
480             fiy2             = _mm_add_pd(fiy2,ty);
481             fiz2             = _mm_add_pd(fiz2,tz);
482
483             fjx3             = _mm_add_pd(fjx3,tx);
484             fjy3             = _mm_add_pd(fjy3,ty);
485             fjz3             = _mm_add_pd(fjz3,tz);
486
487             /**************************
488              * CALCULATE INTERACTIONS *
489              **************************/
490
491             /* COULOMB ELECTROSTATICS */
492             velec            = _mm_mul_pd(qq31,rinv31);
493             felec            = _mm_mul_pd(velec,rinvsq31);
494
495             /* Update potential sum for this i atom from the interaction with this j atom. */
496             velecsum         = _mm_add_pd(velecsum,velec);
497
498             fscal            = felec;
499
500             /* Calculate temporary vectorial force */
501             tx               = _mm_mul_pd(fscal,dx31);
502             ty               = _mm_mul_pd(fscal,dy31);
503             tz               = _mm_mul_pd(fscal,dz31);
504
505             /* Update vectorial force */
506             fix3             = _mm_add_pd(fix3,tx);
507             fiy3             = _mm_add_pd(fiy3,ty);
508             fiz3             = _mm_add_pd(fiz3,tz);
509
510             fjx1             = _mm_add_pd(fjx1,tx);
511             fjy1             = _mm_add_pd(fjy1,ty);
512             fjz1             = _mm_add_pd(fjz1,tz);
513
514             /**************************
515              * CALCULATE INTERACTIONS *
516              **************************/
517
518             /* COULOMB ELECTROSTATICS */
519             velec            = _mm_mul_pd(qq32,rinv32);
520             felec            = _mm_mul_pd(velec,rinvsq32);
521
522             /* Update potential sum for this i atom from the interaction with this j atom. */
523             velecsum         = _mm_add_pd(velecsum,velec);
524
525             fscal            = felec;
526
527             /* Calculate temporary vectorial force */
528             tx               = _mm_mul_pd(fscal,dx32);
529             ty               = _mm_mul_pd(fscal,dy32);
530             tz               = _mm_mul_pd(fscal,dz32);
531
532             /* Update vectorial force */
533             fix3             = _mm_add_pd(fix3,tx);
534             fiy3             = _mm_add_pd(fiy3,ty);
535             fiz3             = _mm_add_pd(fiz3,tz);
536
537             fjx2             = _mm_add_pd(fjx2,tx);
538             fjy2             = _mm_add_pd(fjy2,ty);
539             fjz2             = _mm_add_pd(fjz2,tz);
540
541             /**************************
542              * CALCULATE INTERACTIONS *
543              **************************/
544
545             /* COULOMB ELECTROSTATICS */
546             velec            = _mm_mul_pd(qq33,rinv33);
547             felec            = _mm_mul_pd(velec,rinvsq33);
548
549             /* Update potential sum for this i atom from the interaction with this j atom. */
550             velecsum         = _mm_add_pd(velecsum,velec);
551
552             fscal            = felec;
553
554             /* Calculate temporary vectorial force */
555             tx               = _mm_mul_pd(fscal,dx33);
556             ty               = _mm_mul_pd(fscal,dy33);
557             tz               = _mm_mul_pd(fscal,dz33);
558
559             /* Update vectorial force */
560             fix3             = _mm_add_pd(fix3,tx);
561             fiy3             = _mm_add_pd(fiy3,ty);
562             fiz3             = _mm_add_pd(fiz3,tz);
563
564             fjx3             = _mm_add_pd(fjx3,tx);
565             fjy3             = _mm_add_pd(fjy3,ty);
566             fjz3             = _mm_add_pd(fjz3,tz);
567
568             gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
569
570             /* Inner loop uses 287 flops */
571         }
572
573         if(jidx<j_index_end)
574         {
575
576             jnrA             = jjnr[jidx];
577             j_coord_offsetA  = DIM*jnrA;
578
579             /* load j atom coordinates */
580             gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
581                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
582                                               &jy2,&jz2,&jx3,&jy3,&jz3);
583
584             /* Calculate displacement vector */
585             dx00             = _mm_sub_pd(ix0,jx0);
586             dy00             = _mm_sub_pd(iy0,jy0);
587             dz00             = _mm_sub_pd(iz0,jz0);
588             dx11             = _mm_sub_pd(ix1,jx1);
589             dy11             = _mm_sub_pd(iy1,jy1);
590             dz11             = _mm_sub_pd(iz1,jz1);
591             dx12             = _mm_sub_pd(ix1,jx2);
592             dy12             = _mm_sub_pd(iy1,jy2);
593             dz12             = _mm_sub_pd(iz1,jz2);
594             dx13             = _mm_sub_pd(ix1,jx3);
595             dy13             = _mm_sub_pd(iy1,jy3);
596             dz13             = _mm_sub_pd(iz1,jz3);
597             dx21             = _mm_sub_pd(ix2,jx1);
598             dy21             = _mm_sub_pd(iy2,jy1);
599             dz21             = _mm_sub_pd(iz2,jz1);
600             dx22             = _mm_sub_pd(ix2,jx2);
601             dy22             = _mm_sub_pd(iy2,jy2);
602             dz22             = _mm_sub_pd(iz2,jz2);
603             dx23             = _mm_sub_pd(ix2,jx3);
604             dy23             = _mm_sub_pd(iy2,jy3);
605             dz23             = _mm_sub_pd(iz2,jz3);
606             dx31             = _mm_sub_pd(ix3,jx1);
607             dy31             = _mm_sub_pd(iy3,jy1);
608             dz31             = _mm_sub_pd(iz3,jz1);
609             dx32             = _mm_sub_pd(ix3,jx2);
610             dy32             = _mm_sub_pd(iy3,jy2);
611             dz32             = _mm_sub_pd(iz3,jz2);
612             dx33             = _mm_sub_pd(ix3,jx3);
613             dy33             = _mm_sub_pd(iy3,jy3);
614             dz33             = _mm_sub_pd(iz3,jz3);
615
616             /* Calculate squared distance and things based on it */
617             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
618             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
619             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
620             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
621             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
622             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
623             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
624             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
625             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
626             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
627
628             rinv11           = gmx_mm_invsqrt_pd(rsq11);
629             rinv12           = gmx_mm_invsqrt_pd(rsq12);
630             rinv13           = gmx_mm_invsqrt_pd(rsq13);
631             rinv21           = gmx_mm_invsqrt_pd(rsq21);
632             rinv22           = gmx_mm_invsqrt_pd(rsq22);
633             rinv23           = gmx_mm_invsqrt_pd(rsq23);
634             rinv31           = gmx_mm_invsqrt_pd(rsq31);
635             rinv32           = gmx_mm_invsqrt_pd(rsq32);
636             rinv33           = gmx_mm_invsqrt_pd(rsq33);
637
638             rinvsq00         = gmx_mm_inv_pd(rsq00);
639             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
640             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
641             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
642             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
643             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
644             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
645             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
646             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
647             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
648
649             fjx0             = _mm_setzero_pd();
650             fjy0             = _mm_setzero_pd();
651             fjz0             = _mm_setzero_pd();
652             fjx1             = _mm_setzero_pd();
653             fjy1             = _mm_setzero_pd();
654             fjz1             = _mm_setzero_pd();
655             fjx2             = _mm_setzero_pd();
656             fjy2             = _mm_setzero_pd();
657             fjz2             = _mm_setzero_pd();
658             fjx3             = _mm_setzero_pd();
659             fjy3             = _mm_setzero_pd();
660             fjz3             = _mm_setzero_pd();
661
662             /**************************
663              * CALCULATE INTERACTIONS *
664              **************************/
665
666             /* LENNARD-JONES DISPERSION/REPULSION */
667
668             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
669             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
670             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
671             vvdw             = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
672             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
673
674             /* Update potential sum for this i atom from the interaction with this j atom. */
675             vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
676             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
677
678             fscal            = fvdw;
679
680             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
681
682             /* Calculate temporary vectorial force */
683             tx               = _mm_mul_pd(fscal,dx00);
684             ty               = _mm_mul_pd(fscal,dy00);
685             tz               = _mm_mul_pd(fscal,dz00);
686
687             /* Update vectorial force */
688             fix0             = _mm_add_pd(fix0,tx);
689             fiy0             = _mm_add_pd(fiy0,ty);
690             fiz0             = _mm_add_pd(fiz0,tz);
691
692             fjx0             = _mm_add_pd(fjx0,tx);
693             fjy0             = _mm_add_pd(fjy0,ty);
694             fjz0             = _mm_add_pd(fjz0,tz);
695
696             /**************************
697              * CALCULATE INTERACTIONS *
698              **************************/
699
700             /* COULOMB ELECTROSTATICS */
701             velec            = _mm_mul_pd(qq11,rinv11);
702             felec            = _mm_mul_pd(velec,rinvsq11);
703
704             /* Update potential sum for this i atom from the interaction with this j atom. */
705             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
706             velecsum         = _mm_add_pd(velecsum,velec);
707
708             fscal            = felec;
709
710             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
711
712             /* Calculate temporary vectorial force */
713             tx               = _mm_mul_pd(fscal,dx11);
714             ty               = _mm_mul_pd(fscal,dy11);
715             tz               = _mm_mul_pd(fscal,dz11);
716
717             /* Update vectorial force */
718             fix1             = _mm_add_pd(fix1,tx);
719             fiy1             = _mm_add_pd(fiy1,ty);
720             fiz1             = _mm_add_pd(fiz1,tz);
721
722             fjx1             = _mm_add_pd(fjx1,tx);
723             fjy1             = _mm_add_pd(fjy1,ty);
724             fjz1             = _mm_add_pd(fjz1,tz);
725
726             /**************************
727              * CALCULATE INTERACTIONS *
728              **************************/
729
730             /* COULOMB ELECTROSTATICS */
731             velec            = _mm_mul_pd(qq12,rinv12);
732             felec            = _mm_mul_pd(velec,rinvsq12);
733
734             /* Update potential sum for this i atom from the interaction with this j atom. */
735             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
736             velecsum         = _mm_add_pd(velecsum,velec);
737
738             fscal            = felec;
739
740             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
741
742             /* Calculate temporary vectorial force */
743             tx               = _mm_mul_pd(fscal,dx12);
744             ty               = _mm_mul_pd(fscal,dy12);
745             tz               = _mm_mul_pd(fscal,dz12);
746
747             /* Update vectorial force */
748             fix1             = _mm_add_pd(fix1,tx);
749             fiy1             = _mm_add_pd(fiy1,ty);
750             fiz1             = _mm_add_pd(fiz1,tz);
751
752             fjx2             = _mm_add_pd(fjx2,tx);
753             fjy2             = _mm_add_pd(fjy2,ty);
754             fjz2             = _mm_add_pd(fjz2,tz);
755
756             /**************************
757              * CALCULATE INTERACTIONS *
758              **************************/
759
760             /* COULOMB ELECTROSTATICS */
761             velec            = _mm_mul_pd(qq13,rinv13);
762             felec            = _mm_mul_pd(velec,rinvsq13);
763
764             /* Update potential sum for this i atom from the interaction with this j atom. */
765             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
766             velecsum         = _mm_add_pd(velecsum,velec);
767
768             fscal            = felec;
769
770             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
771
772             /* Calculate temporary vectorial force */
773             tx               = _mm_mul_pd(fscal,dx13);
774             ty               = _mm_mul_pd(fscal,dy13);
775             tz               = _mm_mul_pd(fscal,dz13);
776
777             /* Update vectorial force */
778             fix1             = _mm_add_pd(fix1,tx);
779             fiy1             = _mm_add_pd(fiy1,ty);
780             fiz1             = _mm_add_pd(fiz1,tz);
781
782             fjx3             = _mm_add_pd(fjx3,tx);
783             fjy3             = _mm_add_pd(fjy3,ty);
784             fjz3             = _mm_add_pd(fjz3,tz);
785
786             /**************************
787              * CALCULATE INTERACTIONS *
788              **************************/
789
790             /* COULOMB ELECTROSTATICS */
791             velec            = _mm_mul_pd(qq21,rinv21);
792             felec            = _mm_mul_pd(velec,rinvsq21);
793
794             /* Update potential sum for this i atom from the interaction with this j atom. */
795             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
796             velecsum         = _mm_add_pd(velecsum,velec);
797
798             fscal            = felec;
799
800             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
801
802             /* Calculate temporary vectorial force */
803             tx               = _mm_mul_pd(fscal,dx21);
804             ty               = _mm_mul_pd(fscal,dy21);
805             tz               = _mm_mul_pd(fscal,dz21);
806
807             /* Update vectorial force */
808             fix2             = _mm_add_pd(fix2,tx);
809             fiy2             = _mm_add_pd(fiy2,ty);
810             fiz2             = _mm_add_pd(fiz2,tz);
811
812             fjx1             = _mm_add_pd(fjx1,tx);
813             fjy1             = _mm_add_pd(fjy1,ty);
814             fjz1             = _mm_add_pd(fjz1,tz);
815
816             /**************************
817              * CALCULATE INTERACTIONS *
818              **************************/
819
820             /* COULOMB ELECTROSTATICS */
821             velec            = _mm_mul_pd(qq22,rinv22);
822             felec            = _mm_mul_pd(velec,rinvsq22);
823
824             /* Update potential sum for this i atom from the interaction with this j atom. */
825             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
826             velecsum         = _mm_add_pd(velecsum,velec);
827
828             fscal            = felec;
829
830             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
831
832             /* Calculate temporary vectorial force */
833             tx               = _mm_mul_pd(fscal,dx22);
834             ty               = _mm_mul_pd(fscal,dy22);
835             tz               = _mm_mul_pd(fscal,dz22);
836
837             /* Update vectorial force */
838             fix2             = _mm_add_pd(fix2,tx);
839             fiy2             = _mm_add_pd(fiy2,ty);
840             fiz2             = _mm_add_pd(fiz2,tz);
841
842             fjx2             = _mm_add_pd(fjx2,tx);
843             fjy2             = _mm_add_pd(fjy2,ty);
844             fjz2             = _mm_add_pd(fjz2,tz);
845
846             /**************************
847              * CALCULATE INTERACTIONS *
848              **************************/
849
850             /* COULOMB ELECTROSTATICS */
851             velec            = _mm_mul_pd(qq23,rinv23);
852             felec            = _mm_mul_pd(velec,rinvsq23);
853
854             /* Update potential sum for this i atom from the interaction with this j atom. */
855             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
856             velecsum         = _mm_add_pd(velecsum,velec);
857
858             fscal            = felec;
859
860             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
861
862             /* Calculate temporary vectorial force */
863             tx               = _mm_mul_pd(fscal,dx23);
864             ty               = _mm_mul_pd(fscal,dy23);
865             tz               = _mm_mul_pd(fscal,dz23);
866
867             /* Update vectorial force */
868             fix2             = _mm_add_pd(fix2,tx);
869             fiy2             = _mm_add_pd(fiy2,ty);
870             fiz2             = _mm_add_pd(fiz2,tz);
871
872             fjx3             = _mm_add_pd(fjx3,tx);
873             fjy3             = _mm_add_pd(fjy3,ty);
874             fjz3             = _mm_add_pd(fjz3,tz);
875
876             /**************************
877              * CALCULATE INTERACTIONS *
878              **************************/
879
880             /* COULOMB ELECTROSTATICS */
881             velec            = _mm_mul_pd(qq31,rinv31);
882             felec            = _mm_mul_pd(velec,rinvsq31);
883
884             /* Update potential sum for this i atom from the interaction with this j atom. */
885             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
886             velecsum         = _mm_add_pd(velecsum,velec);
887
888             fscal            = felec;
889
890             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
891
892             /* Calculate temporary vectorial force */
893             tx               = _mm_mul_pd(fscal,dx31);
894             ty               = _mm_mul_pd(fscal,dy31);
895             tz               = _mm_mul_pd(fscal,dz31);
896
897             /* Update vectorial force */
898             fix3             = _mm_add_pd(fix3,tx);
899             fiy3             = _mm_add_pd(fiy3,ty);
900             fiz3             = _mm_add_pd(fiz3,tz);
901
902             fjx1             = _mm_add_pd(fjx1,tx);
903             fjy1             = _mm_add_pd(fjy1,ty);
904             fjz1             = _mm_add_pd(fjz1,tz);
905
906             /**************************
907              * CALCULATE INTERACTIONS *
908              **************************/
909
910             /* COULOMB ELECTROSTATICS */
911             velec            = _mm_mul_pd(qq32,rinv32);
912             felec            = _mm_mul_pd(velec,rinvsq32);
913
914             /* Update potential sum for this i atom from the interaction with this j atom. */
915             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
916             velecsum         = _mm_add_pd(velecsum,velec);
917
918             fscal            = felec;
919
920             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
921
922             /* Calculate temporary vectorial force */
923             tx               = _mm_mul_pd(fscal,dx32);
924             ty               = _mm_mul_pd(fscal,dy32);
925             tz               = _mm_mul_pd(fscal,dz32);
926
927             /* Update vectorial force */
928             fix3             = _mm_add_pd(fix3,tx);
929             fiy3             = _mm_add_pd(fiy3,ty);
930             fiz3             = _mm_add_pd(fiz3,tz);
931
932             fjx2             = _mm_add_pd(fjx2,tx);
933             fjy2             = _mm_add_pd(fjy2,ty);
934             fjz2             = _mm_add_pd(fjz2,tz);
935
936             /**************************
937              * CALCULATE INTERACTIONS *
938              **************************/
939
940             /* COULOMB ELECTROSTATICS */
941             velec            = _mm_mul_pd(qq33,rinv33);
942             felec            = _mm_mul_pd(velec,rinvsq33);
943
944             /* Update potential sum for this i atom from the interaction with this j atom. */
945             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
946             velecsum         = _mm_add_pd(velecsum,velec);
947
948             fscal            = felec;
949
950             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
951
952             /* Calculate temporary vectorial force */
953             tx               = _mm_mul_pd(fscal,dx33);
954             ty               = _mm_mul_pd(fscal,dy33);
955             tz               = _mm_mul_pd(fscal,dz33);
956
957             /* Update vectorial force */
958             fix3             = _mm_add_pd(fix3,tx);
959             fiy3             = _mm_add_pd(fiy3,ty);
960             fiz3             = _mm_add_pd(fiz3,tz);
961
962             fjx3             = _mm_add_pd(fjx3,tx);
963             fjy3             = _mm_add_pd(fjy3,ty);
964             fjz3             = _mm_add_pd(fjz3,tz);
965
966             gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
967
968             /* Inner loop uses 287 flops */
969         }
970
971         /* End of innermost loop */
972
973         gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
974                                               f+i_coord_offset,fshift+i_shift_offset);
975
976         ggid                        = gid[iidx];
977         /* Update potential energies */
978         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
979         gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
980
981         /* Increment number of inner iterations */
982         inneriter                  += j_index_end - j_index_start;
983
984         /* Outer loop uses 26 flops */
985     }
986
987     /* Increment number of outer iterations */
988     outeriter        += nri;
989
990     /* Update outer/inner flops */
991
992     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*287);
993 }
994 /*
995  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sse2_double
996  * Electrostatics interaction: Coulomb
997  * VdW interaction:            LennardJones
998  * Geometry:                   Water4-Water4
999  * Calculate force/pot:        Force
1000  */
1001 void
1002 nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sse2_double
1003                     (t_nblist                    * gmx_restrict       nlist,
1004                      rvec                        * gmx_restrict          xx,
1005                      rvec                        * gmx_restrict          ff,
1006                      t_forcerec                  * gmx_restrict          fr,
1007                      t_mdatoms                   * gmx_restrict     mdatoms,
1008                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1009                      t_nrnb                      * gmx_restrict        nrnb)
1010 {
1011     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1012      * just 0 for non-waters.
1013      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1014      * jnr indices corresponding to data put in the four positions in the SIMD register.
1015      */
1016     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1017     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1018     int              jnrA,jnrB;
1019     int              j_coord_offsetA,j_coord_offsetB;
1020     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1021     real             rcutoff_scalar;
1022     real             *shiftvec,*fshift,*x,*f;
1023     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1024     int              vdwioffset0;
1025     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1026     int              vdwioffset1;
1027     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1028     int              vdwioffset2;
1029     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1030     int              vdwioffset3;
1031     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1032     int              vdwjidx0A,vdwjidx0B;
1033     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1034     int              vdwjidx1A,vdwjidx1B;
1035     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1036     int              vdwjidx2A,vdwjidx2B;
1037     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1038     int              vdwjidx3A,vdwjidx3B;
1039     __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1040     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1041     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1042     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1043     __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1044     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1045     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1046     __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1047     __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1048     __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1049     __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1050     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
1051     real             *charge;
1052     int              nvdwtype;
1053     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1054     int              *vdwtype;
1055     real             *vdwparam;
1056     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
1057     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
1058     __m128d          dummy_mask,cutoff_mask;
1059     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1060     __m128d          one     = _mm_set1_pd(1.0);
1061     __m128d          two     = _mm_set1_pd(2.0);
1062     x                = xx[0];
1063     f                = ff[0];
1064
1065     nri              = nlist->nri;
1066     iinr             = nlist->iinr;
1067     jindex           = nlist->jindex;
1068     jjnr             = nlist->jjnr;
1069     shiftidx         = nlist->shift;
1070     gid              = nlist->gid;
1071     shiftvec         = fr->shift_vec[0];
1072     fshift           = fr->fshift[0];
1073     facel            = _mm_set1_pd(fr->epsfac);
1074     charge           = mdatoms->chargeA;
1075     nvdwtype         = fr->ntype;
1076     vdwparam         = fr->nbfp;
1077     vdwtype          = mdatoms->typeA;
1078
1079     /* Setup water-specific parameters */
1080     inr              = nlist->iinr[0];
1081     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1082     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1083     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1084     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1085
1086     jq1              = _mm_set1_pd(charge[inr+1]);
1087     jq2              = _mm_set1_pd(charge[inr+2]);
1088     jq3              = _mm_set1_pd(charge[inr+3]);
1089     vdwjidx0A        = 2*vdwtype[inr+0];
1090     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1091     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1092     qq11             = _mm_mul_pd(iq1,jq1);
1093     qq12             = _mm_mul_pd(iq1,jq2);
1094     qq13             = _mm_mul_pd(iq1,jq3);
1095     qq21             = _mm_mul_pd(iq2,jq1);
1096     qq22             = _mm_mul_pd(iq2,jq2);
1097     qq23             = _mm_mul_pd(iq2,jq3);
1098     qq31             = _mm_mul_pd(iq3,jq1);
1099     qq32             = _mm_mul_pd(iq3,jq2);
1100     qq33             = _mm_mul_pd(iq3,jq3);
1101
1102     /* Avoid stupid compiler warnings */
1103     jnrA = jnrB = 0;
1104     j_coord_offsetA = 0;
1105     j_coord_offsetB = 0;
1106
1107     outeriter        = 0;
1108     inneriter        = 0;
1109
1110     /* Start outer loop over neighborlists */
1111     for(iidx=0; iidx<nri; iidx++)
1112     {
1113         /* Load shift vector for this list */
1114         i_shift_offset   = DIM*shiftidx[iidx];
1115
1116         /* Load limits for loop over neighbors */
1117         j_index_start    = jindex[iidx];
1118         j_index_end      = jindex[iidx+1];
1119
1120         /* Get outer coordinate index */
1121         inr              = iinr[iidx];
1122         i_coord_offset   = DIM*inr;
1123
1124         /* Load i particle coords and add shift vector */
1125         gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1126                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1127
1128         fix0             = _mm_setzero_pd();
1129         fiy0             = _mm_setzero_pd();
1130         fiz0             = _mm_setzero_pd();
1131         fix1             = _mm_setzero_pd();
1132         fiy1             = _mm_setzero_pd();
1133         fiz1             = _mm_setzero_pd();
1134         fix2             = _mm_setzero_pd();
1135         fiy2             = _mm_setzero_pd();
1136         fiz2             = _mm_setzero_pd();
1137         fix3             = _mm_setzero_pd();
1138         fiy3             = _mm_setzero_pd();
1139         fiz3             = _mm_setzero_pd();
1140
1141         /* Start inner kernel loop */
1142         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1143         {
1144
1145             /* Get j neighbor index, and coordinate index */
1146             jnrA             = jjnr[jidx];
1147             jnrB             = jjnr[jidx+1];
1148             j_coord_offsetA  = DIM*jnrA;
1149             j_coord_offsetB  = DIM*jnrB;
1150
1151             /* load j atom coordinates */
1152             gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1153                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1154                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1155
1156             /* Calculate displacement vector */
1157             dx00             = _mm_sub_pd(ix0,jx0);
1158             dy00             = _mm_sub_pd(iy0,jy0);
1159             dz00             = _mm_sub_pd(iz0,jz0);
1160             dx11             = _mm_sub_pd(ix1,jx1);
1161             dy11             = _mm_sub_pd(iy1,jy1);
1162             dz11             = _mm_sub_pd(iz1,jz1);
1163             dx12             = _mm_sub_pd(ix1,jx2);
1164             dy12             = _mm_sub_pd(iy1,jy2);
1165             dz12             = _mm_sub_pd(iz1,jz2);
1166             dx13             = _mm_sub_pd(ix1,jx3);
1167             dy13             = _mm_sub_pd(iy1,jy3);
1168             dz13             = _mm_sub_pd(iz1,jz3);
1169             dx21             = _mm_sub_pd(ix2,jx1);
1170             dy21             = _mm_sub_pd(iy2,jy1);
1171             dz21             = _mm_sub_pd(iz2,jz1);
1172             dx22             = _mm_sub_pd(ix2,jx2);
1173             dy22             = _mm_sub_pd(iy2,jy2);
1174             dz22             = _mm_sub_pd(iz2,jz2);
1175             dx23             = _mm_sub_pd(ix2,jx3);
1176             dy23             = _mm_sub_pd(iy2,jy3);
1177             dz23             = _mm_sub_pd(iz2,jz3);
1178             dx31             = _mm_sub_pd(ix3,jx1);
1179             dy31             = _mm_sub_pd(iy3,jy1);
1180             dz31             = _mm_sub_pd(iz3,jz1);
1181             dx32             = _mm_sub_pd(ix3,jx2);
1182             dy32             = _mm_sub_pd(iy3,jy2);
1183             dz32             = _mm_sub_pd(iz3,jz2);
1184             dx33             = _mm_sub_pd(ix3,jx3);
1185             dy33             = _mm_sub_pd(iy3,jy3);
1186             dz33             = _mm_sub_pd(iz3,jz3);
1187
1188             /* Calculate squared distance and things based on it */
1189             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1190             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1191             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1192             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1193             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1194             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1195             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1196             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1197             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1198             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1199
1200             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1201             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1202             rinv13           = gmx_mm_invsqrt_pd(rsq13);
1203             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1204             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1205             rinv23           = gmx_mm_invsqrt_pd(rsq23);
1206             rinv31           = gmx_mm_invsqrt_pd(rsq31);
1207             rinv32           = gmx_mm_invsqrt_pd(rsq32);
1208             rinv33           = gmx_mm_invsqrt_pd(rsq33);
1209
1210             rinvsq00         = gmx_mm_inv_pd(rsq00);
1211             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1212             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1213             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
1214             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1215             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1216             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
1217             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
1218             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
1219             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
1220
1221             fjx0             = _mm_setzero_pd();
1222             fjy0             = _mm_setzero_pd();
1223             fjz0             = _mm_setzero_pd();
1224             fjx1             = _mm_setzero_pd();
1225             fjy1             = _mm_setzero_pd();
1226             fjz1             = _mm_setzero_pd();
1227             fjx2             = _mm_setzero_pd();
1228             fjy2             = _mm_setzero_pd();
1229             fjz2             = _mm_setzero_pd();
1230             fjx3             = _mm_setzero_pd();
1231             fjy3             = _mm_setzero_pd();
1232             fjz3             = _mm_setzero_pd();
1233
1234             /**************************
1235              * CALCULATE INTERACTIONS *
1236              **************************/
1237
1238             /* LENNARD-JONES DISPERSION/REPULSION */
1239
1240             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1241             fvdw             = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1242
1243             fscal            = fvdw;
1244
1245             /* Calculate temporary vectorial force */
1246             tx               = _mm_mul_pd(fscal,dx00);
1247             ty               = _mm_mul_pd(fscal,dy00);
1248             tz               = _mm_mul_pd(fscal,dz00);
1249
1250             /* Update vectorial force */
1251             fix0             = _mm_add_pd(fix0,tx);
1252             fiy0             = _mm_add_pd(fiy0,ty);
1253             fiz0             = _mm_add_pd(fiz0,tz);
1254
1255             fjx0             = _mm_add_pd(fjx0,tx);
1256             fjy0             = _mm_add_pd(fjy0,ty);
1257             fjz0             = _mm_add_pd(fjz0,tz);
1258
1259             /**************************
1260              * CALCULATE INTERACTIONS *
1261              **************************/
1262
1263             /* COULOMB ELECTROSTATICS */
1264             velec            = _mm_mul_pd(qq11,rinv11);
1265             felec            = _mm_mul_pd(velec,rinvsq11);
1266
1267             fscal            = felec;
1268
1269             /* Calculate temporary vectorial force */
1270             tx               = _mm_mul_pd(fscal,dx11);
1271             ty               = _mm_mul_pd(fscal,dy11);
1272             tz               = _mm_mul_pd(fscal,dz11);
1273
1274             /* Update vectorial force */
1275             fix1             = _mm_add_pd(fix1,tx);
1276             fiy1             = _mm_add_pd(fiy1,ty);
1277             fiz1             = _mm_add_pd(fiz1,tz);
1278
1279             fjx1             = _mm_add_pd(fjx1,tx);
1280             fjy1             = _mm_add_pd(fjy1,ty);
1281             fjz1             = _mm_add_pd(fjz1,tz);
1282
1283             /**************************
1284              * CALCULATE INTERACTIONS *
1285              **************************/
1286
1287             /* COULOMB ELECTROSTATICS */
1288             velec            = _mm_mul_pd(qq12,rinv12);
1289             felec            = _mm_mul_pd(velec,rinvsq12);
1290
1291             fscal            = felec;
1292
1293             /* Calculate temporary vectorial force */
1294             tx               = _mm_mul_pd(fscal,dx12);
1295             ty               = _mm_mul_pd(fscal,dy12);
1296             tz               = _mm_mul_pd(fscal,dz12);
1297
1298             /* Update vectorial force */
1299             fix1             = _mm_add_pd(fix1,tx);
1300             fiy1             = _mm_add_pd(fiy1,ty);
1301             fiz1             = _mm_add_pd(fiz1,tz);
1302
1303             fjx2             = _mm_add_pd(fjx2,tx);
1304             fjy2             = _mm_add_pd(fjy2,ty);
1305             fjz2             = _mm_add_pd(fjz2,tz);
1306
1307             /**************************
1308              * CALCULATE INTERACTIONS *
1309              **************************/
1310
1311             /* COULOMB ELECTROSTATICS */
1312             velec            = _mm_mul_pd(qq13,rinv13);
1313             felec            = _mm_mul_pd(velec,rinvsq13);
1314
1315             fscal            = felec;
1316
1317             /* Calculate temporary vectorial force */
1318             tx               = _mm_mul_pd(fscal,dx13);
1319             ty               = _mm_mul_pd(fscal,dy13);
1320             tz               = _mm_mul_pd(fscal,dz13);
1321
1322             /* Update vectorial force */
1323             fix1             = _mm_add_pd(fix1,tx);
1324             fiy1             = _mm_add_pd(fiy1,ty);
1325             fiz1             = _mm_add_pd(fiz1,tz);
1326
1327             fjx3             = _mm_add_pd(fjx3,tx);
1328             fjy3             = _mm_add_pd(fjy3,ty);
1329             fjz3             = _mm_add_pd(fjz3,tz);
1330
1331             /**************************
1332              * CALCULATE INTERACTIONS *
1333              **************************/
1334
1335             /* COULOMB ELECTROSTATICS */
1336             velec            = _mm_mul_pd(qq21,rinv21);
1337             felec            = _mm_mul_pd(velec,rinvsq21);
1338
1339             fscal            = felec;
1340
1341             /* Calculate temporary vectorial force */
1342             tx               = _mm_mul_pd(fscal,dx21);
1343             ty               = _mm_mul_pd(fscal,dy21);
1344             tz               = _mm_mul_pd(fscal,dz21);
1345
1346             /* Update vectorial force */
1347             fix2             = _mm_add_pd(fix2,tx);
1348             fiy2             = _mm_add_pd(fiy2,ty);
1349             fiz2             = _mm_add_pd(fiz2,tz);
1350
1351             fjx1             = _mm_add_pd(fjx1,tx);
1352             fjy1             = _mm_add_pd(fjy1,ty);
1353             fjz1             = _mm_add_pd(fjz1,tz);
1354
1355             /**************************
1356              * CALCULATE INTERACTIONS *
1357              **************************/
1358
1359             /* COULOMB ELECTROSTATICS */
1360             velec            = _mm_mul_pd(qq22,rinv22);
1361             felec            = _mm_mul_pd(velec,rinvsq22);
1362
1363             fscal            = felec;
1364
1365             /* Calculate temporary vectorial force */
1366             tx               = _mm_mul_pd(fscal,dx22);
1367             ty               = _mm_mul_pd(fscal,dy22);
1368             tz               = _mm_mul_pd(fscal,dz22);
1369
1370             /* Update vectorial force */
1371             fix2             = _mm_add_pd(fix2,tx);
1372             fiy2             = _mm_add_pd(fiy2,ty);
1373             fiz2             = _mm_add_pd(fiz2,tz);
1374
1375             fjx2             = _mm_add_pd(fjx2,tx);
1376             fjy2             = _mm_add_pd(fjy2,ty);
1377             fjz2             = _mm_add_pd(fjz2,tz);
1378
1379             /**************************
1380              * CALCULATE INTERACTIONS *
1381              **************************/
1382
1383             /* COULOMB ELECTROSTATICS */
1384             velec            = _mm_mul_pd(qq23,rinv23);
1385             felec            = _mm_mul_pd(velec,rinvsq23);
1386
1387             fscal            = felec;
1388
1389             /* Calculate temporary vectorial force */
1390             tx               = _mm_mul_pd(fscal,dx23);
1391             ty               = _mm_mul_pd(fscal,dy23);
1392             tz               = _mm_mul_pd(fscal,dz23);
1393
1394             /* Update vectorial force */
1395             fix2             = _mm_add_pd(fix2,tx);
1396             fiy2             = _mm_add_pd(fiy2,ty);
1397             fiz2             = _mm_add_pd(fiz2,tz);
1398
1399             fjx3             = _mm_add_pd(fjx3,tx);
1400             fjy3             = _mm_add_pd(fjy3,ty);
1401             fjz3             = _mm_add_pd(fjz3,tz);
1402
1403             /**************************
1404              * CALCULATE INTERACTIONS *
1405              **************************/
1406
1407             /* COULOMB ELECTROSTATICS */
1408             velec            = _mm_mul_pd(qq31,rinv31);
1409             felec            = _mm_mul_pd(velec,rinvsq31);
1410
1411             fscal            = felec;
1412
1413             /* Calculate temporary vectorial force */
1414             tx               = _mm_mul_pd(fscal,dx31);
1415             ty               = _mm_mul_pd(fscal,dy31);
1416             tz               = _mm_mul_pd(fscal,dz31);
1417
1418             /* Update vectorial force */
1419             fix3             = _mm_add_pd(fix3,tx);
1420             fiy3             = _mm_add_pd(fiy3,ty);
1421             fiz3             = _mm_add_pd(fiz3,tz);
1422
1423             fjx1             = _mm_add_pd(fjx1,tx);
1424             fjy1             = _mm_add_pd(fjy1,ty);
1425             fjz1             = _mm_add_pd(fjz1,tz);
1426
1427             /**************************
1428              * CALCULATE INTERACTIONS *
1429              **************************/
1430
1431             /* COULOMB ELECTROSTATICS */
1432             velec            = _mm_mul_pd(qq32,rinv32);
1433             felec            = _mm_mul_pd(velec,rinvsq32);
1434
1435             fscal            = felec;
1436
1437             /* Calculate temporary vectorial force */
1438             tx               = _mm_mul_pd(fscal,dx32);
1439             ty               = _mm_mul_pd(fscal,dy32);
1440             tz               = _mm_mul_pd(fscal,dz32);
1441
1442             /* Update vectorial force */
1443             fix3             = _mm_add_pd(fix3,tx);
1444             fiy3             = _mm_add_pd(fiy3,ty);
1445             fiz3             = _mm_add_pd(fiz3,tz);
1446
1447             fjx2             = _mm_add_pd(fjx2,tx);
1448             fjy2             = _mm_add_pd(fjy2,ty);
1449             fjz2             = _mm_add_pd(fjz2,tz);
1450
1451             /**************************
1452              * CALCULATE INTERACTIONS *
1453              **************************/
1454
1455             /* COULOMB ELECTROSTATICS */
1456             velec            = _mm_mul_pd(qq33,rinv33);
1457             felec            = _mm_mul_pd(velec,rinvsq33);
1458
1459             fscal            = felec;
1460
1461             /* Calculate temporary vectorial force */
1462             tx               = _mm_mul_pd(fscal,dx33);
1463             ty               = _mm_mul_pd(fscal,dy33);
1464             tz               = _mm_mul_pd(fscal,dz33);
1465
1466             /* Update vectorial force */
1467             fix3             = _mm_add_pd(fix3,tx);
1468             fiy3             = _mm_add_pd(fiy3,ty);
1469             fiz3             = _mm_add_pd(fiz3,tz);
1470
1471             fjx3             = _mm_add_pd(fjx3,tx);
1472             fjy3             = _mm_add_pd(fjy3,ty);
1473             fjz3             = _mm_add_pd(fjz3,tz);
1474
1475             gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1476
1477             /* Inner loop uses 273 flops */
1478         }
1479
1480         if(jidx<j_index_end)
1481         {
1482
1483             jnrA             = jjnr[jidx];
1484             j_coord_offsetA  = DIM*jnrA;
1485
1486             /* load j atom coordinates */
1487             gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1488                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1489                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1490
1491             /* Calculate displacement vector */
1492             dx00             = _mm_sub_pd(ix0,jx0);
1493             dy00             = _mm_sub_pd(iy0,jy0);
1494             dz00             = _mm_sub_pd(iz0,jz0);
1495             dx11             = _mm_sub_pd(ix1,jx1);
1496             dy11             = _mm_sub_pd(iy1,jy1);
1497             dz11             = _mm_sub_pd(iz1,jz1);
1498             dx12             = _mm_sub_pd(ix1,jx2);
1499             dy12             = _mm_sub_pd(iy1,jy2);
1500             dz12             = _mm_sub_pd(iz1,jz2);
1501             dx13             = _mm_sub_pd(ix1,jx3);
1502             dy13             = _mm_sub_pd(iy1,jy3);
1503             dz13             = _mm_sub_pd(iz1,jz3);
1504             dx21             = _mm_sub_pd(ix2,jx1);
1505             dy21             = _mm_sub_pd(iy2,jy1);
1506             dz21             = _mm_sub_pd(iz2,jz1);
1507             dx22             = _mm_sub_pd(ix2,jx2);
1508             dy22             = _mm_sub_pd(iy2,jy2);
1509             dz22             = _mm_sub_pd(iz2,jz2);
1510             dx23             = _mm_sub_pd(ix2,jx3);
1511             dy23             = _mm_sub_pd(iy2,jy3);
1512             dz23             = _mm_sub_pd(iz2,jz3);
1513             dx31             = _mm_sub_pd(ix3,jx1);
1514             dy31             = _mm_sub_pd(iy3,jy1);
1515             dz31             = _mm_sub_pd(iz3,jz1);
1516             dx32             = _mm_sub_pd(ix3,jx2);
1517             dy32             = _mm_sub_pd(iy3,jy2);
1518             dz32             = _mm_sub_pd(iz3,jz2);
1519             dx33             = _mm_sub_pd(ix3,jx3);
1520             dy33             = _mm_sub_pd(iy3,jy3);
1521             dz33             = _mm_sub_pd(iz3,jz3);
1522
1523             /* Calculate squared distance and things based on it */
1524             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1525             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1526             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1527             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1528             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1529             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1530             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1531             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1532             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1533             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1534
1535             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1536             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1537             rinv13           = gmx_mm_invsqrt_pd(rsq13);
1538             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1539             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1540             rinv23           = gmx_mm_invsqrt_pd(rsq23);
1541             rinv31           = gmx_mm_invsqrt_pd(rsq31);
1542             rinv32           = gmx_mm_invsqrt_pd(rsq32);
1543             rinv33           = gmx_mm_invsqrt_pd(rsq33);
1544
1545             rinvsq00         = gmx_mm_inv_pd(rsq00);
1546             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1547             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1548             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
1549             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1550             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1551             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
1552             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
1553             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
1554             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
1555
1556             fjx0             = _mm_setzero_pd();
1557             fjy0             = _mm_setzero_pd();
1558             fjz0             = _mm_setzero_pd();
1559             fjx1             = _mm_setzero_pd();
1560             fjy1             = _mm_setzero_pd();
1561             fjz1             = _mm_setzero_pd();
1562             fjx2             = _mm_setzero_pd();
1563             fjy2             = _mm_setzero_pd();
1564             fjz2             = _mm_setzero_pd();
1565             fjx3             = _mm_setzero_pd();
1566             fjy3             = _mm_setzero_pd();
1567             fjz3             = _mm_setzero_pd();
1568
1569             /**************************
1570              * CALCULATE INTERACTIONS *
1571              **************************/
1572
1573             /* LENNARD-JONES DISPERSION/REPULSION */
1574
1575             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1576             fvdw             = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1577
1578             fscal            = fvdw;
1579
1580             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1581
1582             /* Calculate temporary vectorial force */
1583             tx               = _mm_mul_pd(fscal,dx00);
1584             ty               = _mm_mul_pd(fscal,dy00);
1585             tz               = _mm_mul_pd(fscal,dz00);
1586
1587             /* Update vectorial force */
1588             fix0             = _mm_add_pd(fix0,tx);
1589             fiy0             = _mm_add_pd(fiy0,ty);
1590             fiz0             = _mm_add_pd(fiz0,tz);
1591
1592             fjx0             = _mm_add_pd(fjx0,tx);
1593             fjy0             = _mm_add_pd(fjy0,ty);
1594             fjz0             = _mm_add_pd(fjz0,tz);
1595
1596             /**************************
1597              * CALCULATE INTERACTIONS *
1598              **************************/
1599
1600             /* COULOMB ELECTROSTATICS */
1601             velec            = _mm_mul_pd(qq11,rinv11);
1602             felec            = _mm_mul_pd(velec,rinvsq11);
1603
1604             fscal            = felec;
1605
1606             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1607
1608             /* Calculate temporary vectorial force */
1609             tx               = _mm_mul_pd(fscal,dx11);
1610             ty               = _mm_mul_pd(fscal,dy11);
1611             tz               = _mm_mul_pd(fscal,dz11);
1612
1613             /* Update vectorial force */
1614             fix1             = _mm_add_pd(fix1,tx);
1615             fiy1             = _mm_add_pd(fiy1,ty);
1616             fiz1             = _mm_add_pd(fiz1,tz);
1617
1618             fjx1             = _mm_add_pd(fjx1,tx);
1619             fjy1             = _mm_add_pd(fjy1,ty);
1620             fjz1             = _mm_add_pd(fjz1,tz);
1621
1622             /**************************
1623              * CALCULATE INTERACTIONS *
1624              **************************/
1625
1626             /* COULOMB ELECTROSTATICS */
1627             velec            = _mm_mul_pd(qq12,rinv12);
1628             felec            = _mm_mul_pd(velec,rinvsq12);
1629
1630             fscal            = felec;
1631
1632             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1633
1634             /* Calculate temporary vectorial force */
1635             tx               = _mm_mul_pd(fscal,dx12);
1636             ty               = _mm_mul_pd(fscal,dy12);
1637             tz               = _mm_mul_pd(fscal,dz12);
1638
1639             /* Update vectorial force */
1640             fix1             = _mm_add_pd(fix1,tx);
1641             fiy1             = _mm_add_pd(fiy1,ty);
1642             fiz1             = _mm_add_pd(fiz1,tz);
1643
1644             fjx2             = _mm_add_pd(fjx2,tx);
1645             fjy2             = _mm_add_pd(fjy2,ty);
1646             fjz2             = _mm_add_pd(fjz2,tz);
1647
1648             /**************************
1649              * CALCULATE INTERACTIONS *
1650              **************************/
1651
1652             /* COULOMB ELECTROSTATICS */
1653             velec            = _mm_mul_pd(qq13,rinv13);
1654             felec            = _mm_mul_pd(velec,rinvsq13);
1655
1656             fscal            = felec;
1657
1658             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1659
1660             /* Calculate temporary vectorial force */
1661             tx               = _mm_mul_pd(fscal,dx13);
1662             ty               = _mm_mul_pd(fscal,dy13);
1663             tz               = _mm_mul_pd(fscal,dz13);
1664
1665             /* Update vectorial force */
1666             fix1             = _mm_add_pd(fix1,tx);
1667             fiy1             = _mm_add_pd(fiy1,ty);
1668             fiz1             = _mm_add_pd(fiz1,tz);
1669
1670             fjx3             = _mm_add_pd(fjx3,tx);
1671             fjy3             = _mm_add_pd(fjy3,ty);
1672             fjz3             = _mm_add_pd(fjz3,tz);
1673
1674             /**************************
1675              * CALCULATE INTERACTIONS *
1676              **************************/
1677
1678             /* COULOMB ELECTROSTATICS */
1679             velec            = _mm_mul_pd(qq21,rinv21);
1680             felec            = _mm_mul_pd(velec,rinvsq21);
1681
1682             fscal            = felec;
1683
1684             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1685
1686             /* Calculate temporary vectorial force */
1687             tx               = _mm_mul_pd(fscal,dx21);
1688             ty               = _mm_mul_pd(fscal,dy21);
1689             tz               = _mm_mul_pd(fscal,dz21);
1690
1691             /* Update vectorial force */
1692             fix2             = _mm_add_pd(fix2,tx);
1693             fiy2             = _mm_add_pd(fiy2,ty);
1694             fiz2             = _mm_add_pd(fiz2,tz);
1695
1696             fjx1             = _mm_add_pd(fjx1,tx);
1697             fjy1             = _mm_add_pd(fjy1,ty);
1698             fjz1             = _mm_add_pd(fjz1,tz);
1699
1700             /**************************
1701              * CALCULATE INTERACTIONS *
1702              **************************/
1703
1704             /* COULOMB ELECTROSTATICS */
1705             velec            = _mm_mul_pd(qq22,rinv22);
1706             felec            = _mm_mul_pd(velec,rinvsq22);
1707
1708             fscal            = felec;
1709
1710             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1711
1712             /* Calculate temporary vectorial force */
1713             tx               = _mm_mul_pd(fscal,dx22);
1714             ty               = _mm_mul_pd(fscal,dy22);
1715             tz               = _mm_mul_pd(fscal,dz22);
1716
1717             /* Update vectorial force */
1718             fix2             = _mm_add_pd(fix2,tx);
1719             fiy2             = _mm_add_pd(fiy2,ty);
1720             fiz2             = _mm_add_pd(fiz2,tz);
1721
1722             fjx2             = _mm_add_pd(fjx2,tx);
1723             fjy2             = _mm_add_pd(fjy2,ty);
1724             fjz2             = _mm_add_pd(fjz2,tz);
1725
1726             /**************************
1727              * CALCULATE INTERACTIONS *
1728              **************************/
1729
1730             /* COULOMB ELECTROSTATICS */
1731             velec            = _mm_mul_pd(qq23,rinv23);
1732             felec            = _mm_mul_pd(velec,rinvsq23);
1733
1734             fscal            = felec;
1735
1736             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1737
1738             /* Calculate temporary vectorial force */
1739             tx               = _mm_mul_pd(fscal,dx23);
1740             ty               = _mm_mul_pd(fscal,dy23);
1741             tz               = _mm_mul_pd(fscal,dz23);
1742
1743             /* Update vectorial force */
1744             fix2             = _mm_add_pd(fix2,tx);
1745             fiy2             = _mm_add_pd(fiy2,ty);
1746             fiz2             = _mm_add_pd(fiz2,tz);
1747
1748             fjx3             = _mm_add_pd(fjx3,tx);
1749             fjy3             = _mm_add_pd(fjy3,ty);
1750             fjz3             = _mm_add_pd(fjz3,tz);
1751
1752             /**************************
1753              * CALCULATE INTERACTIONS *
1754              **************************/
1755
1756             /* COULOMB ELECTROSTATICS */
1757             velec            = _mm_mul_pd(qq31,rinv31);
1758             felec            = _mm_mul_pd(velec,rinvsq31);
1759
1760             fscal            = felec;
1761
1762             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1763
1764             /* Calculate temporary vectorial force */
1765             tx               = _mm_mul_pd(fscal,dx31);
1766             ty               = _mm_mul_pd(fscal,dy31);
1767             tz               = _mm_mul_pd(fscal,dz31);
1768
1769             /* Update vectorial force */
1770             fix3             = _mm_add_pd(fix3,tx);
1771             fiy3             = _mm_add_pd(fiy3,ty);
1772             fiz3             = _mm_add_pd(fiz3,tz);
1773
1774             fjx1             = _mm_add_pd(fjx1,tx);
1775             fjy1             = _mm_add_pd(fjy1,ty);
1776             fjz1             = _mm_add_pd(fjz1,tz);
1777
1778             /**************************
1779              * CALCULATE INTERACTIONS *
1780              **************************/
1781
1782             /* COULOMB ELECTROSTATICS */
1783             velec            = _mm_mul_pd(qq32,rinv32);
1784             felec            = _mm_mul_pd(velec,rinvsq32);
1785
1786             fscal            = felec;
1787
1788             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1789
1790             /* Calculate temporary vectorial force */
1791             tx               = _mm_mul_pd(fscal,dx32);
1792             ty               = _mm_mul_pd(fscal,dy32);
1793             tz               = _mm_mul_pd(fscal,dz32);
1794
1795             /* Update vectorial force */
1796             fix3             = _mm_add_pd(fix3,tx);
1797             fiy3             = _mm_add_pd(fiy3,ty);
1798             fiz3             = _mm_add_pd(fiz3,tz);
1799
1800             fjx2             = _mm_add_pd(fjx2,tx);
1801             fjy2             = _mm_add_pd(fjy2,ty);
1802             fjz2             = _mm_add_pd(fjz2,tz);
1803
1804             /**************************
1805              * CALCULATE INTERACTIONS *
1806              **************************/
1807
1808             /* COULOMB ELECTROSTATICS */
1809             velec            = _mm_mul_pd(qq33,rinv33);
1810             felec            = _mm_mul_pd(velec,rinvsq33);
1811
1812             fscal            = felec;
1813
1814             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1815
1816             /* Calculate temporary vectorial force */
1817             tx               = _mm_mul_pd(fscal,dx33);
1818             ty               = _mm_mul_pd(fscal,dy33);
1819             tz               = _mm_mul_pd(fscal,dz33);
1820
1821             /* Update vectorial force */
1822             fix3             = _mm_add_pd(fix3,tx);
1823             fiy3             = _mm_add_pd(fiy3,ty);
1824             fiz3             = _mm_add_pd(fiz3,tz);
1825
1826             fjx3             = _mm_add_pd(fjx3,tx);
1827             fjy3             = _mm_add_pd(fjy3,ty);
1828             fjz3             = _mm_add_pd(fjz3,tz);
1829
1830             gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1831
1832             /* Inner loop uses 273 flops */
1833         }
1834
1835         /* End of innermost loop */
1836
1837         gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1838                                               f+i_coord_offset,fshift+i_shift_offset);
1839
1840         /* Increment number of inner iterations */
1841         inneriter                  += j_index_end - j_index_start;
1842
1843         /* Outer loop uses 24 flops */
1844     }
1845
1846     /* Increment number of outer iterations */
1847     outeriter        += nri;
1848
1849     /* Update outer/inner flops */
1850
1851     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*273);
1852 }