Use full path for legacyheaders
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse4_1_double / nb_kernel_ElecRF_VdwLJ_GeomW4W4_sse4_1_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
46
47 #include "gromacs/simd/math_x86_sse4_1_double.h"
48 #include "kernelutil_x86_sse4_1_double.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sse4_1_double
52  * Electrostatics interaction: ReactionField
53  * VdW interaction:            LennardJones
54  * Geometry:                   Water4-Water4
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sse4_1_double
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68      * just 0 for non-waters.
69      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB;
75     int              j_coord_offsetA,j_coord_offsetB;
76     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
77     real             rcutoff_scalar;
78     real             *shiftvec,*fshift,*x,*f;
79     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
80     int              vdwioffset0;
81     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
82     int              vdwioffset1;
83     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
84     int              vdwioffset2;
85     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
86     int              vdwioffset3;
87     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
88     int              vdwjidx0A,vdwjidx0B;
89     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90     int              vdwjidx1A,vdwjidx1B;
91     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92     int              vdwjidx2A,vdwjidx2B;
93     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94     int              vdwjidx3A,vdwjidx3B;
95     __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
96     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
97     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
98     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
99     __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
100     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
101     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
102     __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
103     __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
104     __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
105     __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
106     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
107     real             *charge;
108     int              nvdwtype;
109     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
110     int              *vdwtype;
111     real             *vdwparam;
112     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
113     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
114     __m128d          dummy_mask,cutoff_mask;
115     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
116     __m128d          one     = _mm_set1_pd(1.0);
117     __m128d          two     = _mm_set1_pd(2.0);
118     x                = xx[0];
119     f                = ff[0];
120
121     nri              = nlist->nri;
122     iinr             = nlist->iinr;
123     jindex           = nlist->jindex;
124     jjnr             = nlist->jjnr;
125     shiftidx         = nlist->shift;
126     gid              = nlist->gid;
127     shiftvec         = fr->shift_vec[0];
128     fshift           = fr->fshift[0];
129     facel            = _mm_set1_pd(fr->epsfac);
130     charge           = mdatoms->chargeA;
131     krf              = _mm_set1_pd(fr->ic->k_rf);
132     krf2             = _mm_set1_pd(fr->ic->k_rf*2.0);
133     crf              = _mm_set1_pd(fr->ic->c_rf);
134     nvdwtype         = fr->ntype;
135     vdwparam         = fr->nbfp;
136     vdwtype          = mdatoms->typeA;
137
138     /* Setup water-specific parameters */
139     inr              = nlist->iinr[0];
140     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
141     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
142     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
143     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
144
145     jq1              = _mm_set1_pd(charge[inr+1]);
146     jq2              = _mm_set1_pd(charge[inr+2]);
147     jq3              = _mm_set1_pd(charge[inr+3]);
148     vdwjidx0A        = 2*vdwtype[inr+0];
149     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
150     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
151     qq11             = _mm_mul_pd(iq1,jq1);
152     qq12             = _mm_mul_pd(iq1,jq2);
153     qq13             = _mm_mul_pd(iq1,jq3);
154     qq21             = _mm_mul_pd(iq2,jq1);
155     qq22             = _mm_mul_pd(iq2,jq2);
156     qq23             = _mm_mul_pd(iq2,jq3);
157     qq31             = _mm_mul_pd(iq3,jq1);
158     qq32             = _mm_mul_pd(iq3,jq2);
159     qq33             = _mm_mul_pd(iq3,jq3);
160
161     /* Avoid stupid compiler warnings */
162     jnrA = jnrB = 0;
163     j_coord_offsetA = 0;
164     j_coord_offsetB = 0;
165
166     outeriter        = 0;
167     inneriter        = 0;
168
169     /* Start outer loop over neighborlists */
170     for(iidx=0; iidx<nri; iidx++)
171     {
172         /* Load shift vector for this list */
173         i_shift_offset   = DIM*shiftidx[iidx];
174
175         /* Load limits for loop over neighbors */
176         j_index_start    = jindex[iidx];
177         j_index_end      = jindex[iidx+1];
178
179         /* Get outer coordinate index */
180         inr              = iinr[iidx];
181         i_coord_offset   = DIM*inr;
182
183         /* Load i particle coords and add shift vector */
184         gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
185                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
186
187         fix0             = _mm_setzero_pd();
188         fiy0             = _mm_setzero_pd();
189         fiz0             = _mm_setzero_pd();
190         fix1             = _mm_setzero_pd();
191         fiy1             = _mm_setzero_pd();
192         fiz1             = _mm_setzero_pd();
193         fix2             = _mm_setzero_pd();
194         fiy2             = _mm_setzero_pd();
195         fiz2             = _mm_setzero_pd();
196         fix3             = _mm_setzero_pd();
197         fiy3             = _mm_setzero_pd();
198         fiz3             = _mm_setzero_pd();
199
200         /* Reset potential sums */
201         velecsum         = _mm_setzero_pd();
202         vvdwsum          = _mm_setzero_pd();
203
204         /* Start inner kernel loop */
205         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
206         {
207
208             /* Get j neighbor index, and coordinate index */
209             jnrA             = jjnr[jidx];
210             jnrB             = jjnr[jidx+1];
211             j_coord_offsetA  = DIM*jnrA;
212             j_coord_offsetB  = DIM*jnrB;
213
214             /* load j atom coordinates */
215             gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
216                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
217                                               &jy2,&jz2,&jx3,&jy3,&jz3);
218
219             /* Calculate displacement vector */
220             dx00             = _mm_sub_pd(ix0,jx0);
221             dy00             = _mm_sub_pd(iy0,jy0);
222             dz00             = _mm_sub_pd(iz0,jz0);
223             dx11             = _mm_sub_pd(ix1,jx1);
224             dy11             = _mm_sub_pd(iy1,jy1);
225             dz11             = _mm_sub_pd(iz1,jz1);
226             dx12             = _mm_sub_pd(ix1,jx2);
227             dy12             = _mm_sub_pd(iy1,jy2);
228             dz12             = _mm_sub_pd(iz1,jz2);
229             dx13             = _mm_sub_pd(ix1,jx3);
230             dy13             = _mm_sub_pd(iy1,jy3);
231             dz13             = _mm_sub_pd(iz1,jz3);
232             dx21             = _mm_sub_pd(ix2,jx1);
233             dy21             = _mm_sub_pd(iy2,jy1);
234             dz21             = _mm_sub_pd(iz2,jz1);
235             dx22             = _mm_sub_pd(ix2,jx2);
236             dy22             = _mm_sub_pd(iy2,jy2);
237             dz22             = _mm_sub_pd(iz2,jz2);
238             dx23             = _mm_sub_pd(ix2,jx3);
239             dy23             = _mm_sub_pd(iy2,jy3);
240             dz23             = _mm_sub_pd(iz2,jz3);
241             dx31             = _mm_sub_pd(ix3,jx1);
242             dy31             = _mm_sub_pd(iy3,jy1);
243             dz31             = _mm_sub_pd(iz3,jz1);
244             dx32             = _mm_sub_pd(ix3,jx2);
245             dy32             = _mm_sub_pd(iy3,jy2);
246             dz32             = _mm_sub_pd(iz3,jz2);
247             dx33             = _mm_sub_pd(ix3,jx3);
248             dy33             = _mm_sub_pd(iy3,jy3);
249             dz33             = _mm_sub_pd(iz3,jz3);
250
251             /* Calculate squared distance and things based on it */
252             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
253             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
254             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
255             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
256             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
257             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
258             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
259             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
260             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
261             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
262
263             rinv11           = gmx_mm_invsqrt_pd(rsq11);
264             rinv12           = gmx_mm_invsqrt_pd(rsq12);
265             rinv13           = gmx_mm_invsqrt_pd(rsq13);
266             rinv21           = gmx_mm_invsqrt_pd(rsq21);
267             rinv22           = gmx_mm_invsqrt_pd(rsq22);
268             rinv23           = gmx_mm_invsqrt_pd(rsq23);
269             rinv31           = gmx_mm_invsqrt_pd(rsq31);
270             rinv32           = gmx_mm_invsqrt_pd(rsq32);
271             rinv33           = gmx_mm_invsqrt_pd(rsq33);
272
273             rinvsq00         = gmx_mm_inv_pd(rsq00);
274             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
275             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
276             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
277             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
278             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
279             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
280             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
281             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
282             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
283
284             fjx0             = _mm_setzero_pd();
285             fjy0             = _mm_setzero_pd();
286             fjz0             = _mm_setzero_pd();
287             fjx1             = _mm_setzero_pd();
288             fjy1             = _mm_setzero_pd();
289             fjz1             = _mm_setzero_pd();
290             fjx2             = _mm_setzero_pd();
291             fjy2             = _mm_setzero_pd();
292             fjz2             = _mm_setzero_pd();
293             fjx3             = _mm_setzero_pd();
294             fjy3             = _mm_setzero_pd();
295             fjz3             = _mm_setzero_pd();
296
297             /**************************
298              * CALCULATE INTERACTIONS *
299              **************************/
300
301             /* LENNARD-JONES DISPERSION/REPULSION */
302
303             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
304             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
305             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
306             vvdw             = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
307             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
308
309             /* Update potential sum for this i atom from the interaction with this j atom. */
310             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
311
312             fscal            = fvdw;
313
314             /* Calculate temporary vectorial force */
315             tx               = _mm_mul_pd(fscal,dx00);
316             ty               = _mm_mul_pd(fscal,dy00);
317             tz               = _mm_mul_pd(fscal,dz00);
318
319             /* Update vectorial force */
320             fix0             = _mm_add_pd(fix0,tx);
321             fiy0             = _mm_add_pd(fiy0,ty);
322             fiz0             = _mm_add_pd(fiz0,tz);
323
324             fjx0             = _mm_add_pd(fjx0,tx);
325             fjy0             = _mm_add_pd(fjy0,ty);
326             fjz0             = _mm_add_pd(fjz0,tz);
327
328             /**************************
329              * CALCULATE INTERACTIONS *
330              **************************/
331
332             /* REACTION-FIELD ELECTROSTATICS */
333             velec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_add_pd(rinv11,_mm_mul_pd(krf,rsq11)),crf));
334             felec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
335
336             /* Update potential sum for this i atom from the interaction with this j atom. */
337             velecsum         = _mm_add_pd(velecsum,velec);
338
339             fscal            = felec;
340
341             /* Calculate temporary vectorial force */
342             tx               = _mm_mul_pd(fscal,dx11);
343             ty               = _mm_mul_pd(fscal,dy11);
344             tz               = _mm_mul_pd(fscal,dz11);
345
346             /* Update vectorial force */
347             fix1             = _mm_add_pd(fix1,tx);
348             fiy1             = _mm_add_pd(fiy1,ty);
349             fiz1             = _mm_add_pd(fiz1,tz);
350
351             fjx1             = _mm_add_pd(fjx1,tx);
352             fjy1             = _mm_add_pd(fjy1,ty);
353             fjz1             = _mm_add_pd(fjz1,tz);
354
355             /**************************
356              * CALCULATE INTERACTIONS *
357              **************************/
358
359             /* REACTION-FIELD ELECTROSTATICS */
360             velec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_add_pd(rinv12,_mm_mul_pd(krf,rsq12)),crf));
361             felec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
362
363             /* Update potential sum for this i atom from the interaction with this j atom. */
364             velecsum         = _mm_add_pd(velecsum,velec);
365
366             fscal            = felec;
367
368             /* Calculate temporary vectorial force */
369             tx               = _mm_mul_pd(fscal,dx12);
370             ty               = _mm_mul_pd(fscal,dy12);
371             tz               = _mm_mul_pd(fscal,dz12);
372
373             /* Update vectorial force */
374             fix1             = _mm_add_pd(fix1,tx);
375             fiy1             = _mm_add_pd(fiy1,ty);
376             fiz1             = _mm_add_pd(fiz1,tz);
377
378             fjx2             = _mm_add_pd(fjx2,tx);
379             fjy2             = _mm_add_pd(fjy2,ty);
380             fjz2             = _mm_add_pd(fjz2,tz);
381
382             /**************************
383              * CALCULATE INTERACTIONS *
384              **************************/
385
386             /* REACTION-FIELD ELECTROSTATICS */
387             velec            = _mm_mul_pd(qq13,_mm_sub_pd(_mm_add_pd(rinv13,_mm_mul_pd(krf,rsq13)),crf));
388             felec            = _mm_mul_pd(qq13,_mm_sub_pd(_mm_mul_pd(rinv13,rinvsq13),krf2));
389
390             /* Update potential sum for this i atom from the interaction with this j atom. */
391             velecsum         = _mm_add_pd(velecsum,velec);
392
393             fscal            = felec;
394
395             /* Calculate temporary vectorial force */
396             tx               = _mm_mul_pd(fscal,dx13);
397             ty               = _mm_mul_pd(fscal,dy13);
398             tz               = _mm_mul_pd(fscal,dz13);
399
400             /* Update vectorial force */
401             fix1             = _mm_add_pd(fix1,tx);
402             fiy1             = _mm_add_pd(fiy1,ty);
403             fiz1             = _mm_add_pd(fiz1,tz);
404
405             fjx3             = _mm_add_pd(fjx3,tx);
406             fjy3             = _mm_add_pd(fjy3,ty);
407             fjz3             = _mm_add_pd(fjz3,tz);
408
409             /**************************
410              * CALCULATE INTERACTIONS *
411              **************************/
412
413             /* REACTION-FIELD ELECTROSTATICS */
414             velec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_add_pd(rinv21,_mm_mul_pd(krf,rsq21)),crf));
415             felec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
416
417             /* Update potential sum for this i atom from the interaction with this j atom. */
418             velecsum         = _mm_add_pd(velecsum,velec);
419
420             fscal            = felec;
421
422             /* Calculate temporary vectorial force */
423             tx               = _mm_mul_pd(fscal,dx21);
424             ty               = _mm_mul_pd(fscal,dy21);
425             tz               = _mm_mul_pd(fscal,dz21);
426
427             /* Update vectorial force */
428             fix2             = _mm_add_pd(fix2,tx);
429             fiy2             = _mm_add_pd(fiy2,ty);
430             fiz2             = _mm_add_pd(fiz2,tz);
431
432             fjx1             = _mm_add_pd(fjx1,tx);
433             fjy1             = _mm_add_pd(fjy1,ty);
434             fjz1             = _mm_add_pd(fjz1,tz);
435
436             /**************************
437              * CALCULATE INTERACTIONS *
438              **************************/
439
440             /* REACTION-FIELD ELECTROSTATICS */
441             velec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_add_pd(rinv22,_mm_mul_pd(krf,rsq22)),crf));
442             felec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
443
444             /* Update potential sum for this i atom from the interaction with this j atom. */
445             velecsum         = _mm_add_pd(velecsum,velec);
446
447             fscal            = felec;
448
449             /* Calculate temporary vectorial force */
450             tx               = _mm_mul_pd(fscal,dx22);
451             ty               = _mm_mul_pd(fscal,dy22);
452             tz               = _mm_mul_pd(fscal,dz22);
453
454             /* Update vectorial force */
455             fix2             = _mm_add_pd(fix2,tx);
456             fiy2             = _mm_add_pd(fiy2,ty);
457             fiz2             = _mm_add_pd(fiz2,tz);
458
459             fjx2             = _mm_add_pd(fjx2,tx);
460             fjy2             = _mm_add_pd(fjy2,ty);
461             fjz2             = _mm_add_pd(fjz2,tz);
462
463             /**************************
464              * CALCULATE INTERACTIONS *
465              **************************/
466
467             /* REACTION-FIELD ELECTROSTATICS */
468             velec            = _mm_mul_pd(qq23,_mm_sub_pd(_mm_add_pd(rinv23,_mm_mul_pd(krf,rsq23)),crf));
469             felec            = _mm_mul_pd(qq23,_mm_sub_pd(_mm_mul_pd(rinv23,rinvsq23),krf2));
470
471             /* Update potential sum for this i atom from the interaction with this j atom. */
472             velecsum         = _mm_add_pd(velecsum,velec);
473
474             fscal            = felec;
475
476             /* Calculate temporary vectorial force */
477             tx               = _mm_mul_pd(fscal,dx23);
478             ty               = _mm_mul_pd(fscal,dy23);
479             tz               = _mm_mul_pd(fscal,dz23);
480
481             /* Update vectorial force */
482             fix2             = _mm_add_pd(fix2,tx);
483             fiy2             = _mm_add_pd(fiy2,ty);
484             fiz2             = _mm_add_pd(fiz2,tz);
485
486             fjx3             = _mm_add_pd(fjx3,tx);
487             fjy3             = _mm_add_pd(fjy3,ty);
488             fjz3             = _mm_add_pd(fjz3,tz);
489
490             /**************************
491              * CALCULATE INTERACTIONS *
492              **************************/
493
494             /* REACTION-FIELD ELECTROSTATICS */
495             velec            = _mm_mul_pd(qq31,_mm_sub_pd(_mm_add_pd(rinv31,_mm_mul_pd(krf,rsq31)),crf));
496             felec            = _mm_mul_pd(qq31,_mm_sub_pd(_mm_mul_pd(rinv31,rinvsq31),krf2));
497
498             /* Update potential sum for this i atom from the interaction with this j atom. */
499             velecsum         = _mm_add_pd(velecsum,velec);
500
501             fscal            = felec;
502
503             /* Calculate temporary vectorial force */
504             tx               = _mm_mul_pd(fscal,dx31);
505             ty               = _mm_mul_pd(fscal,dy31);
506             tz               = _mm_mul_pd(fscal,dz31);
507
508             /* Update vectorial force */
509             fix3             = _mm_add_pd(fix3,tx);
510             fiy3             = _mm_add_pd(fiy3,ty);
511             fiz3             = _mm_add_pd(fiz3,tz);
512
513             fjx1             = _mm_add_pd(fjx1,tx);
514             fjy1             = _mm_add_pd(fjy1,ty);
515             fjz1             = _mm_add_pd(fjz1,tz);
516
517             /**************************
518              * CALCULATE INTERACTIONS *
519              **************************/
520
521             /* REACTION-FIELD ELECTROSTATICS */
522             velec            = _mm_mul_pd(qq32,_mm_sub_pd(_mm_add_pd(rinv32,_mm_mul_pd(krf,rsq32)),crf));
523             felec            = _mm_mul_pd(qq32,_mm_sub_pd(_mm_mul_pd(rinv32,rinvsq32),krf2));
524
525             /* Update potential sum for this i atom from the interaction with this j atom. */
526             velecsum         = _mm_add_pd(velecsum,velec);
527
528             fscal            = felec;
529
530             /* Calculate temporary vectorial force */
531             tx               = _mm_mul_pd(fscal,dx32);
532             ty               = _mm_mul_pd(fscal,dy32);
533             tz               = _mm_mul_pd(fscal,dz32);
534
535             /* Update vectorial force */
536             fix3             = _mm_add_pd(fix3,tx);
537             fiy3             = _mm_add_pd(fiy3,ty);
538             fiz3             = _mm_add_pd(fiz3,tz);
539
540             fjx2             = _mm_add_pd(fjx2,tx);
541             fjy2             = _mm_add_pd(fjy2,ty);
542             fjz2             = _mm_add_pd(fjz2,tz);
543
544             /**************************
545              * CALCULATE INTERACTIONS *
546              **************************/
547
548             /* REACTION-FIELD ELECTROSTATICS */
549             velec            = _mm_mul_pd(qq33,_mm_sub_pd(_mm_add_pd(rinv33,_mm_mul_pd(krf,rsq33)),crf));
550             felec            = _mm_mul_pd(qq33,_mm_sub_pd(_mm_mul_pd(rinv33,rinvsq33),krf2));
551
552             /* Update potential sum for this i atom from the interaction with this j atom. */
553             velecsum         = _mm_add_pd(velecsum,velec);
554
555             fscal            = felec;
556
557             /* Calculate temporary vectorial force */
558             tx               = _mm_mul_pd(fscal,dx33);
559             ty               = _mm_mul_pd(fscal,dy33);
560             tz               = _mm_mul_pd(fscal,dz33);
561
562             /* Update vectorial force */
563             fix3             = _mm_add_pd(fix3,tx);
564             fiy3             = _mm_add_pd(fiy3,ty);
565             fiz3             = _mm_add_pd(fiz3,tz);
566
567             fjx3             = _mm_add_pd(fjx3,tx);
568             fjy3             = _mm_add_pd(fjy3,ty);
569             fjz3             = _mm_add_pd(fjz3,tz);
570
571             gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
572
573             /* Inner loop uses 323 flops */
574         }
575
576         if(jidx<j_index_end)
577         {
578
579             jnrA             = jjnr[jidx];
580             j_coord_offsetA  = DIM*jnrA;
581
582             /* load j atom coordinates */
583             gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
584                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
585                                               &jy2,&jz2,&jx3,&jy3,&jz3);
586
587             /* Calculate displacement vector */
588             dx00             = _mm_sub_pd(ix0,jx0);
589             dy00             = _mm_sub_pd(iy0,jy0);
590             dz00             = _mm_sub_pd(iz0,jz0);
591             dx11             = _mm_sub_pd(ix1,jx1);
592             dy11             = _mm_sub_pd(iy1,jy1);
593             dz11             = _mm_sub_pd(iz1,jz1);
594             dx12             = _mm_sub_pd(ix1,jx2);
595             dy12             = _mm_sub_pd(iy1,jy2);
596             dz12             = _mm_sub_pd(iz1,jz2);
597             dx13             = _mm_sub_pd(ix1,jx3);
598             dy13             = _mm_sub_pd(iy1,jy3);
599             dz13             = _mm_sub_pd(iz1,jz3);
600             dx21             = _mm_sub_pd(ix2,jx1);
601             dy21             = _mm_sub_pd(iy2,jy1);
602             dz21             = _mm_sub_pd(iz2,jz1);
603             dx22             = _mm_sub_pd(ix2,jx2);
604             dy22             = _mm_sub_pd(iy2,jy2);
605             dz22             = _mm_sub_pd(iz2,jz2);
606             dx23             = _mm_sub_pd(ix2,jx3);
607             dy23             = _mm_sub_pd(iy2,jy3);
608             dz23             = _mm_sub_pd(iz2,jz3);
609             dx31             = _mm_sub_pd(ix3,jx1);
610             dy31             = _mm_sub_pd(iy3,jy1);
611             dz31             = _mm_sub_pd(iz3,jz1);
612             dx32             = _mm_sub_pd(ix3,jx2);
613             dy32             = _mm_sub_pd(iy3,jy2);
614             dz32             = _mm_sub_pd(iz3,jz2);
615             dx33             = _mm_sub_pd(ix3,jx3);
616             dy33             = _mm_sub_pd(iy3,jy3);
617             dz33             = _mm_sub_pd(iz3,jz3);
618
619             /* Calculate squared distance and things based on it */
620             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
621             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
622             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
623             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
624             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
625             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
626             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
627             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
628             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
629             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
630
631             rinv11           = gmx_mm_invsqrt_pd(rsq11);
632             rinv12           = gmx_mm_invsqrt_pd(rsq12);
633             rinv13           = gmx_mm_invsqrt_pd(rsq13);
634             rinv21           = gmx_mm_invsqrt_pd(rsq21);
635             rinv22           = gmx_mm_invsqrt_pd(rsq22);
636             rinv23           = gmx_mm_invsqrt_pd(rsq23);
637             rinv31           = gmx_mm_invsqrt_pd(rsq31);
638             rinv32           = gmx_mm_invsqrt_pd(rsq32);
639             rinv33           = gmx_mm_invsqrt_pd(rsq33);
640
641             rinvsq00         = gmx_mm_inv_pd(rsq00);
642             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
643             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
644             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
645             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
646             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
647             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
648             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
649             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
650             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
651
652             fjx0             = _mm_setzero_pd();
653             fjy0             = _mm_setzero_pd();
654             fjz0             = _mm_setzero_pd();
655             fjx1             = _mm_setzero_pd();
656             fjy1             = _mm_setzero_pd();
657             fjz1             = _mm_setzero_pd();
658             fjx2             = _mm_setzero_pd();
659             fjy2             = _mm_setzero_pd();
660             fjz2             = _mm_setzero_pd();
661             fjx3             = _mm_setzero_pd();
662             fjy3             = _mm_setzero_pd();
663             fjz3             = _mm_setzero_pd();
664
665             /**************************
666              * CALCULATE INTERACTIONS *
667              **************************/
668
669             /* LENNARD-JONES DISPERSION/REPULSION */
670
671             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
672             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
673             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
674             vvdw             = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
675             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
676
677             /* Update potential sum for this i atom from the interaction with this j atom. */
678             vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
679             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
680
681             fscal            = fvdw;
682
683             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
684
685             /* Calculate temporary vectorial force */
686             tx               = _mm_mul_pd(fscal,dx00);
687             ty               = _mm_mul_pd(fscal,dy00);
688             tz               = _mm_mul_pd(fscal,dz00);
689
690             /* Update vectorial force */
691             fix0             = _mm_add_pd(fix0,tx);
692             fiy0             = _mm_add_pd(fiy0,ty);
693             fiz0             = _mm_add_pd(fiz0,tz);
694
695             fjx0             = _mm_add_pd(fjx0,tx);
696             fjy0             = _mm_add_pd(fjy0,ty);
697             fjz0             = _mm_add_pd(fjz0,tz);
698
699             /**************************
700              * CALCULATE INTERACTIONS *
701              **************************/
702
703             /* REACTION-FIELD ELECTROSTATICS */
704             velec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_add_pd(rinv11,_mm_mul_pd(krf,rsq11)),crf));
705             felec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
706
707             /* Update potential sum for this i atom from the interaction with this j atom. */
708             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
709             velecsum         = _mm_add_pd(velecsum,velec);
710
711             fscal            = felec;
712
713             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
714
715             /* Calculate temporary vectorial force */
716             tx               = _mm_mul_pd(fscal,dx11);
717             ty               = _mm_mul_pd(fscal,dy11);
718             tz               = _mm_mul_pd(fscal,dz11);
719
720             /* Update vectorial force */
721             fix1             = _mm_add_pd(fix1,tx);
722             fiy1             = _mm_add_pd(fiy1,ty);
723             fiz1             = _mm_add_pd(fiz1,tz);
724
725             fjx1             = _mm_add_pd(fjx1,tx);
726             fjy1             = _mm_add_pd(fjy1,ty);
727             fjz1             = _mm_add_pd(fjz1,tz);
728
729             /**************************
730              * CALCULATE INTERACTIONS *
731              **************************/
732
733             /* REACTION-FIELD ELECTROSTATICS */
734             velec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_add_pd(rinv12,_mm_mul_pd(krf,rsq12)),crf));
735             felec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
736
737             /* Update potential sum for this i atom from the interaction with this j atom. */
738             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
739             velecsum         = _mm_add_pd(velecsum,velec);
740
741             fscal            = felec;
742
743             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
744
745             /* Calculate temporary vectorial force */
746             tx               = _mm_mul_pd(fscal,dx12);
747             ty               = _mm_mul_pd(fscal,dy12);
748             tz               = _mm_mul_pd(fscal,dz12);
749
750             /* Update vectorial force */
751             fix1             = _mm_add_pd(fix1,tx);
752             fiy1             = _mm_add_pd(fiy1,ty);
753             fiz1             = _mm_add_pd(fiz1,tz);
754
755             fjx2             = _mm_add_pd(fjx2,tx);
756             fjy2             = _mm_add_pd(fjy2,ty);
757             fjz2             = _mm_add_pd(fjz2,tz);
758
759             /**************************
760              * CALCULATE INTERACTIONS *
761              **************************/
762
763             /* REACTION-FIELD ELECTROSTATICS */
764             velec            = _mm_mul_pd(qq13,_mm_sub_pd(_mm_add_pd(rinv13,_mm_mul_pd(krf,rsq13)),crf));
765             felec            = _mm_mul_pd(qq13,_mm_sub_pd(_mm_mul_pd(rinv13,rinvsq13),krf2));
766
767             /* Update potential sum for this i atom from the interaction with this j atom. */
768             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
769             velecsum         = _mm_add_pd(velecsum,velec);
770
771             fscal            = felec;
772
773             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
774
775             /* Calculate temporary vectorial force */
776             tx               = _mm_mul_pd(fscal,dx13);
777             ty               = _mm_mul_pd(fscal,dy13);
778             tz               = _mm_mul_pd(fscal,dz13);
779
780             /* Update vectorial force */
781             fix1             = _mm_add_pd(fix1,tx);
782             fiy1             = _mm_add_pd(fiy1,ty);
783             fiz1             = _mm_add_pd(fiz1,tz);
784
785             fjx3             = _mm_add_pd(fjx3,tx);
786             fjy3             = _mm_add_pd(fjy3,ty);
787             fjz3             = _mm_add_pd(fjz3,tz);
788
789             /**************************
790              * CALCULATE INTERACTIONS *
791              **************************/
792
793             /* REACTION-FIELD ELECTROSTATICS */
794             velec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_add_pd(rinv21,_mm_mul_pd(krf,rsq21)),crf));
795             felec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
796
797             /* Update potential sum for this i atom from the interaction with this j atom. */
798             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
799             velecsum         = _mm_add_pd(velecsum,velec);
800
801             fscal            = felec;
802
803             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
804
805             /* Calculate temporary vectorial force */
806             tx               = _mm_mul_pd(fscal,dx21);
807             ty               = _mm_mul_pd(fscal,dy21);
808             tz               = _mm_mul_pd(fscal,dz21);
809
810             /* Update vectorial force */
811             fix2             = _mm_add_pd(fix2,tx);
812             fiy2             = _mm_add_pd(fiy2,ty);
813             fiz2             = _mm_add_pd(fiz2,tz);
814
815             fjx1             = _mm_add_pd(fjx1,tx);
816             fjy1             = _mm_add_pd(fjy1,ty);
817             fjz1             = _mm_add_pd(fjz1,tz);
818
819             /**************************
820              * CALCULATE INTERACTIONS *
821              **************************/
822
823             /* REACTION-FIELD ELECTROSTATICS */
824             velec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_add_pd(rinv22,_mm_mul_pd(krf,rsq22)),crf));
825             felec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
826
827             /* Update potential sum for this i atom from the interaction with this j atom. */
828             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
829             velecsum         = _mm_add_pd(velecsum,velec);
830
831             fscal            = felec;
832
833             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
834
835             /* Calculate temporary vectorial force */
836             tx               = _mm_mul_pd(fscal,dx22);
837             ty               = _mm_mul_pd(fscal,dy22);
838             tz               = _mm_mul_pd(fscal,dz22);
839
840             /* Update vectorial force */
841             fix2             = _mm_add_pd(fix2,tx);
842             fiy2             = _mm_add_pd(fiy2,ty);
843             fiz2             = _mm_add_pd(fiz2,tz);
844
845             fjx2             = _mm_add_pd(fjx2,tx);
846             fjy2             = _mm_add_pd(fjy2,ty);
847             fjz2             = _mm_add_pd(fjz2,tz);
848
849             /**************************
850              * CALCULATE INTERACTIONS *
851              **************************/
852
853             /* REACTION-FIELD ELECTROSTATICS */
854             velec            = _mm_mul_pd(qq23,_mm_sub_pd(_mm_add_pd(rinv23,_mm_mul_pd(krf,rsq23)),crf));
855             felec            = _mm_mul_pd(qq23,_mm_sub_pd(_mm_mul_pd(rinv23,rinvsq23),krf2));
856
857             /* Update potential sum for this i atom from the interaction with this j atom. */
858             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
859             velecsum         = _mm_add_pd(velecsum,velec);
860
861             fscal            = felec;
862
863             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
864
865             /* Calculate temporary vectorial force */
866             tx               = _mm_mul_pd(fscal,dx23);
867             ty               = _mm_mul_pd(fscal,dy23);
868             tz               = _mm_mul_pd(fscal,dz23);
869
870             /* Update vectorial force */
871             fix2             = _mm_add_pd(fix2,tx);
872             fiy2             = _mm_add_pd(fiy2,ty);
873             fiz2             = _mm_add_pd(fiz2,tz);
874
875             fjx3             = _mm_add_pd(fjx3,tx);
876             fjy3             = _mm_add_pd(fjy3,ty);
877             fjz3             = _mm_add_pd(fjz3,tz);
878
879             /**************************
880              * CALCULATE INTERACTIONS *
881              **************************/
882
883             /* REACTION-FIELD ELECTROSTATICS */
884             velec            = _mm_mul_pd(qq31,_mm_sub_pd(_mm_add_pd(rinv31,_mm_mul_pd(krf,rsq31)),crf));
885             felec            = _mm_mul_pd(qq31,_mm_sub_pd(_mm_mul_pd(rinv31,rinvsq31),krf2));
886
887             /* Update potential sum for this i atom from the interaction with this j atom. */
888             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
889             velecsum         = _mm_add_pd(velecsum,velec);
890
891             fscal            = felec;
892
893             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
894
895             /* Calculate temporary vectorial force */
896             tx               = _mm_mul_pd(fscal,dx31);
897             ty               = _mm_mul_pd(fscal,dy31);
898             tz               = _mm_mul_pd(fscal,dz31);
899
900             /* Update vectorial force */
901             fix3             = _mm_add_pd(fix3,tx);
902             fiy3             = _mm_add_pd(fiy3,ty);
903             fiz3             = _mm_add_pd(fiz3,tz);
904
905             fjx1             = _mm_add_pd(fjx1,tx);
906             fjy1             = _mm_add_pd(fjy1,ty);
907             fjz1             = _mm_add_pd(fjz1,tz);
908
909             /**************************
910              * CALCULATE INTERACTIONS *
911              **************************/
912
913             /* REACTION-FIELD ELECTROSTATICS */
914             velec            = _mm_mul_pd(qq32,_mm_sub_pd(_mm_add_pd(rinv32,_mm_mul_pd(krf,rsq32)),crf));
915             felec            = _mm_mul_pd(qq32,_mm_sub_pd(_mm_mul_pd(rinv32,rinvsq32),krf2));
916
917             /* Update potential sum for this i atom from the interaction with this j atom. */
918             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
919             velecsum         = _mm_add_pd(velecsum,velec);
920
921             fscal            = felec;
922
923             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
924
925             /* Calculate temporary vectorial force */
926             tx               = _mm_mul_pd(fscal,dx32);
927             ty               = _mm_mul_pd(fscal,dy32);
928             tz               = _mm_mul_pd(fscal,dz32);
929
930             /* Update vectorial force */
931             fix3             = _mm_add_pd(fix3,tx);
932             fiy3             = _mm_add_pd(fiy3,ty);
933             fiz3             = _mm_add_pd(fiz3,tz);
934
935             fjx2             = _mm_add_pd(fjx2,tx);
936             fjy2             = _mm_add_pd(fjy2,ty);
937             fjz2             = _mm_add_pd(fjz2,tz);
938
939             /**************************
940              * CALCULATE INTERACTIONS *
941              **************************/
942
943             /* REACTION-FIELD ELECTROSTATICS */
944             velec            = _mm_mul_pd(qq33,_mm_sub_pd(_mm_add_pd(rinv33,_mm_mul_pd(krf,rsq33)),crf));
945             felec            = _mm_mul_pd(qq33,_mm_sub_pd(_mm_mul_pd(rinv33,rinvsq33),krf2));
946
947             /* Update potential sum for this i atom from the interaction with this j atom. */
948             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
949             velecsum         = _mm_add_pd(velecsum,velec);
950
951             fscal            = felec;
952
953             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
954
955             /* Calculate temporary vectorial force */
956             tx               = _mm_mul_pd(fscal,dx33);
957             ty               = _mm_mul_pd(fscal,dy33);
958             tz               = _mm_mul_pd(fscal,dz33);
959
960             /* Update vectorial force */
961             fix3             = _mm_add_pd(fix3,tx);
962             fiy3             = _mm_add_pd(fiy3,ty);
963             fiz3             = _mm_add_pd(fiz3,tz);
964
965             fjx3             = _mm_add_pd(fjx3,tx);
966             fjy3             = _mm_add_pd(fjy3,ty);
967             fjz3             = _mm_add_pd(fjz3,tz);
968
969             gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
970
971             /* Inner loop uses 323 flops */
972         }
973
974         /* End of innermost loop */
975
976         gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
977                                               f+i_coord_offset,fshift+i_shift_offset);
978
979         ggid                        = gid[iidx];
980         /* Update potential energies */
981         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
982         gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
983
984         /* Increment number of inner iterations */
985         inneriter                  += j_index_end - j_index_start;
986
987         /* Outer loop uses 26 flops */
988     }
989
990     /* Increment number of outer iterations */
991     outeriter        += nri;
992
993     /* Update outer/inner flops */
994
995     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*323);
996 }
997 /*
998  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sse4_1_double
999  * Electrostatics interaction: ReactionField
1000  * VdW interaction:            LennardJones
1001  * Geometry:                   Water4-Water4
1002  * Calculate force/pot:        Force
1003  */
1004 void
1005 nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sse4_1_double
1006                     (t_nblist                    * gmx_restrict       nlist,
1007                      rvec                        * gmx_restrict          xx,
1008                      rvec                        * gmx_restrict          ff,
1009                      t_forcerec                  * gmx_restrict          fr,
1010                      t_mdatoms                   * gmx_restrict     mdatoms,
1011                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1012                      t_nrnb                      * gmx_restrict        nrnb)
1013 {
1014     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1015      * just 0 for non-waters.
1016      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1017      * jnr indices corresponding to data put in the four positions in the SIMD register.
1018      */
1019     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1020     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1021     int              jnrA,jnrB;
1022     int              j_coord_offsetA,j_coord_offsetB;
1023     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1024     real             rcutoff_scalar;
1025     real             *shiftvec,*fshift,*x,*f;
1026     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1027     int              vdwioffset0;
1028     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1029     int              vdwioffset1;
1030     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1031     int              vdwioffset2;
1032     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1033     int              vdwioffset3;
1034     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1035     int              vdwjidx0A,vdwjidx0B;
1036     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1037     int              vdwjidx1A,vdwjidx1B;
1038     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1039     int              vdwjidx2A,vdwjidx2B;
1040     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1041     int              vdwjidx3A,vdwjidx3B;
1042     __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1043     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1044     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1045     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1046     __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1047     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1048     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1049     __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1050     __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1051     __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1052     __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1053     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
1054     real             *charge;
1055     int              nvdwtype;
1056     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1057     int              *vdwtype;
1058     real             *vdwparam;
1059     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
1060     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
1061     __m128d          dummy_mask,cutoff_mask;
1062     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1063     __m128d          one     = _mm_set1_pd(1.0);
1064     __m128d          two     = _mm_set1_pd(2.0);
1065     x                = xx[0];
1066     f                = ff[0];
1067
1068     nri              = nlist->nri;
1069     iinr             = nlist->iinr;
1070     jindex           = nlist->jindex;
1071     jjnr             = nlist->jjnr;
1072     shiftidx         = nlist->shift;
1073     gid              = nlist->gid;
1074     shiftvec         = fr->shift_vec[0];
1075     fshift           = fr->fshift[0];
1076     facel            = _mm_set1_pd(fr->epsfac);
1077     charge           = mdatoms->chargeA;
1078     krf              = _mm_set1_pd(fr->ic->k_rf);
1079     krf2             = _mm_set1_pd(fr->ic->k_rf*2.0);
1080     crf              = _mm_set1_pd(fr->ic->c_rf);
1081     nvdwtype         = fr->ntype;
1082     vdwparam         = fr->nbfp;
1083     vdwtype          = mdatoms->typeA;
1084
1085     /* Setup water-specific parameters */
1086     inr              = nlist->iinr[0];
1087     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1088     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1089     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1090     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1091
1092     jq1              = _mm_set1_pd(charge[inr+1]);
1093     jq2              = _mm_set1_pd(charge[inr+2]);
1094     jq3              = _mm_set1_pd(charge[inr+3]);
1095     vdwjidx0A        = 2*vdwtype[inr+0];
1096     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1097     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1098     qq11             = _mm_mul_pd(iq1,jq1);
1099     qq12             = _mm_mul_pd(iq1,jq2);
1100     qq13             = _mm_mul_pd(iq1,jq3);
1101     qq21             = _mm_mul_pd(iq2,jq1);
1102     qq22             = _mm_mul_pd(iq2,jq2);
1103     qq23             = _mm_mul_pd(iq2,jq3);
1104     qq31             = _mm_mul_pd(iq3,jq1);
1105     qq32             = _mm_mul_pd(iq3,jq2);
1106     qq33             = _mm_mul_pd(iq3,jq3);
1107
1108     /* Avoid stupid compiler warnings */
1109     jnrA = jnrB = 0;
1110     j_coord_offsetA = 0;
1111     j_coord_offsetB = 0;
1112
1113     outeriter        = 0;
1114     inneriter        = 0;
1115
1116     /* Start outer loop over neighborlists */
1117     for(iidx=0; iidx<nri; iidx++)
1118     {
1119         /* Load shift vector for this list */
1120         i_shift_offset   = DIM*shiftidx[iidx];
1121
1122         /* Load limits for loop over neighbors */
1123         j_index_start    = jindex[iidx];
1124         j_index_end      = jindex[iidx+1];
1125
1126         /* Get outer coordinate index */
1127         inr              = iinr[iidx];
1128         i_coord_offset   = DIM*inr;
1129
1130         /* Load i particle coords and add shift vector */
1131         gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1132                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1133
1134         fix0             = _mm_setzero_pd();
1135         fiy0             = _mm_setzero_pd();
1136         fiz0             = _mm_setzero_pd();
1137         fix1             = _mm_setzero_pd();
1138         fiy1             = _mm_setzero_pd();
1139         fiz1             = _mm_setzero_pd();
1140         fix2             = _mm_setzero_pd();
1141         fiy2             = _mm_setzero_pd();
1142         fiz2             = _mm_setzero_pd();
1143         fix3             = _mm_setzero_pd();
1144         fiy3             = _mm_setzero_pd();
1145         fiz3             = _mm_setzero_pd();
1146
1147         /* Start inner kernel loop */
1148         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1149         {
1150
1151             /* Get j neighbor index, and coordinate index */
1152             jnrA             = jjnr[jidx];
1153             jnrB             = jjnr[jidx+1];
1154             j_coord_offsetA  = DIM*jnrA;
1155             j_coord_offsetB  = DIM*jnrB;
1156
1157             /* load j atom coordinates */
1158             gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1159                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1160                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1161
1162             /* Calculate displacement vector */
1163             dx00             = _mm_sub_pd(ix0,jx0);
1164             dy00             = _mm_sub_pd(iy0,jy0);
1165             dz00             = _mm_sub_pd(iz0,jz0);
1166             dx11             = _mm_sub_pd(ix1,jx1);
1167             dy11             = _mm_sub_pd(iy1,jy1);
1168             dz11             = _mm_sub_pd(iz1,jz1);
1169             dx12             = _mm_sub_pd(ix1,jx2);
1170             dy12             = _mm_sub_pd(iy1,jy2);
1171             dz12             = _mm_sub_pd(iz1,jz2);
1172             dx13             = _mm_sub_pd(ix1,jx3);
1173             dy13             = _mm_sub_pd(iy1,jy3);
1174             dz13             = _mm_sub_pd(iz1,jz3);
1175             dx21             = _mm_sub_pd(ix2,jx1);
1176             dy21             = _mm_sub_pd(iy2,jy1);
1177             dz21             = _mm_sub_pd(iz2,jz1);
1178             dx22             = _mm_sub_pd(ix2,jx2);
1179             dy22             = _mm_sub_pd(iy2,jy2);
1180             dz22             = _mm_sub_pd(iz2,jz2);
1181             dx23             = _mm_sub_pd(ix2,jx3);
1182             dy23             = _mm_sub_pd(iy2,jy3);
1183             dz23             = _mm_sub_pd(iz2,jz3);
1184             dx31             = _mm_sub_pd(ix3,jx1);
1185             dy31             = _mm_sub_pd(iy3,jy1);
1186             dz31             = _mm_sub_pd(iz3,jz1);
1187             dx32             = _mm_sub_pd(ix3,jx2);
1188             dy32             = _mm_sub_pd(iy3,jy2);
1189             dz32             = _mm_sub_pd(iz3,jz2);
1190             dx33             = _mm_sub_pd(ix3,jx3);
1191             dy33             = _mm_sub_pd(iy3,jy3);
1192             dz33             = _mm_sub_pd(iz3,jz3);
1193
1194             /* Calculate squared distance and things based on it */
1195             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1196             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1197             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1198             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1199             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1200             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1201             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1202             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1203             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1204             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1205
1206             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1207             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1208             rinv13           = gmx_mm_invsqrt_pd(rsq13);
1209             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1210             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1211             rinv23           = gmx_mm_invsqrt_pd(rsq23);
1212             rinv31           = gmx_mm_invsqrt_pd(rsq31);
1213             rinv32           = gmx_mm_invsqrt_pd(rsq32);
1214             rinv33           = gmx_mm_invsqrt_pd(rsq33);
1215
1216             rinvsq00         = gmx_mm_inv_pd(rsq00);
1217             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1218             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1219             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
1220             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1221             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1222             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
1223             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
1224             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
1225             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
1226
1227             fjx0             = _mm_setzero_pd();
1228             fjy0             = _mm_setzero_pd();
1229             fjz0             = _mm_setzero_pd();
1230             fjx1             = _mm_setzero_pd();
1231             fjy1             = _mm_setzero_pd();
1232             fjz1             = _mm_setzero_pd();
1233             fjx2             = _mm_setzero_pd();
1234             fjy2             = _mm_setzero_pd();
1235             fjz2             = _mm_setzero_pd();
1236             fjx3             = _mm_setzero_pd();
1237             fjy3             = _mm_setzero_pd();
1238             fjz3             = _mm_setzero_pd();
1239
1240             /**************************
1241              * CALCULATE INTERACTIONS *
1242              **************************/
1243
1244             /* LENNARD-JONES DISPERSION/REPULSION */
1245
1246             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1247             fvdw             = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1248
1249             fscal            = fvdw;
1250
1251             /* Calculate temporary vectorial force */
1252             tx               = _mm_mul_pd(fscal,dx00);
1253             ty               = _mm_mul_pd(fscal,dy00);
1254             tz               = _mm_mul_pd(fscal,dz00);
1255
1256             /* Update vectorial force */
1257             fix0             = _mm_add_pd(fix0,tx);
1258             fiy0             = _mm_add_pd(fiy0,ty);
1259             fiz0             = _mm_add_pd(fiz0,tz);
1260
1261             fjx0             = _mm_add_pd(fjx0,tx);
1262             fjy0             = _mm_add_pd(fjy0,ty);
1263             fjz0             = _mm_add_pd(fjz0,tz);
1264
1265             /**************************
1266              * CALCULATE INTERACTIONS *
1267              **************************/
1268
1269             /* REACTION-FIELD ELECTROSTATICS */
1270             felec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
1271
1272             fscal            = felec;
1273
1274             /* Calculate temporary vectorial force */
1275             tx               = _mm_mul_pd(fscal,dx11);
1276             ty               = _mm_mul_pd(fscal,dy11);
1277             tz               = _mm_mul_pd(fscal,dz11);
1278
1279             /* Update vectorial force */
1280             fix1             = _mm_add_pd(fix1,tx);
1281             fiy1             = _mm_add_pd(fiy1,ty);
1282             fiz1             = _mm_add_pd(fiz1,tz);
1283
1284             fjx1             = _mm_add_pd(fjx1,tx);
1285             fjy1             = _mm_add_pd(fjy1,ty);
1286             fjz1             = _mm_add_pd(fjz1,tz);
1287
1288             /**************************
1289              * CALCULATE INTERACTIONS *
1290              **************************/
1291
1292             /* REACTION-FIELD ELECTROSTATICS */
1293             felec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
1294
1295             fscal            = felec;
1296
1297             /* Calculate temporary vectorial force */
1298             tx               = _mm_mul_pd(fscal,dx12);
1299             ty               = _mm_mul_pd(fscal,dy12);
1300             tz               = _mm_mul_pd(fscal,dz12);
1301
1302             /* Update vectorial force */
1303             fix1             = _mm_add_pd(fix1,tx);
1304             fiy1             = _mm_add_pd(fiy1,ty);
1305             fiz1             = _mm_add_pd(fiz1,tz);
1306
1307             fjx2             = _mm_add_pd(fjx2,tx);
1308             fjy2             = _mm_add_pd(fjy2,ty);
1309             fjz2             = _mm_add_pd(fjz2,tz);
1310
1311             /**************************
1312              * CALCULATE INTERACTIONS *
1313              **************************/
1314
1315             /* REACTION-FIELD ELECTROSTATICS */
1316             felec            = _mm_mul_pd(qq13,_mm_sub_pd(_mm_mul_pd(rinv13,rinvsq13),krf2));
1317
1318             fscal            = felec;
1319
1320             /* Calculate temporary vectorial force */
1321             tx               = _mm_mul_pd(fscal,dx13);
1322             ty               = _mm_mul_pd(fscal,dy13);
1323             tz               = _mm_mul_pd(fscal,dz13);
1324
1325             /* Update vectorial force */
1326             fix1             = _mm_add_pd(fix1,tx);
1327             fiy1             = _mm_add_pd(fiy1,ty);
1328             fiz1             = _mm_add_pd(fiz1,tz);
1329
1330             fjx3             = _mm_add_pd(fjx3,tx);
1331             fjy3             = _mm_add_pd(fjy3,ty);
1332             fjz3             = _mm_add_pd(fjz3,tz);
1333
1334             /**************************
1335              * CALCULATE INTERACTIONS *
1336              **************************/
1337
1338             /* REACTION-FIELD ELECTROSTATICS */
1339             felec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
1340
1341             fscal            = felec;
1342
1343             /* Calculate temporary vectorial force */
1344             tx               = _mm_mul_pd(fscal,dx21);
1345             ty               = _mm_mul_pd(fscal,dy21);
1346             tz               = _mm_mul_pd(fscal,dz21);
1347
1348             /* Update vectorial force */
1349             fix2             = _mm_add_pd(fix2,tx);
1350             fiy2             = _mm_add_pd(fiy2,ty);
1351             fiz2             = _mm_add_pd(fiz2,tz);
1352
1353             fjx1             = _mm_add_pd(fjx1,tx);
1354             fjy1             = _mm_add_pd(fjy1,ty);
1355             fjz1             = _mm_add_pd(fjz1,tz);
1356
1357             /**************************
1358              * CALCULATE INTERACTIONS *
1359              **************************/
1360
1361             /* REACTION-FIELD ELECTROSTATICS */
1362             felec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
1363
1364             fscal            = felec;
1365
1366             /* Calculate temporary vectorial force */
1367             tx               = _mm_mul_pd(fscal,dx22);
1368             ty               = _mm_mul_pd(fscal,dy22);
1369             tz               = _mm_mul_pd(fscal,dz22);
1370
1371             /* Update vectorial force */
1372             fix2             = _mm_add_pd(fix2,tx);
1373             fiy2             = _mm_add_pd(fiy2,ty);
1374             fiz2             = _mm_add_pd(fiz2,tz);
1375
1376             fjx2             = _mm_add_pd(fjx2,tx);
1377             fjy2             = _mm_add_pd(fjy2,ty);
1378             fjz2             = _mm_add_pd(fjz2,tz);
1379
1380             /**************************
1381              * CALCULATE INTERACTIONS *
1382              **************************/
1383
1384             /* REACTION-FIELD ELECTROSTATICS */
1385             felec            = _mm_mul_pd(qq23,_mm_sub_pd(_mm_mul_pd(rinv23,rinvsq23),krf2));
1386
1387             fscal            = felec;
1388
1389             /* Calculate temporary vectorial force */
1390             tx               = _mm_mul_pd(fscal,dx23);
1391             ty               = _mm_mul_pd(fscal,dy23);
1392             tz               = _mm_mul_pd(fscal,dz23);
1393
1394             /* Update vectorial force */
1395             fix2             = _mm_add_pd(fix2,tx);
1396             fiy2             = _mm_add_pd(fiy2,ty);
1397             fiz2             = _mm_add_pd(fiz2,tz);
1398
1399             fjx3             = _mm_add_pd(fjx3,tx);
1400             fjy3             = _mm_add_pd(fjy3,ty);
1401             fjz3             = _mm_add_pd(fjz3,tz);
1402
1403             /**************************
1404              * CALCULATE INTERACTIONS *
1405              **************************/
1406
1407             /* REACTION-FIELD ELECTROSTATICS */
1408             felec            = _mm_mul_pd(qq31,_mm_sub_pd(_mm_mul_pd(rinv31,rinvsq31),krf2));
1409
1410             fscal            = felec;
1411
1412             /* Calculate temporary vectorial force */
1413             tx               = _mm_mul_pd(fscal,dx31);
1414             ty               = _mm_mul_pd(fscal,dy31);
1415             tz               = _mm_mul_pd(fscal,dz31);
1416
1417             /* Update vectorial force */
1418             fix3             = _mm_add_pd(fix3,tx);
1419             fiy3             = _mm_add_pd(fiy3,ty);
1420             fiz3             = _mm_add_pd(fiz3,tz);
1421
1422             fjx1             = _mm_add_pd(fjx1,tx);
1423             fjy1             = _mm_add_pd(fjy1,ty);
1424             fjz1             = _mm_add_pd(fjz1,tz);
1425
1426             /**************************
1427              * CALCULATE INTERACTIONS *
1428              **************************/
1429
1430             /* REACTION-FIELD ELECTROSTATICS */
1431             felec            = _mm_mul_pd(qq32,_mm_sub_pd(_mm_mul_pd(rinv32,rinvsq32),krf2));
1432
1433             fscal            = felec;
1434
1435             /* Calculate temporary vectorial force */
1436             tx               = _mm_mul_pd(fscal,dx32);
1437             ty               = _mm_mul_pd(fscal,dy32);
1438             tz               = _mm_mul_pd(fscal,dz32);
1439
1440             /* Update vectorial force */
1441             fix3             = _mm_add_pd(fix3,tx);
1442             fiy3             = _mm_add_pd(fiy3,ty);
1443             fiz3             = _mm_add_pd(fiz3,tz);
1444
1445             fjx2             = _mm_add_pd(fjx2,tx);
1446             fjy2             = _mm_add_pd(fjy2,ty);
1447             fjz2             = _mm_add_pd(fjz2,tz);
1448
1449             /**************************
1450              * CALCULATE INTERACTIONS *
1451              **************************/
1452
1453             /* REACTION-FIELD ELECTROSTATICS */
1454             felec            = _mm_mul_pd(qq33,_mm_sub_pd(_mm_mul_pd(rinv33,rinvsq33),krf2));
1455
1456             fscal            = felec;
1457
1458             /* Calculate temporary vectorial force */
1459             tx               = _mm_mul_pd(fscal,dx33);
1460             ty               = _mm_mul_pd(fscal,dy33);
1461             tz               = _mm_mul_pd(fscal,dz33);
1462
1463             /* Update vectorial force */
1464             fix3             = _mm_add_pd(fix3,tx);
1465             fiy3             = _mm_add_pd(fiy3,ty);
1466             fiz3             = _mm_add_pd(fiz3,tz);
1467
1468             fjx3             = _mm_add_pd(fjx3,tx);
1469             fjy3             = _mm_add_pd(fjy3,ty);
1470             fjz3             = _mm_add_pd(fjz3,tz);
1471
1472             gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1473
1474             /* Inner loop uses 273 flops */
1475         }
1476
1477         if(jidx<j_index_end)
1478         {
1479
1480             jnrA             = jjnr[jidx];
1481             j_coord_offsetA  = DIM*jnrA;
1482
1483             /* load j atom coordinates */
1484             gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1485                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1486                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1487
1488             /* Calculate displacement vector */
1489             dx00             = _mm_sub_pd(ix0,jx0);
1490             dy00             = _mm_sub_pd(iy0,jy0);
1491             dz00             = _mm_sub_pd(iz0,jz0);
1492             dx11             = _mm_sub_pd(ix1,jx1);
1493             dy11             = _mm_sub_pd(iy1,jy1);
1494             dz11             = _mm_sub_pd(iz1,jz1);
1495             dx12             = _mm_sub_pd(ix1,jx2);
1496             dy12             = _mm_sub_pd(iy1,jy2);
1497             dz12             = _mm_sub_pd(iz1,jz2);
1498             dx13             = _mm_sub_pd(ix1,jx3);
1499             dy13             = _mm_sub_pd(iy1,jy3);
1500             dz13             = _mm_sub_pd(iz1,jz3);
1501             dx21             = _mm_sub_pd(ix2,jx1);
1502             dy21             = _mm_sub_pd(iy2,jy1);
1503             dz21             = _mm_sub_pd(iz2,jz1);
1504             dx22             = _mm_sub_pd(ix2,jx2);
1505             dy22             = _mm_sub_pd(iy2,jy2);
1506             dz22             = _mm_sub_pd(iz2,jz2);
1507             dx23             = _mm_sub_pd(ix2,jx3);
1508             dy23             = _mm_sub_pd(iy2,jy3);
1509             dz23             = _mm_sub_pd(iz2,jz3);
1510             dx31             = _mm_sub_pd(ix3,jx1);
1511             dy31             = _mm_sub_pd(iy3,jy1);
1512             dz31             = _mm_sub_pd(iz3,jz1);
1513             dx32             = _mm_sub_pd(ix3,jx2);
1514             dy32             = _mm_sub_pd(iy3,jy2);
1515             dz32             = _mm_sub_pd(iz3,jz2);
1516             dx33             = _mm_sub_pd(ix3,jx3);
1517             dy33             = _mm_sub_pd(iy3,jy3);
1518             dz33             = _mm_sub_pd(iz3,jz3);
1519
1520             /* Calculate squared distance and things based on it */
1521             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1522             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1523             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1524             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1525             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1526             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1527             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1528             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1529             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1530             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1531
1532             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1533             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1534             rinv13           = gmx_mm_invsqrt_pd(rsq13);
1535             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1536             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1537             rinv23           = gmx_mm_invsqrt_pd(rsq23);
1538             rinv31           = gmx_mm_invsqrt_pd(rsq31);
1539             rinv32           = gmx_mm_invsqrt_pd(rsq32);
1540             rinv33           = gmx_mm_invsqrt_pd(rsq33);
1541
1542             rinvsq00         = gmx_mm_inv_pd(rsq00);
1543             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1544             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1545             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
1546             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1547             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1548             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
1549             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
1550             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
1551             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
1552
1553             fjx0             = _mm_setzero_pd();
1554             fjy0             = _mm_setzero_pd();
1555             fjz0             = _mm_setzero_pd();
1556             fjx1             = _mm_setzero_pd();
1557             fjy1             = _mm_setzero_pd();
1558             fjz1             = _mm_setzero_pd();
1559             fjx2             = _mm_setzero_pd();
1560             fjy2             = _mm_setzero_pd();
1561             fjz2             = _mm_setzero_pd();
1562             fjx3             = _mm_setzero_pd();
1563             fjy3             = _mm_setzero_pd();
1564             fjz3             = _mm_setzero_pd();
1565
1566             /**************************
1567              * CALCULATE INTERACTIONS *
1568              **************************/
1569
1570             /* LENNARD-JONES DISPERSION/REPULSION */
1571
1572             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1573             fvdw             = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1574
1575             fscal            = fvdw;
1576
1577             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1578
1579             /* Calculate temporary vectorial force */
1580             tx               = _mm_mul_pd(fscal,dx00);
1581             ty               = _mm_mul_pd(fscal,dy00);
1582             tz               = _mm_mul_pd(fscal,dz00);
1583
1584             /* Update vectorial force */
1585             fix0             = _mm_add_pd(fix0,tx);
1586             fiy0             = _mm_add_pd(fiy0,ty);
1587             fiz0             = _mm_add_pd(fiz0,tz);
1588
1589             fjx0             = _mm_add_pd(fjx0,tx);
1590             fjy0             = _mm_add_pd(fjy0,ty);
1591             fjz0             = _mm_add_pd(fjz0,tz);
1592
1593             /**************************
1594              * CALCULATE INTERACTIONS *
1595              **************************/
1596
1597             /* REACTION-FIELD ELECTROSTATICS */
1598             felec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
1599
1600             fscal            = felec;
1601
1602             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1603
1604             /* Calculate temporary vectorial force */
1605             tx               = _mm_mul_pd(fscal,dx11);
1606             ty               = _mm_mul_pd(fscal,dy11);
1607             tz               = _mm_mul_pd(fscal,dz11);
1608
1609             /* Update vectorial force */
1610             fix1             = _mm_add_pd(fix1,tx);
1611             fiy1             = _mm_add_pd(fiy1,ty);
1612             fiz1             = _mm_add_pd(fiz1,tz);
1613
1614             fjx1             = _mm_add_pd(fjx1,tx);
1615             fjy1             = _mm_add_pd(fjy1,ty);
1616             fjz1             = _mm_add_pd(fjz1,tz);
1617
1618             /**************************
1619              * CALCULATE INTERACTIONS *
1620              **************************/
1621
1622             /* REACTION-FIELD ELECTROSTATICS */
1623             felec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
1624
1625             fscal            = felec;
1626
1627             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1628
1629             /* Calculate temporary vectorial force */
1630             tx               = _mm_mul_pd(fscal,dx12);
1631             ty               = _mm_mul_pd(fscal,dy12);
1632             tz               = _mm_mul_pd(fscal,dz12);
1633
1634             /* Update vectorial force */
1635             fix1             = _mm_add_pd(fix1,tx);
1636             fiy1             = _mm_add_pd(fiy1,ty);
1637             fiz1             = _mm_add_pd(fiz1,tz);
1638
1639             fjx2             = _mm_add_pd(fjx2,tx);
1640             fjy2             = _mm_add_pd(fjy2,ty);
1641             fjz2             = _mm_add_pd(fjz2,tz);
1642
1643             /**************************
1644              * CALCULATE INTERACTIONS *
1645              **************************/
1646
1647             /* REACTION-FIELD ELECTROSTATICS */
1648             felec            = _mm_mul_pd(qq13,_mm_sub_pd(_mm_mul_pd(rinv13,rinvsq13),krf2));
1649
1650             fscal            = felec;
1651
1652             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1653
1654             /* Calculate temporary vectorial force */
1655             tx               = _mm_mul_pd(fscal,dx13);
1656             ty               = _mm_mul_pd(fscal,dy13);
1657             tz               = _mm_mul_pd(fscal,dz13);
1658
1659             /* Update vectorial force */
1660             fix1             = _mm_add_pd(fix1,tx);
1661             fiy1             = _mm_add_pd(fiy1,ty);
1662             fiz1             = _mm_add_pd(fiz1,tz);
1663
1664             fjx3             = _mm_add_pd(fjx3,tx);
1665             fjy3             = _mm_add_pd(fjy3,ty);
1666             fjz3             = _mm_add_pd(fjz3,tz);
1667
1668             /**************************
1669              * CALCULATE INTERACTIONS *
1670              **************************/
1671
1672             /* REACTION-FIELD ELECTROSTATICS */
1673             felec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
1674
1675             fscal            = felec;
1676
1677             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1678
1679             /* Calculate temporary vectorial force */
1680             tx               = _mm_mul_pd(fscal,dx21);
1681             ty               = _mm_mul_pd(fscal,dy21);
1682             tz               = _mm_mul_pd(fscal,dz21);
1683
1684             /* Update vectorial force */
1685             fix2             = _mm_add_pd(fix2,tx);
1686             fiy2             = _mm_add_pd(fiy2,ty);
1687             fiz2             = _mm_add_pd(fiz2,tz);
1688
1689             fjx1             = _mm_add_pd(fjx1,tx);
1690             fjy1             = _mm_add_pd(fjy1,ty);
1691             fjz1             = _mm_add_pd(fjz1,tz);
1692
1693             /**************************
1694              * CALCULATE INTERACTIONS *
1695              **************************/
1696
1697             /* REACTION-FIELD ELECTROSTATICS */
1698             felec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
1699
1700             fscal            = felec;
1701
1702             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1703
1704             /* Calculate temporary vectorial force */
1705             tx               = _mm_mul_pd(fscal,dx22);
1706             ty               = _mm_mul_pd(fscal,dy22);
1707             tz               = _mm_mul_pd(fscal,dz22);
1708
1709             /* Update vectorial force */
1710             fix2             = _mm_add_pd(fix2,tx);
1711             fiy2             = _mm_add_pd(fiy2,ty);
1712             fiz2             = _mm_add_pd(fiz2,tz);
1713
1714             fjx2             = _mm_add_pd(fjx2,tx);
1715             fjy2             = _mm_add_pd(fjy2,ty);
1716             fjz2             = _mm_add_pd(fjz2,tz);
1717
1718             /**************************
1719              * CALCULATE INTERACTIONS *
1720              **************************/
1721
1722             /* REACTION-FIELD ELECTROSTATICS */
1723             felec            = _mm_mul_pd(qq23,_mm_sub_pd(_mm_mul_pd(rinv23,rinvsq23),krf2));
1724
1725             fscal            = felec;
1726
1727             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1728
1729             /* Calculate temporary vectorial force */
1730             tx               = _mm_mul_pd(fscal,dx23);
1731             ty               = _mm_mul_pd(fscal,dy23);
1732             tz               = _mm_mul_pd(fscal,dz23);
1733
1734             /* Update vectorial force */
1735             fix2             = _mm_add_pd(fix2,tx);
1736             fiy2             = _mm_add_pd(fiy2,ty);
1737             fiz2             = _mm_add_pd(fiz2,tz);
1738
1739             fjx3             = _mm_add_pd(fjx3,tx);
1740             fjy3             = _mm_add_pd(fjy3,ty);
1741             fjz3             = _mm_add_pd(fjz3,tz);
1742
1743             /**************************
1744              * CALCULATE INTERACTIONS *
1745              **************************/
1746
1747             /* REACTION-FIELD ELECTROSTATICS */
1748             felec            = _mm_mul_pd(qq31,_mm_sub_pd(_mm_mul_pd(rinv31,rinvsq31),krf2));
1749
1750             fscal            = felec;
1751
1752             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1753
1754             /* Calculate temporary vectorial force */
1755             tx               = _mm_mul_pd(fscal,dx31);
1756             ty               = _mm_mul_pd(fscal,dy31);
1757             tz               = _mm_mul_pd(fscal,dz31);
1758
1759             /* Update vectorial force */
1760             fix3             = _mm_add_pd(fix3,tx);
1761             fiy3             = _mm_add_pd(fiy3,ty);
1762             fiz3             = _mm_add_pd(fiz3,tz);
1763
1764             fjx1             = _mm_add_pd(fjx1,tx);
1765             fjy1             = _mm_add_pd(fjy1,ty);
1766             fjz1             = _mm_add_pd(fjz1,tz);
1767
1768             /**************************
1769              * CALCULATE INTERACTIONS *
1770              **************************/
1771
1772             /* REACTION-FIELD ELECTROSTATICS */
1773             felec            = _mm_mul_pd(qq32,_mm_sub_pd(_mm_mul_pd(rinv32,rinvsq32),krf2));
1774
1775             fscal            = felec;
1776
1777             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1778
1779             /* Calculate temporary vectorial force */
1780             tx               = _mm_mul_pd(fscal,dx32);
1781             ty               = _mm_mul_pd(fscal,dy32);
1782             tz               = _mm_mul_pd(fscal,dz32);
1783
1784             /* Update vectorial force */
1785             fix3             = _mm_add_pd(fix3,tx);
1786             fiy3             = _mm_add_pd(fiy3,ty);
1787             fiz3             = _mm_add_pd(fiz3,tz);
1788
1789             fjx2             = _mm_add_pd(fjx2,tx);
1790             fjy2             = _mm_add_pd(fjy2,ty);
1791             fjz2             = _mm_add_pd(fjz2,tz);
1792
1793             /**************************
1794              * CALCULATE INTERACTIONS *
1795              **************************/
1796
1797             /* REACTION-FIELD ELECTROSTATICS */
1798             felec            = _mm_mul_pd(qq33,_mm_sub_pd(_mm_mul_pd(rinv33,rinvsq33),krf2));
1799
1800             fscal            = felec;
1801
1802             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1803
1804             /* Calculate temporary vectorial force */
1805             tx               = _mm_mul_pd(fscal,dx33);
1806             ty               = _mm_mul_pd(fscal,dy33);
1807             tz               = _mm_mul_pd(fscal,dz33);
1808
1809             /* Update vectorial force */
1810             fix3             = _mm_add_pd(fix3,tx);
1811             fiy3             = _mm_add_pd(fiy3,ty);
1812             fiz3             = _mm_add_pd(fiz3,tz);
1813
1814             fjx3             = _mm_add_pd(fjx3,tx);
1815             fjy3             = _mm_add_pd(fjy3,ty);
1816             fjz3             = _mm_add_pd(fjz3,tz);
1817
1818             gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1819
1820             /* Inner loop uses 273 flops */
1821         }
1822
1823         /* End of innermost loop */
1824
1825         gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1826                                               f+i_coord_offset,fshift+i_shift_offset);
1827
1828         /* Increment number of inner iterations */
1829         inneriter                  += j_index_end - j_index_start;
1830
1831         /* Outer loop uses 24 flops */
1832     }
1833
1834     /* Increment number of outer iterations */
1835     outeriter        += nri;
1836
1837     /* Update outer/inner flops */
1838
1839     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*273);
1840 }