Merge release-5-0 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecRF_VdwLJ_GeomW3W3_sse2_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse2_single kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/legacyheaders/nrnb.h"
48
49 #include "gromacs/simd/math_x86_sse2_single.h"
50 #include "kernelutil_x86_sse2_single.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse2_single
54  * Electrostatics interaction: ReactionField
55  * VdW interaction:            LennardJones
56  * Geometry:                   Water3-Water3
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse2_single
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
70      * just 0 for non-waters.
71      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB,jnrC,jnrD;
77     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
80     real             rcutoff_scalar;
81     real             *shiftvec,*fshift,*x,*f;
82     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83     real             scratch[4*DIM];
84     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85     int              vdwioffset0;
86     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87     int              vdwioffset1;
88     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89     int              vdwioffset2;
90     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
98     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
99     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
100     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
101     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
102     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
103     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
104     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
105     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
106     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
107     real             *charge;
108     int              nvdwtype;
109     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
110     int              *vdwtype;
111     real             *vdwparam;
112     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
113     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
114     __m128           dummy_mask,cutoff_mask;
115     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
116     __m128           one     = _mm_set1_ps(1.0);
117     __m128           two     = _mm_set1_ps(2.0);
118     x                = xx[0];
119     f                = ff[0];
120
121     nri              = nlist->nri;
122     iinr             = nlist->iinr;
123     jindex           = nlist->jindex;
124     jjnr             = nlist->jjnr;
125     shiftidx         = nlist->shift;
126     gid              = nlist->gid;
127     shiftvec         = fr->shift_vec[0];
128     fshift           = fr->fshift[0];
129     facel            = _mm_set1_ps(fr->epsfac);
130     charge           = mdatoms->chargeA;
131     krf              = _mm_set1_ps(fr->ic->k_rf);
132     krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
133     crf              = _mm_set1_ps(fr->ic->c_rf);
134     nvdwtype         = fr->ntype;
135     vdwparam         = fr->nbfp;
136     vdwtype          = mdatoms->typeA;
137
138     /* Setup water-specific parameters */
139     inr              = nlist->iinr[0];
140     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
141     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
142     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
143     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
144
145     jq0              = _mm_set1_ps(charge[inr+0]);
146     jq1              = _mm_set1_ps(charge[inr+1]);
147     jq2              = _mm_set1_ps(charge[inr+2]);
148     vdwjidx0A        = 2*vdwtype[inr+0];
149     qq00             = _mm_mul_ps(iq0,jq0);
150     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
151     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
152     qq01             = _mm_mul_ps(iq0,jq1);
153     qq02             = _mm_mul_ps(iq0,jq2);
154     qq10             = _mm_mul_ps(iq1,jq0);
155     qq11             = _mm_mul_ps(iq1,jq1);
156     qq12             = _mm_mul_ps(iq1,jq2);
157     qq20             = _mm_mul_ps(iq2,jq0);
158     qq21             = _mm_mul_ps(iq2,jq1);
159     qq22             = _mm_mul_ps(iq2,jq2);
160
161     /* Avoid stupid compiler warnings */
162     jnrA = jnrB = jnrC = jnrD = 0;
163     j_coord_offsetA = 0;
164     j_coord_offsetB = 0;
165     j_coord_offsetC = 0;
166     j_coord_offsetD = 0;
167
168     outeriter        = 0;
169     inneriter        = 0;
170
171     for(iidx=0;iidx<4*DIM;iidx++)
172     {
173         scratch[iidx] = 0.0;
174     }  
175
176     /* Start outer loop over neighborlists */
177     for(iidx=0; iidx<nri; iidx++)
178     {
179         /* Load shift vector for this list */
180         i_shift_offset   = DIM*shiftidx[iidx];
181
182         /* Load limits for loop over neighbors */
183         j_index_start    = jindex[iidx];
184         j_index_end      = jindex[iidx+1];
185
186         /* Get outer coordinate index */
187         inr              = iinr[iidx];
188         i_coord_offset   = DIM*inr;
189
190         /* Load i particle coords and add shift vector */
191         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
192                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
193         
194         fix0             = _mm_setzero_ps();
195         fiy0             = _mm_setzero_ps();
196         fiz0             = _mm_setzero_ps();
197         fix1             = _mm_setzero_ps();
198         fiy1             = _mm_setzero_ps();
199         fiz1             = _mm_setzero_ps();
200         fix2             = _mm_setzero_ps();
201         fiy2             = _mm_setzero_ps();
202         fiz2             = _mm_setzero_ps();
203
204         /* Reset potential sums */
205         velecsum         = _mm_setzero_ps();
206         vvdwsum          = _mm_setzero_ps();
207
208         /* Start inner kernel loop */
209         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
210         {
211
212             /* Get j neighbor index, and coordinate index */
213             jnrA             = jjnr[jidx];
214             jnrB             = jjnr[jidx+1];
215             jnrC             = jjnr[jidx+2];
216             jnrD             = jjnr[jidx+3];
217             j_coord_offsetA  = DIM*jnrA;
218             j_coord_offsetB  = DIM*jnrB;
219             j_coord_offsetC  = DIM*jnrC;
220             j_coord_offsetD  = DIM*jnrD;
221
222             /* load j atom coordinates */
223             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
224                                               x+j_coord_offsetC,x+j_coord_offsetD,
225                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
226
227             /* Calculate displacement vector */
228             dx00             = _mm_sub_ps(ix0,jx0);
229             dy00             = _mm_sub_ps(iy0,jy0);
230             dz00             = _mm_sub_ps(iz0,jz0);
231             dx01             = _mm_sub_ps(ix0,jx1);
232             dy01             = _mm_sub_ps(iy0,jy1);
233             dz01             = _mm_sub_ps(iz0,jz1);
234             dx02             = _mm_sub_ps(ix0,jx2);
235             dy02             = _mm_sub_ps(iy0,jy2);
236             dz02             = _mm_sub_ps(iz0,jz2);
237             dx10             = _mm_sub_ps(ix1,jx0);
238             dy10             = _mm_sub_ps(iy1,jy0);
239             dz10             = _mm_sub_ps(iz1,jz0);
240             dx11             = _mm_sub_ps(ix1,jx1);
241             dy11             = _mm_sub_ps(iy1,jy1);
242             dz11             = _mm_sub_ps(iz1,jz1);
243             dx12             = _mm_sub_ps(ix1,jx2);
244             dy12             = _mm_sub_ps(iy1,jy2);
245             dz12             = _mm_sub_ps(iz1,jz2);
246             dx20             = _mm_sub_ps(ix2,jx0);
247             dy20             = _mm_sub_ps(iy2,jy0);
248             dz20             = _mm_sub_ps(iz2,jz0);
249             dx21             = _mm_sub_ps(ix2,jx1);
250             dy21             = _mm_sub_ps(iy2,jy1);
251             dz21             = _mm_sub_ps(iz2,jz1);
252             dx22             = _mm_sub_ps(ix2,jx2);
253             dy22             = _mm_sub_ps(iy2,jy2);
254             dz22             = _mm_sub_ps(iz2,jz2);
255
256             /* Calculate squared distance and things based on it */
257             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
258             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
259             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
260             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
261             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
262             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
263             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
264             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
265             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
266
267             rinv00           = gmx_mm_invsqrt_ps(rsq00);
268             rinv01           = gmx_mm_invsqrt_ps(rsq01);
269             rinv02           = gmx_mm_invsqrt_ps(rsq02);
270             rinv10           = gmx_mm_invsqrt_ps(rsq10);
271             rinv11           = gmx_mm_invsqrt_ps(rsq11);
272             rinv12           = gmx_mm_invsqrt_ps(rsq12);
273             rinv20           = gmx_mm_invsqrt_ps(rsq20);
274             rinv21           = gmx_mm_invsqrt_ps(rsq21);
275             rinv22           = gmx_mm_invsqrt_ps(rsq22);
276
277             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
278             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
279             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
280             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
281             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
282             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
283             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
284             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
285             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
286
287             fjx0             = _mm_setzero_ps();
288             fjy0             = _mm_setzero_ps();
289             fjz0             = _mm_setzero_ps();
290             fjx1             = _mm_setzero_ps();
291             fjy1             = _mm_setzero_ps();
292             fjz1             = _mm_setzero_ps();
293             fjx2             = _mm_setzero_ps();
294             fjy2             = _mm_setzero_ps();
295             fjz2             = _mm_setzero_ps();
296
297             /**************************
298              * CALCULATE INTERACTIONS *
299              **************************/
300
301             /* REACTION-FIELD ELECTROSTATICS */
302             velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
303             felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
304
305             /* LENNARD-JONES DISPERSION/REPULSION */
306
307             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
308             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
309             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
310             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
311             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
312
313             /* Update potential sum for this i atom from the interaction with this j atom. */
314             velecsum         = _mm_add_ps(velecsum,velec);
315             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
316
317             fscal            = _mm_add_ps(felec,fvdw);
318
319             /* Calculate temporary vectorial force */
320             tx               = _mm_mul_ps(fscal,dx00);
321             ty               = _mm_mul_ps(fscal,dy00);
322             tz               = _mm_mul_ps(fscal,dz00);
323
324             /* Update vectorial force */
325             fix0             = _mm_add_ps(fix0,tx);
326             fiy0             = _mm_add_ps(fiy0,ty);
327             fiz0             = _mm_add_ps(fiz0,tz);
328
329             fjx0             = _mm_add_ps(fjx0,tx);
330             fjy0             = _mm_add_ps(fjy0,ty);
331             fjz0             = _mm_add_ps(fjz0,tz);
332             
333             /**************************
334              * CALCULATE INTERACTIONS *
335              **************************/
336
337             /* REACTION-FIELD ELECTROSTATICS */
338             velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
339             felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
340
341             /* Update potential sum for this i atom from the interaction with this j atom. */
342             velecsum         = _mm_add_ps(velecsum,velec);
343
344             fscal            = felec;
345
346             /* Calculate temporary vectorial force */
347             tx               = _mm_mul_ps(fscal,dx01);
348             ty               = _mm_mul_ps(fscal,dy01);
349             tz               = _mm_mul_ps(fscal,dz01);
350
351             /* Update vectorial force */
352             fix0             = _mm_add_ps(fix0,tx);
353             fiy0             = _mm_add_ps(fiy0,ty);
354             fiz0             = _mm_add_ps(fiz0,tz);
355
356             fjx1             = _mm_add_ps(fjx1,tx);
357             fjy1             = _mm_add_ps(fjy1,ty);
358             fjz1             = _mm_add_ps(fjz1,tz);
359             
360             /**************************
361              * CALCULATE INTERACTIONS *
362              **************************/
363
364             /* REACTION-FIELD ELECTROSTATICS */
365             velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
366             felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
367
368             /* Update potential sum for this i atom from the interaction with this j atom. */
369             velecsum         = _mm_add_ps(velecsum,velec);
370
371             fscal            = felec;
372
373             /* Calculate temporary vectorial force */
374             tx               = _mm_mul_ps(fscal,dx02);
375             ty               = _mm_mul_ps(fscal,dy02);
376             tz               = _mm_mul_ps(fscal,dz02);
377
378             /* Update vectorial force */
379             fix0             = _mm_add_ps(fix0,tx);
380             fiy0             = _mm_add_ps(fiy0,ty);
381             fiz0             = _mm_add_ps(fiz0,tz);
382
383             fjx2             = _mm_add_ps(fjx2,tx);
384             fjy2             = _mm_add_ps(fjy2,ty);
385             fjz2             = _mm_add_ps(fjz2,tz);
386             
387             /**************************
388              * CALCULATE INTERACTIONS *
389              **************************/
390
391             /* REACTION-FIELD ELECTROSTATICS */
392             velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
393             felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
394
395             /* Update potential sum for this i atom from the interaction with this j atom. */
396             velecsum         = _mm_add_ps(velecsum,velec);
397
398             fscal            = felec;
399
400             /* Calculate temporary vectorial force */
401             tx               = _mm_mul_ps(fscal,dx10);
402             ty               = _mm_mul_ps(fscal,dy10);
403             tz               = _mm_mul_ps(fscal,dz10);
404
405             /* Update vectorial force */
406             fix1             = _mm_add_ps(fix1,tx);
407             fiy1             = _mm_add_ps(fiy1,ty);
408             fiz1             = _mm_add_ps(fiz1,tz);
409
410             fjx0             = _mm_add_ps(fjx0,tx);
411             fjy0             = _mm_add_ps(fjy0,ty);
412             fjz0             = _mm_add_ps(fjz0,tz);
413             
414             /**************************
415              * CALCULATE INTERACTIONS *
416              **************************/
417
418             /* REACTION-FIELD ELECTROSTATICS */
419             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
420             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
421
422             /* Update potential sum for this i atom from the interaction with this j atom. */
423             velecsum         = _mm_add_ps(velecsum,velec);
424
425             fscal            = felec;
426
427             /* Calculate temporary vectorial force */
428             tx               = _mm_mul_ps(fscal,dx11);
429             ty               = _mm_mul_ps(fscal,dy11);
430             tz               = _mm_mul_ps(fscal,dz11);
431
432             /* Update vectorial force */
433             fix1             = _mm_add_ps(fix1,tx);
434             fiy1             = _mm_add_ps(fiy1,ty);
435             fiz1             = _mm_add_ps(fiz1,tz);
436
437             fjx1             = _mm_add_ps(fjx1,tx);
438             fjy1             = _mm_add_ps(fjy1,ty);
439             fjz1             = _mm_add_ps(fjz1,tz);
440             
441             /**************************
442              * CALCULATE INTERACTIONS *
443              **************************/
444
445             /* REACTION-FIELD ELECTROSTATICS */
446             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
447             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
448
449             /* Update potential sum for this i atom from the interaction with this j atom. */
450             velecsum         = _mm_add_ps(velecsum,velec);
451
452             fscal            = felec;
453
454             /* Calculate temporary vectorial force */
455             tx               = _mm_mul_ps(fscal,dx12);
456             ty               = _mm_mul_ps(fscal,dy12);
457             tz               = _mm_mul_ps(fscal,dz12);
458
459             /* Update vectorial force */
460             fix1             = _mm_add_ps(fix1,tx);
461             fiy1             = _mm_add_ps(fiy1,ty);
462             fiz1             = _mm_add_ps(fiz1,tz);
463
464             fjx2             = _mm_add_ps(fjx2,tx);
465             fjy2             = _mm_add_ps(fjy2,ty);
466             fjz2             = _mm_add_ps(fjz2,tz);
467             
468             /**************************
469              * CALCULATE INTERACTIONS *
470              **************************/
471
472             /* REACTION-FIELD ELECTROSTATICS */
473             velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
474             felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
475
476             /* Update potential sum for this i atom from the interaction with this j atom. */
477             velecsum         = _mm_add_ps(velecsum,velec);
478
479             fscal            = felec;
480
481             /* Calculate temporary vectorial force */
482             tx               = _mm_mul_ps(fscal,dx20);
483             ty               = _mm_mul_ps(fscal,dy20);
484             tz               = _mm_mul_ps(fscal,dz20);
485
486             /* Update vectorial force */
487             fix2             = _mm_add_ps(fix2,tx);
488             fiy2             = _mm_add_ps(fiy2,ty);
489             fiz2             = _mm_add_ps(fiz2,tz);
490
491             fjx0             = _mm_add_ps(fjx0,tx);
492             fjy0             = _mm_add_ps(fjy0,ty);
493             fjz0             = _mm_add_ps(fjz0,tz);
494             
495             /**************************
496              * CALCULATE INTERACTIONS *
497              **************************/
498
499             /* REACTION-FIELD ELECTROSTATICS */
500             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
501             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
502
503             /* Update potential sum for this i atom from the interaction with this j atom. */
504             velecsum         = _mm_add_ps(velecsum,velec);
505
506             fscal            = felec;
507
508             /* Calculate temporary vectorial force */
509             tx               = _mm_mul_ps(fscal,dx21);
510             ty               = _mm_mul_ps(fscal,dy21);
511             tz               = _mm_mul_ps(fscal,dz21);
512
513             /* Update vectorial force */
514             fix2             = _mm_add_ps(fix2,tx);
515             fiy2             = _mm_add_ps(fiy2,ty);
516             fiz2             = _mm_add_ps(fiz2,tz);
517
518             fjx1             = _mm_add_ps(fjx1,tx);
519             fjy1             = _mm_add_ps(fjy1,ty);
520             fjz1             = _mm_add_ps(fjz1,tz);
521             
522             /**************************
523              * CALCULATE INTERACTIONS *
524              **************************/
525
526             /* REACTION-FIELD ELECTROSTATICS */
527             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
528             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
529
530             /* Update potential sum for this i atom from the interaction with this j atom. */
531             velecsum         = _mm_add_ps(velecsum,velec);
532
533             fscal            = felec;
534
535             /* Calculate temporary vectorial force */
536             tx               = _mm_mul_ps(fscal,dx22);
537             ty               = _mm_mul_ps(fscal,dy22);
538             tz               = _mm_mul_ps(fscal,dz22);
539
540             /* Update vectorial force */
541             fix2             = _mm_add_ps(fix2,tx);
542             fiy2             = _mm_add_ps(fiy2,ty);
543             fiz2             = _mm_add_ps(fiz2,tz);
544
545             fjx2             = _mm_add_ps(fjx2,tx);
546             fjy2             = _mm_add_ps(fjy2,ty);
547             fjz2             = _mm_add_ps(fjz2,tz);
548             
549             fjptrA             = f+j_coord_offsetA;
550             fjptrB             = f+j_coord_offsetB;
551             fjptrC             = f+j_coord_offsetC;
552             fjptrD             = f+j_coord_offsetD;
553
554             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
555                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
556
557             /* Inner loop uses 300 flops */
558         }
559
560         if(jidx<j_index_end)
561         {
562
563             /* Get j neighbor index, and coordinate index */
564             jnrlistA         = jjnr[jidx];
565             jnrlistB         = jjnr[jidx+1];
566             jnrlistC         = jjnr[jidx+2];
567             jnrlistD         = jjnr[jidx+3];
568             /* Sign of each element will be negative for non-real atoms.
569              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
570              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
571              */
572             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
573             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
574             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
575             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
576             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
577             j_coord_offsetA  = DIM*jnrA;
578             j_coord_offsetB  = DIM*jnrB;
579             j_coord_offsetC  = DIM*jnrC;
580             j_coord_offsetD  = DIM*jnrD;
581
582             /* load j atom coordinates */
583             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
584                                               x+j_coord_offsetC,x+j_coord_offsetD,
585                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
586
587             /* Calculate displacement vector */
588             dx00             = _mm_sub_ps(ix0,jx0);
589             dy00             = _mm_sub_ps(iy0,jy0);
590             dz00             = _mm_sub_ps(iz0,jz0);
591             dx01             = _mm_sub_ps(ix0,jx1);
592             dy01             = _mm_sub_ps(iy0,jy1);
593             dz01             = _mm_sub_ps(iz0,jz1);
594             dx02             = _mm_sub_ps(ix0,jx2);
595             dy02             = _mm_sub_ps(iy0,jy2);
596             dz02             = _mm_sub_ps(iz0,jz2);
597             dx10             = _mm_sub_ps(ix1,jx0);
598             dy10             = _mm_sub_ps(iy1,jy0);
599             dz10             = _mm_sub_ps(iz1,jz0);
600             dx11             = _mm_sub_ps(ix1,jx1);
601             dy11             = _mm_sub_ps(iy1,jy1);
602             dz11             = _mm_sub_ps(iz1,jz1);
603             dx12             = _mm_sub_ps(ix1,jx2);
604             dy12             = _mm_sub_ps(iy1,jy2);
605             dz12             = _mm_sub_ps(iz1,jz2);
606             dx20             = _mm_sub_ps(ix2,jx0);
607             dy20             = _mm_sub_ps(iy2,jy0);
608             dz20             = _mm_sub_ps(iz2,jz0);
609             dx21             = _mm_sub_ps(ix2,jx1);
610             dy21             = _mm_sub_ps(iy2,jy1);
611             dz21             = _mm_sub_ps(iz2,jz1);
612             dx22             = _mm_sub_ps(ix2,jx2);
613             dy22             = _mm_sub_ps(iy2,jy2);
614             dz22             = _mm_sub_ps(iz2,jz2);
615
616             /* Calculate squared distance and things based on it */
617             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
618             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
619             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
620             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
621             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
622             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
623             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
624             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
625             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
626
627             rinv00           = gmx_mm_invsqrt_ps(rsq00);
628             rinv01           = gmx_mm_invsqrt_ps(rsq01);
629             rinv02           = gmx_mm_invsqrt_ps(rsq02);
630             rinv10           = gmx_mm_invsqrt_ps(rsq10);
631             rinv11           = gmx_mm_invsqrt_ps(rsq11);
632             rinv12           = gmx_mm_invsqrt_ps(rsq12);
633             rinv20           = gmx_mm_invsqrt_ps(rsq20);
634             rinv21           = gmx_mm_invsqrt_ps(rsq21);
635             rinv22           = gmx_mm_invsqrt_ps(rsq22);
636
637             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
638             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
639             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
640             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
641             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
642             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
643             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
644             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
645             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
646
647             fjx0             = _mm_setzero_ps();
648             fjy0             = _mm_setzero_ps();
649             fjz0             = _mm_setzero_ps();
650             fjx1             = _mm_setzero_ps();
651             fjy1             = _mm_setzero_ps();
652             fjz1             = _mm_setzero_ps();
653             fjx2             = _mm_setzero_ps();
654             fjy2             = _mm_setzero_ps();
655             fjz2             = _mm_setzero_ps();
656
657             /**************************
658              * CALCULATE INTERACTIONS *
659              **************************/
660
661             /* REACTION-FIELD ELECTROSTATICS */
662             velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
663             felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
664
665             /* LENNARD-JONES DISPERSION/REPULSION */
666
667             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
668             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
669             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
670             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
671             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
672
673             /* Update potential sum for this i atom from the interaction with this j atom. */
674             velec            = _mm_andnot_ps(dummy_mask,velec);
675             velecsum         = _mm_add_ps(velecsum,velec);
676             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
677             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
678
679             fscal            = _mm_add_ps(felec,fvdw);
680
681             fscal            = _mm_andnot_ps(dummy_mask,fscal);
682
683             /* Calculate temporary vectorial force */
684             tx               = _mm_mul_ps(fscal,dx00);
685             ty               = _mm_mul_ps(fscal,dy00);
686             tz               = _mm_mul_ps(fscal,dz00);
687
688             /* Update vectorial force */
689             fix0             = _mm_add_ps(fix0,tx);
690             fiy0             = _mm_add_ps(fiy0,ty);
691             fiz0             = _mm_add_ps(fiz0,tz);
692
693             fjx0             = _mm_add_ps(fjx0,tx);
694             fjy0             = _mm_add_ps(fjy0,ty);
695             fjz0             = _mm_add_ps(fjz0,tz);
696             
697             /**************************
698              * CALCULATE INTERACTIONS *
699              **************************/
700
701             /* REACTION-FIELD ELECTROSTATICS */
702             velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
703             felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
704
705             /* Update potential sum for this i atom from the interaction with this j atom. */
706             velec            = _mm_andnot_ps(dummy_mask,velec);
707             velecsum         = _mm_add_ps(velecsum,velec);
708
709             fscal            = felec;
710
711             fscal            = _mm_andnot_ps(dummy_mask,fscal);
712
713             /* Calculate temporary vectorial force */
714             tx               = _mm_mul_ps(fscal,dx01);
715             ty               = _mm_mul_ps(fscal,dy01);
716             tz               = _mm_mul_ps(fscal,dz01);
717
718             /* Update vectorial force */
719             fix0             = _mm_add_ps(fix0,tx);
720             fiy0             = _mm_add_ps(fiy0,ty);
721             fiz0             = _mm_add_ps(fiz0,tz);
722
723             fjx1             = _mm_add_ps(fjx1,tx);
724             fjy1             = _mm_add_ps(fjy1,ty);
725             fjz1             = _mm_add_ps(fjz1,tz);
726             
727             /**************************
728              * CALCULATE INTERACTIONS *
729              **************************/
730
731             /* REACTION-FIELD ELECTROSTATICS */
732             velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
733             felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
734
735             /* Update potential sum for this i atom from the interaction with this j atom. */
736             velec            = _mm_andnot_ps(dummy_mask,velec);
737             velecsum         = _mm_add_ps(velecsum,velec);
738
739             fscal            = felec;
740
741             fscal            = _mm_andnot_ps(dummy_mask,fscal);
742
743             /* Calculate temporary vectorial force */
744             tx               = _mm_mul_ps(fscal,dx02);
745             ty               = _mm_mul_ps(fscal,dy02);
746             tz               = _mm_mul_ps(fscal,dz02);
747
748             /* Update vectorial force */
749             fix0             = _mm_add_ps(fix0,tx);
750             fiy0             = _mm_add_ps(fiy0,ty);
751             fiz0             = _mm_add_ps(fiz0,tz);
752
753             fjx2             = _mm_add_ps(fjx2,tx);
754             fjy2             = _mm_add_ps(fjy2,ty);
755             fjz2             = _mm_add_ps(fjz2,tz);
756             
757             /**************************
758              * CALCULATE INTERACTIONS *
759              **************************/
760
761             /* REACTION-FIELD ELECTROSTATICS */
762             velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
763             felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
764
765             /* Update potential sum for this i atom from the interaction with this j atom. */
766             velec            = _mm_andnot_ps(dummy_mask,velec);
767             velecsum         = _mm_add_ps(velecsum,velec);
768
769             fscal            = felec;
770
771             fscal            = _mm_andnot_ps(dummy_mask,fscal);
772
773             /* Calculate temporary vectorial force */
774             tx               = _mm_mul_ps(fscal,dx10);
775             ty               = _mm_mul_ps(fscal,dy10);
776             tz               = _mm_mul_ps(fscal,dz10);
777
778             /* Update vectorial force */
779             fix1             = _mm_add_ps(fix1,tx);
780             fiy1             = _mm_add_ps(fiy1,ty);
781             fiz1             = _mm_add_ps(fiz1,tz);
782
783             fjx0             = _mm_add_ps(fjx0,tx);
784             fjy0             = _mm_add_ps(fjy0,ty);
785             fjz0             = _mm_add_ps(fjz0,tz);
786             
787             /**************************
788              * CALCULATE INTERACTIONS *
789              **************************/
790
791             /* REACTION-FIELD ELECTROSTATICS */
792             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
793             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
794
795             /* Update potential sum for this i atom from the interaction with this j atom. */
796             velec            = _mm_andnot_ps(dummy_mask,velec);
797             velecsum         = _mm_add_ps(velecsum,velec);
798
799             fscal            = felec;
800
801             fscal            = _mm_andnot_ps(dummy_mask,fscal);
802
803             /* Calculate temporary vectorial force */
804             tx               = _mm_mul_ps(fscal,dx11);
805             ty               = _mm_mul_ps(fscal,dy11);
806             tz               = _mm_mul_ps(fscal,dz11);
807
808             /* Update vectorial force */
809             fix1             = _mm_add_ps(fix1,tx);
810             fiy1             = _mm_add_ps(fiy1,ty);
811             fiz1             = _mm_add_ps(fiz1,tz);
812
813             fjx1             = _mm_add_ps(fjx1,tx);
814             fjy1             = _mm_add_ps(fjy1,ty);
815             fjz1             = _mm_add_ps(fjz1,tz);
816             
817             /**************************
818              * CALCULATE INTERACTIONS *
819              **************************/
820
821             /* REACTION-FIELD ELECTROSTATICS */
822             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
823             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
824
825             /* Update potential sum for this i atom from the interaction with this j atom. */
826             velec            = _mm_andnot_ps(dummy_mask,velec);
827             velecsum         = _mm_add_ps(velecsum,velec);
828
829             fscal            = felec;
830
831             fscal            = _mm_andnot_ps(dummy_mask,fscal);
832
833             /* Calculate temporary vectorial force */
834             tx               = _mm_mul_ps(fscal,dx12);
835             ty               = _mm_mul_ps(fscal,dy12);
836             tz               = _mm_mul_ps(fscal,dz12);
837
838             /* Update vectorial force */
839             fix1             = _mm_add_ps(fix1,tx);
840             fiy1             = _mm_add_ps(fiy1,ty);
841             fiz1             = _mm_add_ps(fiz1,tz);
842
843             fjx2             = _mm_add_ps(fjx2,tx);
844             fjy2             = _mm_add_ps(fjy2,ty);
845             fjz2             = _mm_add_ps(fjz2,tz);
846             
847             /**************************
848              * CALCULATE INTERACTIONS *
849              **************************/
850
851             /* REACTION-FIELD ELECTROSTATICS */
852             velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
853             felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
854
855             /* Update potential sum for this i atom from the interaction with this j atom. */
856             velec            = _mm_andnot_ps(dummy_mask,velec);
857             velecsum         = _mm_add_ps(velecsum,velec);
858
859             fscal            = felec;
860
861             fscal            = _mm_andnot_ps(dummy_mask,fscal);
862
863             /* Calculate temporary vectorial force */
864             tx               = _mm_mul_ps(fscal,dx20);
865             ty               = _mm_mul_ps(fscal,dy20);
866             tz               = _mm_mul_ps(fscal,dz20);
867
868             /* Update vectorial force */
869             fix2             = _mm_add_ps(fix2,tx);
870             fiy2             = _mm_add_ps(fiy2,ty);
871             fiz2             = _mm_add_ps(fiz2,tz);
872
873             fjx0             = _mm_add_ps(fjx0,tx);
874             fjy0             = _mm_add_ps(fjy0,ty);
875             fjz0             = _mm_add_ps(fjz0,tz);
876             
877             /**************************
878              * CALCULATE INTERACTIONS *
879              **************************/
880
881             /* REACTION-FIELD ELECTROSTATICS */
882             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
883             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
884
885             /* Update potential sum for this i atom from the interaction with this j atom. */
886             velec            = _mm_andnot_ps(dummy_mask,velec);
887             velecsum         = _mm_add_ps(velecsum,velec);
888
889             fscal            = felec;
890
891             fscal            = _mm_andnot_ps(dummy_mask,fscal);
892
893             /* Calculate temporary vectorial force */
894             tx               = _mm_mul_ps(fscal,dx21);
895             ty               = _mm_mul_ps(fscal,dy21);
896             tz               = _mm_mul_ps(fscal,dz21);
897
898             /* Update vectorial force */
899             fix2             = _mm_add_ps(fix2,tx);
900             fiy2             = _mm_add_ps(fiy2,ty);
901             fiz2             = _mm_add_ps(fiz2,tz);
902
903             fjx1             = _mm_add_ps(fjx1,tx);
904             fjy1             = _mm_add_ps(fjy1,ty);
905             fjz1             = _mm_add_ps(fjz1,tz);
906             
907             /**************************
908              * CALCULATE INTERACTIONS *
909              **************************/
910
911             /* REACTION-FIELD ELECTROSTATICS */
912             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
913             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
914
915             /* Update potential sum for this i atom from the interaction with this j atom. */
916             velec            = _mm_andnot_ps(dummy_mask,velec);
917             velecsum         = _mm_add_ps(velecsum,velec);
918
919             fscal            = felec;
920
921             fscal            = _mm_andnot_ps(dummy_mask,fscal);
922
923             /* Calculate temporary vectorial force */
924             tx               = _mm_mul_ps(fscal,dx22);
925             ty               = _mm_mul_ps(fscal,dy22);
926             tz               = _mm_mul_ps(fscal,dz22);
927
928             /* Update vectorial force */
929             fix2             = _mm_add_ps(fix2,tx);
930             fiy2             = _mm_add_ps(fiy2,ty);
931             fiz2             = _mm_add_ps(fiz2,tz);
932
933             fjx2             = _mm_add_ps(fjx2,tx);
934             fjy2             = _mm_add_ps(fjy2,ty);
935             fjz2             = _mm_add_ps(fjz2,tz);
936             
937             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
938             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
939             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
940             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
941
942             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
943                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
944
945             /* Inner loop uses 300 flops */
946         }
947
948         /* End of innermost loop */
949
950         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
951                                               f+i_coord_offset,fshift+i_shift_offset);
952
953         ggid                        = gid[iidx];
954         /* Update potential energies */
955         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
956         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
957
958         /* Increment number of inner iterations */
959         inneriter                  += j_index_end - j_index_start;
960
961         /* Outer loop uses 20 flops */
962     }
963
964     /* Increment number of outer iterations */
965     outeriter        += nri;
966
967     /* Update outer/inner flops */
968
969     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*300);
970 }
971 /*
972  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse2_single
973  * Electrostatics interaction: ReactionField
974  * VdW interaction:            LennardJones
975  * Geometry:                   Water3-Water3
976  * Calculate force/pot:        Force
977  */
978 void
979 nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse2_single
980                     (t_nblist                    * gmx_restrict       nlist,
981                      rvec                        * gmx_restrict          xx,
982                      rvec                        * gmx_restrict          ff,
983                      t_forcerec                  * gmx_restrict          fr,
984                      t_mdatoms                   * gmx_restrict     mdatoms,
985                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
986                      t_nrnb                      * gmx_restrict        nrnb)
987 {
988     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
989      * just 0 for non-waters.
990      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
991      * jnr indices corresponding to data put in the four positions in the SIMD register.
992      */
993     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
994     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
995     int              jnrA,jnrB,jnrC,jnrD;
996     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
997     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
998     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
999     real             rcutoff_scalar;
1000     real             *shiftvec,*fshift,*x,*f;
1001     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1002     real             scratch[4*DIM];
1003     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1004     int              vdwioffset0;
1005     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1006     int              vdwioffset1;
1007     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1008     int              vdwioffset2;
1009     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1010     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1011     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1012     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1013     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1014     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1015     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1016     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1017     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1018     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1019     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1020     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1021     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1022     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1023     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1024     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1025     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1026     real             *charge;
1027     int              nvdwtype;
1028     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1029     int              *vdwtype;
1030     real             *vdwparam;
1031     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1032     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1033     __m128           dummy_mask,cutoff_mask;
1034     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1035     __m128           one     = _mm_set1_ps(1.0);
1036     __m128           two     = _mm_set1_ps(2.0);
1037     x                = xx[0];
1038     f                = ff[0];
1039
1040     nri              = nlist->nri;
1041     iinr             = nlist->iinr;
1042     jindex           = nlist->jindex;
1043     jjnr             = nlist->jjnr;
1044     shiftidx         = nlist->shift;
1045     gid              = nlist->gid;
1046     shiftvec         = fr->shift_vec[0];
1047     fshift           = fr->fshift[0];
1048     facel            = _mm_set1_ps(fr->epsfac);
1049     charge           = mdatoms->chargeA;
1050     krf              = _mm_set1_ps(fr->ic->k_rf);
1051     krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
1052     crf              = _mm_set1_ps(fr->ic->c_rf);
1053     nvdwtype         = fr->ntype;
1054     vdwparam         = fr->nbfp;
1055     vdwtype          = mdatoms->typeA;
1056
1057     /* Setup water-specific parameters */
1058     inr              = nlist->iinr[0];
1059     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1060     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1061     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1062     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1063
1064     jq0              = _mm_set1_ps(charge[inr+0]);
1065     jq1              = _mm_set1_ps(charge[inr+1]);
1066     jq2              = _mm_set1_ps(charge[inr+2]);
1067     vdwjidx0A        = 2*vdwtype[inr+0];
1068     qq00             = _mm_mul_ps(iq0,jq0);
1069     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1070     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1071     qq01             = _mm_mul_ps(iq0,jq1);
1072     qq02             = _mm_mul_ps(iq0,jq2);
1073     qq10             = _mm_mul_ps(iq1,jq0);
1074     qq11             = _mm_mul_ps(iq1,jq1);
1075     qq12             = _mm_mul_ps(iq1,jq2);
1076     qq20             = _mm_mul_ps(iq2,jq0);
1077     qq21             = _mm_mul_ps(iq2,jq1);
1078     qq22             = _mm_mul_ps(iq2,jq2);
1079
1080     /* Avoid stupid compiler warnings */
1081     jnrA = jnrB = jnrC = jnrD = 0;
1082     j_coord_offsetA = 0;
1083     j_coord_offsetB = 0;
1084     j_coord_offsetC = 0;
1085     j_coord_offsetD = 0;
1086
1087     outeriter        = 0;
1088     inneriter        = 0;
1089
1090     for(iidx=0;iidx<4*DIM;iidx++)
1091     {
1092         scratch[iidx] = 0.0;
1093     }  
1094
1095     /* Start outer loop over neighborlists */
1096     for(iidx=0; iidx<nri; iidx++)
1097     {
1098         /* Load shift vector for this list */
1099         i_shift_offset   = DIM*shiftidx[iidx];
1100
1101         /* Load limits for loop over neighbors */
1102         j_index_start    = jindex[iidx];
1103         j_index_end      = jindex[iidx+1];
1104
1105         /* Get outer coordinate index */
1106         inr              = iinr[iidx];
1107         i_coord_offset   = DIM*inr;
1108
1109         /* Load i particle coords and add shift vector */
1110         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1111                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1112         
1113         fix0             = _mm_setzero_ps();
1114         fiy0             = _mm_setzero_ps();
1115         fiz0             = _mm_setzero_ps();
1116         fix1             = _mm_setzero_ps();
1117         fiy1             = _mm_setzero_ps();
1118         fiz1             = _mm_setzero_ps();
1119         fix2             = _mm_setzero_ps();
1120         fiy2             = _mm_setzero_ps();
1121         fiz2             = _mm_setzero_ps();
1122
1123         /* Start inner kernel loop */
1124         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1125         {
1126
1127             /* Get j neighbor index, and coordinate index */
1128             jnrA             = jjnr[jidx];
1129             jnrB             = jjnr[jidx+1];
1130             jnrC             = jjnr[jidx+2];
1131             jnrD             = jjnr[jidx+3];
1132             j_coord_offsetA  = DIM*jnrA;
1133             j_coord_offsetB  = DIM*jnrB;
1134             j_coord_offsetC  = DIM*jnrC;
1135             j_coord_offsetD  = DIM*jnrD;
1136
1137             /* load j atom coordinates */
1138             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1139                                               x+j_coord_offsetC,x+j_coord_offsetD,
1140                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1141
1142             /* Calculate displacement vector */
1143             dx00             = _mm_sub_ps(ix0,jx0);
1144             dy00             = _mm_sub_ps(iy0,jy0);
1145             dz00             = _mm_sub_ps(iz0,jz0);
1146             dx01             = _mm_sub_ps(ix0,jx1);
1147             dy01             = _mm_sub_ps(iy0,jy1);
1148             dz01             = _mm_sub_ps(iz0,jz1);
1149             dx02             = _mm_sub_ps(ix0,jx2);
1150             dy02             = _mm_sub_ps(iy0,jy2);
1151             dz02             = _mm_sub_ps(iz0,jz2);
1152             dx10             = _mm_sub_ps(ix1,jx0);
1153             dy10             = _mm_sub_ps(iy1,jy0);
1154             dz10             = _mm_sub_ps(iz1,jz0);
1155             dx11             = _mm_sub_ps(ix1,jx1);
1156             dy11             = _mm_sub_ps(iy1,jy1);
1157             dz11             = _mm_sub_ps(iz1,jz1);
1158             dx12             = _mm_sub_ps(ix1,jx2);
1159             dy12             = _mm_sub_ps(iy1,jy2);
1160             dz12             = _mm_sub_ps(iz1,jz2);
1161             dx20             = _mm_sub_ps(ix2,jx0);
1162             dy20             = _mm_sub_ps(iy2,jy0);
1163             dz20             = _mm_sub_ps(iz2,jz0);
1164             dx21             = _mm_sub_ps(ix2,jx1);
1165             dy21             = _mm_sub_ps(iy2,jy1);
1166             dz21             = _mm_sub_ps(iz2,jz1);
1167             dx22             = _mm_sub_ps(ix2,jx2);
1168             dy22             = _mm_sub_ps(iy2,jy2);
1169             dz22             = _mm_sub_ps(iz2,jz2);
1170
1171             /* Calculate squared distance and things based on it */
1172             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1173             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1174             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1175             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1176             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1177             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1178             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1179             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1180             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1181
1182             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1183             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1184             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1185             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1186             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1187             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1188             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1189             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1190             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1191
1192             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1193             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1194             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1195             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1196             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1197             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1198             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1199             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1200             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1201
1202             fjx0             = _mm_setzero_ps();
1203             fjy0             = _mm_setzero_ps();
1204             fjz0             = _mm_setzero_ps();
1205             fjx1             = _mm_setzero_ps();
1206             fjy1             = _mm_setzero_ps();
1207             fjz1             = _mm_setzero_ps();
1208             fjx2             = _mm_setzero_ps();
1209             fjy2             = _mm_setzero_ps();
1210             fjz2             = _mm_setzero_ps();
1211
1212             /**************************
1213              * CALCULATE INTERACTIONS *
1214              **************************/
1215
1216             /* REACTION-FIELD ELECTROSTATICS */
1217             felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
1218
1219             /* LENNARD-JONES DISPERSION/REPULSION */
1220
1221             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1222             fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1223
1224             fscal            = _mm_add_ps(felec,fvdw);
1225
1226             /* Calculate temporary vectorial force */
1227             tx               = _mm_mul_ps(fscal,dx00);
1228             ty               = _mm_mul_ps(fscal,dy00);
1229             tz               = _mm_mul_ps(fscal,dz00);
1230
1231             /* Update vectorial force */
1232             fix0             = _mm_add_ps(fix0,tx);
1233             fiy0             = _mm_add_ps(fiy0,ty);
1234             fiz0             = _mm_add_ps(fiz0,tz);
1235
1236             fjx0             = _mm_add_ps(fjx0,tx);
1237             fjy0             = _mm_add_ps(fjy0,ty);
1238             fjz0             = _mm_add_ps(fjz0,tz);
1239             
1240             /**************************
1241              * CALCULATE INTERACTIONS *
1242              **************************/
1243
1244             /* REACTION-FIELD ELECTROSTATICS */
1245             felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
1246
1247             fscal            = felec;
1248
1249             /* Calculate temporary vectorial force */
1250             tx               = _mm_mul_ps(fscal,dx01);
1251             ty               = _mm_mul_ps(fscal,dy01);
1252             tz               = _mm_mul_ps(fscal,dz01);
1253
1254             /* Update vectorial force */
1255             fix0             = _mm_add_ps(fix0,tx);
1256             fiy0             = _mm_add_ps(fiy0,ty);
1257             fiz0             = _mm_add_ps(fiz0,tz);
1258
1259             fjx1             = _mm_add_ps(fjx1,tx);
1260             fjy1             = _mm_add_ps(fjy1,ty);
1261             fjz1             = _mm_add_ps(fjz1,tz);
1262             
1263             /**************************
1264              * CALCULATE INTERACTIONS *
1265              **************************/
1266
1267             /* REACTION-FIELD ELECTROSTATICS */
1268             felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
1269
1270             fscal            = felec;
1271
1272             /* Calculate temporary vectorial force */
1273             tx               = _mm_mul_ps(fscal,dx02);
1274             ty               = _mm_mul_ps(fscal,dy02);
1275             tz               = _mm_mul_ps(fscal,dz02);
1276
1277             /* Update vectorial force */
1278             fix0             = _mm_add_ps(fix0,tx);
1279             fiy0             = _mm_add_ps(fiy0,ty);
1280             fiz0             = _mm_add_ps(fiz0,tz);
1281
1282             fjx2             = _mm_add_ps(fjx2,tx);
1283             fjy2             = _mm_add_ps(fjy2,ty);
1284             fjz2             = _mm_add_ps(fjz2,tz);
1285             
1286             /**************************
1287              * CALCULATE INTERACTIONS *
1288              **************************/
1289
1290             /* REACTION-FIELD ELECTROSTATICS */
1291             felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
1292
1293             fscal            = felec;
1294
1295             /* Calculate temporary vectorial force */
1296             tx               = _mm_mul_ps(fscal,dx10);
1297             ty               = _mm_mul_ps(fscal,dy10);
1298             tz               = _mm_mul_ps(fscal,dz10);
1299
1300             /* Update vectorial force */
1301             fix1             = _mm_add_ps(fix1,tx);
1302             fiy1             = _mm_add_ps(fiy1,ty);
1303             fiz1             = _mm_add_ps(fiz1,tz);
1304
1305             fjx0             = _mm_add_ps(fjx0,tx);
1306             fjy0             = _mm_add_ps(fjy0,ty);
1307             fjz0             = _mm_add_ps(fjz0,tz);
1308             
1309             /**************************
1310              * CALCULATE INTERACTIONS *
1311              **************************/
1312
1313             /* REACTION-FIELD ELECTROSTATICS */
1314             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1315
1316             fscal            = felec;
1317
1318             /* Calculate temporary vectorial force */
1319             tx               = _mm_mul_ps(fscal,dx11);
1320             ty               = _mm_mul_ps(fscal,dy11);
1321             tz               = _mm_mul_ps(fscal,dz11);
1322
1323             /* Update vectorial force */
1324             fix1             = _mm_add_ps(fix1,tx);
1325             fiy1             = _mm_add_ps(fiy1,ty);
1326             fiz1             = _mm_add_ps(fiz1,tz);
1327
1328             fjx1             = _mm_add_ps(fjx1,tx);
1329             fjy1             = _mm_add_ps(fjy1,ty);
1330             fjz1             = _mm_add_ps(fjz1,tz);
1331             
1332             /**************************
1333              * CALCULATE INTERACTIONS *
1334              **************************/
1335
1336             /* REACTION-FIELD ELECTROSTATICS */
1337             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1338
1339             fscal            = felec;
1340
1341             /* Calculate temporary vectorial force */
1342             tx               = _mm_mul_ps(fscal,dx12);
1343             ty               = _mm_mul_ps(fscal,dy12);
1344             tz               = _mm_mul_ps(fscal,dz12);
1345
1346             /* Update vectorial force */
1347             fix1             = _mm_add_ps(fix1,tx);
1348             fiy1             = _mm_add_ps(fiy1,ty);
1349             fiz1             = _mm_add_ps(fiz1,tz);
1350
1351             fjx2             = _mm_add_ps(fjx2,tx);
1352             fjy2             = _mm_add_ps(fjy2,ty);
1353             fjz2             = _mm_add_ps(fjz2,tz);
1354             
1355             /**************************
1356              * CALCULATE INTERACTIONS *
1357              **************************/
1358
1359             /* REACTION-FIELD ELECTROSTATICS */
1360             felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1361
1362             fscal            = felec;
1363
1364             /* Calculate temporary vectorial force */
1365             tx               = _mm_mul_ps(fscal,dx20);
1366             ty               = _mm_mul_ps(fscal,dy20);
1367             tz               = _mm_mul_ps(fscal,dz20);
1368
1369             /* Update vectorial force */
1370             fix2             = _mm_add_ps(fix2,tx);
1371             fiy2             = _mm_add_ps(fiy2,ty);
1372             fiz2             = _mm_add_ps(fiz2,tz);
1373
1374             fjx0             = _mm_add_ps(fjx0,tx);
1375             fjy0             = _mm_add_ps(fjy0,ty);
1376             fjz0             = _mm_add_ps(fjz0,tz);
1377             
1378             /**************************
1379              * CALCULATE INTERACTIONS *
1380              **************************/
1381
1382             /* REACTION-FIELD ELECTROSTATICS */
1383             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1384
1385             fscal            = felec;
1386
1387             /* Calculate temporary vectorial force */
1388             tx               = _mm_mul_ps(fscal,dx21);
1389             ty               = _mm_mul_ps(fscal,dy21);
1390             tz               = _mm_mul_ps(fscal,dz21);
1391
1392             /* Update vectorial force */
1393             fix2             = _mm_add_ps(fix2,tx);
1394             fiy2             = _mm_add_ps(fiy2,ty);
1395             fiz2             = _mm_add_ps(fiz2,tz);
1396
1397             fjx1             = _mm_add_ps(fjx1,tx);
1398             fjy1             = _mm_add_ps(fjy1,ty);
1399             fjz1             = _mm_add_ps(fjz1,tz);
1400             
1401             /**************************
1402              * CALCULATE INTERACTIONS *
1403              **************************/
1404
1405             /* REACTION-FIELD ELECTROSTATICS */
1406             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1407
1408             fscal            = felec;
1409
1410             /* Calculate temporary vectorial force */
1411             tx               = _mm_mul_ps(fscal,dx22);
1412             ty               = _mm_mul_ps(fscal,dy22);
1413             tz               = _mm_mul_ps(fscal,dz22);
1414
1415             /* Update vectorial force */
1416             fix2             = _mm_add_ps(fix2,tx);
1417             fiy2             = _mm_add_ps(fiy2,ty);
1418             fiz2             = _mm_add_ps(fiz2,tz);
1419
1420             fjx2             = _mm_add_ps(fjx2,tx);
1421             fjy2             = _mm_add_ps(fjy2,ty);
1422             fjz2             = _mm_add_ps(fjz2,tz);
1423             
1424             fjptrA             = f+j_coord_offsetA;
1425             fjptrB             = f+j_coord_offsetB;
1426             fjptrC             = f+j_coord_offsetC;
1427             fjptrD             = f+j_coord_offsetD;
1428
1429             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1430                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1431
1432             /* Inner loop uses 250 flops */
1433         }
1434
1435         if(jidx<j_index_end)
1436         {
1437
1438             /* Get j neighbor index, and coordinate index */
1439             jnrlistA         = jjnr[jidx];
1440             jnrlistB         = jjnr[jidx+1];
1441             jnrlistC         = jjnr[jidx+2];
1442             jnrlistD         = jjnr[jidx+3];
1443             /* Sign of each element will be negative for non-real atoms.
1444              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1445              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1446              */
1447             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1448             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1449             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1450             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1451             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1452             j_coord_offsetA  = DIM*jnrA;
1453             j_coord_offsetB  = DIM*jnrB;
1454             j_coord_offsetC  = DIM*jnrC;
1455             j_coord_offsetD  = DIM*jnrD;
1456
1457             /* load j atom coordinates */
1458             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1459                                               x+j_coord_offsetC,x+j_coord_offsetD,
1460                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1461
1462             /* Calculate displacement vector */
1463             dx00             = _mm_sub_ps(ix0,jx0);
1464             dy00             = _mm_sub_ps(iy0,jy0);
1465             dz00             = _mm_sub_ps(iz0,jz0);
1466             dx01             = _mm_sub_ps(ix0,jx1);
1467             dy01             = _mm_sub_ps(iy0,jy1);
1468             dz01             = _mm_sub_ps(iz0,jz1);
1469             dx02             = _mm_sub_ps(ix0,jx2);
1470             dy02             = _mm_sub_ps(iy0,jy2);
1471             dz02             = _mm_sub_ps(iz0,jz2);
1472             dx10             = _mm_sub_ps(ix1,jx0);
1473             dy10             = _mm_sub_ps(iy1,jy0);
1474             dz10             = _mm_sub_ps(iz1,jz0);
1475             dx11             = _mm_sub_ps(ix1,jx1);
1476             dy11             = _mm_sub_ps(iy1,jy1);
1477             dz11             = _mm_sub_ps(iz1,jz1);
1478             dx12             = _mm_sub_ps(ix1,jx2);
1479             dy12             = _mm_sub_ps(iy1,jy2);
1480             dz12             = _mm_sub_ps(iz1,jz2);
1481             dx20             = _mm_sub_ps(ix2,jx0);
1482             dy20             = _mm_sub_ps(iy2,jy0);
1483             dz20             = _mm_sub_ps(iz2,jz0);
1484             dx21             = _mm_sub_ps(ix2,jx1);
1485             dy21             = _mm_sub_ps(iy2,jy1);
1486             dz21             = _mm_sub_ps(iz2,jz1);
1487             dx22             = _mm_sub_ps(ix2,jx2);
1488             dy22             = _mm_sub_ps(iy2,jy2);
1489             dz22             = _mm_sub_ps(iz2,jz2);
1490
1491             /* Calculate squared distance and things based on it */
1492             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1493             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1494             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1495             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1496             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1497             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1498             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1499             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1500             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1501
1502             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1503             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1504             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1505             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1506             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1507             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1508             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1509             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1510             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1511
1512             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1513             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1514             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1515             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1516             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1517             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1518             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1519             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1520             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1521
1522             fjx0             = _mm_setzero_ps();
1523             fjy0             = _mm_setzero_ps();
1524             fjz0             = _mm_setzero_ps();
1525             fjx1             = _mm_setzero_ps();
1526             fjy1             = _mm_setzero_ps();
1527             fjz1             = _mm_setzero_ps();
1528             fjx2             = _mm_setzero_ps();
1529             fjy2             = _mm_setzero_ps();
1530             fjz2             = _mm_setzero_ps();
1531
1532             /**************************
1533              * CALCULATE INTERACTIONS *
1534              **************************/
1535
1536             /* REACTION-FIELD ELECTROSTATICS */
1537             felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
1538
1539             /* LENNARD-JONES DISPERSION/REPULSION */
1540
1541             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1542             fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1543
1544             fscal            = _mm_add_ps(felec,fvdw);
1545
1546             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1547
1548             /* Calculate temporary vectorial force */
1549             tx               = _mm_mul_ps(fscal,dx00);
1550             ty               = _mm_mul_ps(fscal,dy00);
1551             tz               = _mm_mul_ps(fscal,dz00);
1552
1553             /* Update vectorial force */
1554             fix0             = _mm_add_ps(fix0,tx);
1555             fiy0             = _mm_add_ps(fiy0,ty);
1556             fiz0             = _mm_add_ps(fiz0,tz);
1557
1558             fjx0             = _mm_add_ps(fjx0,tx);
1559             fjy0             = _mm_add_ps(fjy0,ty);
1560             fjz0             = _mm_add_ps(fjz0,tz);
1561             
1562             /**************************
1563              * CALCULATE INTERACTIONS *
1564              **************************/
1565
1566             /* REACTION-FIELD ELECTROSTATICS */
1567             felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
1568
1569             fscal            = felec;
1570
1571             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1572
1573             /* Calculate temporary vectorial force */
1574             tx               = _mm_mul_ps(fscal,dx01);
1575             ty               = _mm_mul_ps(fscal,dy01);
1576             tz               = _mm_mul_ps(fscal,dz01);
1577
1578             /* Update vectorial force */
1579             fix0             = _mm_add_ps(fix0,tx);
1580             fiy0             = _mm_add_ps(fiy0,ty);
1581             fiz0             = _mm_add_ps(fiz0,tz);
1582
1583             fjx1             = _mm_add_ps(fjx1,tx);
1584             fjy1             = _mm_add_ps(fjy1,ty);
1585             fjz1             = _mm_add_ps(fjz1,tz);
1586             
1587             /**************************
1588              * CALCULATE INTERACTIONS *
1589              **************************/
1590
1591             /* REACTION-FIELD ELECTROSTATICS */
1592             felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
1593
1594             fscal            = felec;
1595
1596             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1597
1598             /* Calculate temporary vectorial force */
1599             tx               = _mm_mul_ps(fscal,dx02);
1600             ty               = _mm_mul_ps(fscal,dy02);
1601             tz               = _mm_mul_ps(fscal,dz02);
1602
1603             /* Update vectorial force */
1604             fix0             = _mm_add_ps(fix0,tx);
1605             fiy0             = _mm_add_ps(fiy0,ty);
1606             fiz0             = _mm_add_ps(fiz0,tz);
1607
1608             fjx2             = _mm_add_ps(fjx2,tx);
1609             fjy2             = _mm_add_ps(fjy2,ty);
1610             fjz2             = _mm_add_ps(fjz2,tz);
1611             
1612             /**************************
1613              * CALCULATE INTERACTIONS *
1614              **************************/
1615
1616             /* REACTION-FIELD ELECTROSTATICS */
1617             felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
1618
1619             fscal            = felec;
1620
1621             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1622
1623             /* Calculate temporary vectorial force */
1624             tx               = _mm_mul_ps(fscal,dx10);
1625             ty               = _mm_mul_ps(fscal,dy10);
1626             tz               = _mm_mul_ps(fscal,dz10);
1627
1628             /* Update vectorial force */
1629             fix1             = _mm_add_ps(fix1,tx);
1630             fiy1             = _mm_add_ps(fiy1,ty);
1631             fiz1             = _mm_add_ps(fiz1,tz);
1632
1633             fjx0             = _mm_add_ps(fjx0,tx);
1634             fjy0             = _mm_add_ps(fjy0,ty);
1635             fjz0             = _mm_add_ps(fjz0,tz);
1636             
1637             /**************************
1638              * CALCULATE INTERACTIONS *
1639              **************************/
1640
1641             /* REACTION-FIELD ELECTROSTATICS */
1642             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1643
1644             fscal            = felec;
1645
1646             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1647
1648             /* Calculate temporary vectorial force */
1649             tx               = _mm_mul_ps(fscal,dx11);
1650             ty               = _mm_mul_ps(fscal,dy11);
1651             tz               = _mm_mul_ps(fscal,dz11);
1652
1653             /* Update vectorial force */
1654             fix1             = _mm_add_ps(fix1,tx);
1655             fiy1             = _mm_add_ps(fiy1,ty);
1656             fiz1             = _mm_add_ps(fiz1,tz);
1657
1658             fjx1             = _mm_add_ps(fjx1,tx);
1659             fjy1             = _mm_add_ps(fjy1,ty);
1660             fjz1             = _mm_add_ps(fjz1,tz);
1661             
1662             /**************************
1663              * CALCULATE INTERACTIONS *
1664              **************************/
1665
1666             /* REACTION-FIELD ELECTROSTATICS */
1667             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1668
1669             fscal            = felec;
1670
1671             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1672
1673             /* Calculate temporary vectorial force */
1674             tx               = _mm_mul_ps(fscal,dx12);
1675             ty               = _mm_mul_ps(fscal,dy12);
1676             tz               = _mm_mul_ps(fscal,dz12);
1677
1678             /* Update vectorial force */
1679             fix1             = _mm_add_ps(fix1,tx);
1680             fiy1             = _mm_add_ps(fiy1,ty);
1681             fiz1             = _mm_add_ps(fiz1,tz);
1682
1683             fjx2             = _mm_add_ps(fjx2,tx);
1684             fjy2             = _mm_add_ps(fjy2,ty);
1685             fjz2             = _mm_add_ps(fjz2,tz);
1686             
1687             /**************************
1688              * CALCULATE INTERACTIONS *
1689              **************************/
1690
1691             /* REACTION-FIELD ELECTROSTATICS */
1692             felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1693
1694             fscal            = felec;
1695
1696             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1697
1698             /* Calculate temporary vectorial force */
1699             tx               = _mm_mul_ps(fscal,dx20);
1700             ty               = _mm_mul_ps(fscal,dy20);
1701             tz               = _mm_mul_ps(fscal,dz20);
1702
1703             /* Update vectorial force */
1704             fix2             = _mm_add_ps(fix2,tx);
1705             fiy2             = _mm_add_ps(fiy2,ty);
1706             fiz2             = _mm_add_ps(fiz2,tz);
1707
1708             fjx0             = _mm_add_ps(fjx0,tx);
1709             fjy0             = _mm_add_ps(fjy0,ty);
1710             fjz0             = _mm_add_ps(fjz0,tz);
1711             
1712             /**************************
1713              * CALCULATE INTERACTIONS *
1714              **************************/
1715
1716             /* REACTION-FIELD ELECTROSTATICS */
1717             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1718
1719             fscal            = felec;
1720
1721             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1722
1723             /* Calculate temporary vectorial force */
1724             tx               = _mm_mul_ps(fscal,dx21);
1725             ty               = _mm_mul_ps(fscal,dy21);
1726             tz               = _mm_mul_ps(fscal,dz21);
1727
1728             /* Update vectorial force */
1729             fix2             = _mm_add_ps(fix2,tx);
1730             fiy2             = _mm_add_ps(fiy2,ty);
1731             fiz2             = _mm_add_ps(fiz2,tz);
1732
1733             fjx1             = _mm_add_ps(fjx1,tx);
1734             fjy1             = _mm_add_ps(fjy1,ty);
1735             fjz1             = _mm_add_ps(fjz1,tz);
1736             
1737             /**************************
1738              * CALCULATE INTERACTIONS *
1739              **************************/
1740
1741             /* REACTION-FIELD ELECTROSTATICS */
1742             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1743
1744             fscal            = felec;
1745
1746             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1747
1748             /* Calculate temporary vectorial force */
1749             tx               = _mm_mul_ps(fscal,dx22);
1750             ty               = _mm_mul_ps(fscal,dy22);
1751             tz               = _mm_mul_ps(fscal,dz22);
1752
1753             /* Update vectorial force */
1754             fix2             = _mm_add_ps(fix2,tx);
1755             fiy2             = _mm_add_ps(fiy2,ty);
1756             fiz2             = _mm_add_ps(fiz2,tz);
1757
1758             fjx2             = _mm_add_ps(fjx2,tx);
1759             fjy2             = _mm_add_ps(fjy2,ty);
1760             fjz2             = _mm_add_ps(fjz2,tz);
1761             
1762             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1763             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1764             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1765             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1766
1767             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1768                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1769
1770             /* Inner loop uses 250 flops */
1771         }
1772
1773         /* End of innermost loop */
1774
1775         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1776                                               f+i_coord_offset,fshift+i_shift_offset);
1777
1778         /* Increment number of inner iterations */
1779         inneriter                  += j_index_end - j_index_start;
1780
1781         /* Outer loop uses 18 flops */
1782     }
1783
1784     /* Increment number of outer iterations */
1785     outeriter        += nri;
1786
1787     /* Update outer/inner flops */
1788
1789     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*250);
1790 }