Merge release-5-0 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecRF_VdwLJ_GeomW4W4_sse2_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse2_single kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/legacyheaders/nrnb.h"
48
49 #include "gromacs/simd/math_x86_sse2_single.h"
50 #include "kernelutil_x86_sse2_single.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sse2_single
54  * Electrostatics interaction: ReactionField
55  * VdW interaction:            LennardJones
56  * Geometry:                   Water4-Water4
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sse2_single
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
70      * just 0 for non-waters.
71      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB,jnrC,jnrD;
77     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
80     real             rcutoff_scalar;
81     real             *shiftvec,*fshift,*x,*f;
82     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83     real             scratch[4*DIM];
84     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85     int              vdwioffset0;
86     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87     int              vdwioffset1;
88     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89     int              vdwioffset2;
90     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91     int              vdwioffset3;
92     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
93     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
94     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
95     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
96     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
97     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
98     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
99     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
100     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
101     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
102     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
103     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
104     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
105     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
106     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
107     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
108     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
109     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
110     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
111     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
112     real             *charge;
113     int              nvdwtype;
114     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
115     int              *vdwtype;
116     real             *vdwparam;
117     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
118     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
119     __m128           dummy_mask,cutoff_mask;
120     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
121     __m128           one     = _mm_set1_ps(1.0);
122     __m128           two     = _mm_set1_ps(2.0);
123     x                = xx[0];
124     f                = ff[0];
125
126     nri              = nlist->nri;
127     iinr             = nlist->iinr;
128     jindex           = nlist->jindex;
129     jjnr             = nlist->jjnr;
130     shiftidx         = nlist->shift;
131     gid              = nlist->gid;
132     shiftvec         = fr->shift_vec[0];
133     fshift           = fr->fshift[0];
134     facel            = _mm_set1_ps(fr->epsfac);
135     charge           = mdatoms->chargeA;
136     krf              = _mm_set1_ps(fr->ic->k_rf);
137     krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
138     crf              = _mm_set1_ps(fr->ic->c_rf);
139     nvdwtype         = fr->ntype;
140     vdwparam         = fr->nbfp;
141     vdwtype          = mdatoms->typeA;
142
143     /* Setup water-specific parameters */
144     inr              = nlist->iinr[0];
145     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
146     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
147     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
148     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
149
150     jq1              = _mm_set1_ps(charge[inr+1]);
151     jq2              = _mm_set1_ps(charge[inr+2]);
152     jq3              = _mm_set1_ps(charge[inr+3]);
153     vdwjidx0A        = 2*vdwtype[inr+0];
154     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
155     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
156     qq11             = _mm_mul_ps(iq1,jq1);
157     qq12             = _mm_mul_ps(iq1,jq2);
158     qq13             = _mm_mul_ps(iq1,jq3);
159     qq21             = _mm_mul_ps(iq2,jq1);
160     qq22             = _mm_mul_ps(iq2,jq2);
161     qq23             = _mm_mul_ps(iq2,jq3);
162     qq31             = _mm_mul_ps(iq3,jq1);
163     qq32             = _mm_mul_ps(iq3,jq2);
164     qq33             = _mm_mul_ps(iq3,jq3);
165
166     /* Avoid stupid compiler warnings */
167     jnrA = jnrB = jnrC = jnrD = 0;
168     j_coord_offsetA = 0;
169     j_coord_offsetB = 0;
170     j_coord_offsetC = 0;
171     j_coord_offsetD = 0;
172
173     outeriter        = 0;
174     inneriter        = 0;
175
176     for(iidx=0;iidx<4*DIM;iidx++)
177     {
178         scratch[iidx] = 0.0;
179     }  
180
181     /* Start outer loop over neighborlists */
182     for(iidx=0; iidx<nri; iidx++)
183     {
184         /* Load shift vector for this list */
185         i_shift_offset   = DIM*shiftidx[iidx];
186
187         /* Load limits for loop over neighbors */
188         j_index_start    = jindex[iidx];
189         j_index_end      = jindex[iidx+1];
190
191         /* Get outer coordinate index */
192         inr              = iinr[iidx];
193         i_coord_offset   = DIM*inr;
194
195         /* Load i particle coords and add shift vector */
196         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
197                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
198         
199         fix0             = _mm_setzero_ps();
200         fiy0             = _mm_setzero_ps();
201         fiz0             = _mm_setzero_ps();
202         fix1             = _mm_setzero_ps();
203         fiy1             = _mm_setzero_ps();
204         fiz1             = _mm_setzero_ps();
205         fix2             = _mm_setzero_ps();
206         fiy2             = _mm_setzero_ps();
207         fiz2             = _mm_setzero_ps();
208         fix3             = _mm_setzero_ps();
209         fiy3             = _mm_setzero_ps();
210         fiz3             = _mm_setzero_ps();
211
212         /* Reset potential sums */
213         velecsum         = _mm_setzero_ps();
214         vvdwsum          = _mm_setzero_ps();
215
216         /* Start inner kernel loop */
217         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
218         {
219
220             /* Get j neighbor index, and coordinate index */
221             jnrA             = jjnr[jidx];
222             jnrB             = jjnr[jidx+1];
223             jnrC             = jjnr[jidx+2];
224             jnrD             = jjnr[jidx+3];
225             j_coord_offsetA  = DIM*jnrA;
226             j_coord_offsetB  = DIM*jnrB;
227             j_coord_offsetC  = DIM*jnrC;
228             j_coord_offsetD  = DIM*jnrD;
229
230             /* load j atom coordinates */
231             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
232                                               x+j_coord_offsetC,x+j_coord_offsetD,
233                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
234                                               &jy2,&jz2,&jx3,&jy3,&jz3);
235
236             /* Calculate displacement vector */
237             dx00             = _mm_sub_ps(ix0,jx0);
238             dy00             = _mm_sub_ps(iy0,jy0);
239             dz00             = _mm_sub_ps(iz0,jz0);
240             dx11             = _mm_sub_ps(ix1,jx1);
241             dy11             = _mm_sub_ps(iy1,jy1);
242             dz11             = _mm_sub_ps(iz1,jz1);
243             dx12             = _mm_sub_ps(ix1,jx2);
244             dy12             = _mm_sub_ps(iy1,jy2);
245             dz12             = _mm_sub_ps(iz1,jz2);
246             dx13             = _mm_sub_ps(ix1,jx3);
247             dy13             = _mm_sub_ps(iy1,jy3);
248             dz13             = _mm_sub_ps(iz1,jz3);
249             dx21             = _mm_sub_ps(ix2,jx1);
250             dy21             = _mm_sub_ps(iy2,jy1);
251             dz21             = _mm_sub_ps(iz2,jz1);
252             dx22             = _mm_sub_ps(ix2,jx2);
253             dy22             = _mm_sub_ps(iy2,jy2);
254             dz22             = _mm_sub_ps(iz2,jz2);
255             dx23             = _mm_sub_ps(ix2,jx3);
256             dy23             = _mm_sub_ps(iy2,jy3);
257             dz23             = _mm_sub_ps(iz2,jz3);
258             dx31             = _mm_sub_ps(ix3,jx1);
259             dy31             = _mm_sub_ps(iy3,jy1);
260             dz31             = _mm_sub_ps(iz3,jz1);
261             dx32             = _mm_sub_ps(ix3,jx2);
262             dy32             = _mm_sub_ps(iy3,jy2);
263             dz32             = _mm_sub_ps(iz3,jz2);
264             dx33             = _mm_sub_ps(ix3,jx3);
265             dy33             = _mm_sub_ps(iy3,jy3);
266             dz33             = _mm_sub_ps(iz3,jz3);
267
268             /* Calculate squared distance and things based on it */
269             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
270             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
271             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
272             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
273             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
274             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
275             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
276             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
277             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
278             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
279
280             rinv11           = gmx_mm_invsqrt_ps(rsq11);
281             rinv12           = gmx_mm_invsqrt_ps(rsq12);
282             rinv13           = gmx_mm_invsqrt_ps(rsq13);
283             rinv21           = gmx_mm_invsqrt_ps(rsq21);
284             rinv22           = gmx_mm_invsqrt_ps(rsq22);
285             rinv23           = gmx_mm_invsqrt_ps(rsq23);
286             rinv31           = gmx_mm_invsqrt_ps(rsq31);
287             rinv32           = gmx_mm_invsqrt_ps(rsq32);
288             rinv33           = gmx_mm_invsqrt_ps(rsq33);
289
290             rinvsq00         = gmx_mm_inv_ps(rsq00);
291             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
292             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
293             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
294             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
295             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
296             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
297             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
298             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
299             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
300
301             fjx0             = _mm_setzero_ps();
302             fjy0             = _mm_setzero_ps();
303             fjz0             = _mm_setzero_ps();
304             fjx1             = _mm_setzero_ps();
305             fjy1             = _mm_setzero_ps();
306             fjz1             = _mm_setzero_ps();
307             fjx2             = _mm_setzero_ps();
308             fjy2             = _mm_setzero_ps();
309             fjz2             = _mm_setzero_ps();
310             fjx3             = _mm_setzero_ps();
311             fjy3             = _mm_setzero_ps();
312             fjz3             = _mm_setzero_ps();
313
314             /**************************
315              * CALCULATE INTERACTIONS *
316              **************************/
317
318             /* LENNARD-JONES DISPERSION/REPULSION */
319
320             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
321             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
322             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
323             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
324             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
325
326             /* Update potential sum for this i atom from the interaction with this j atom. */
327             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
328
329             fscal            = fvdw;
330
331             /* Calculate temporary vectorial force */
332             tx               = _mm_mul_ps(fscal,dx00);
333             ty               = _mm_mul_ps(fscal,dy00);
334             tz               = _mm_mul_ps(fscal,dz00);
335
336             /* Update vectorial force */
337             fix0             = _mm_add_ps(fix0,tx);
338             fiy0             = _mm_add_ps(fiy0,ty);
339             fiz0             = _mm_add_ps(fiz0,tz);
340
341             fjx0             = _mm_add_ps(fjx0,tx);
342             fjy0             = _mm_add_ps(fjy0,ty);
343             fjz0             = _mm_add_ps(fjz0,tz);
344             
345             /**************************
346              * CALCULATE INTERACTIONS *
347              **************************/
348
349             /* REACTION-FIELD ELECTROSTATICS */
350             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
351             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
352
353             /* Update potential sum for this i atom from the interaction with this j atom. */
354             velecsum         = _mm_add_ps(velecsum,velec);
355
356             fscal            = felec;
357
358             /* Calculate temporary vectorial force */
359             tx               = _mm_mul_ps(fscal,dx11);
360             ty               = _mm_mul_ps(fscal,dy11);
361             tz               = _mm_mul_ps(fscal,dz11);
362
363             /* Update vectorial force */
364             fix1             = _mm_add_ps(fix1,tx);
365             fiy1             = _mm_add_ps(fiy1,ty);
366             fiz1             = _mm_add_ps(fiz1,tz);
367
368             fjx1             = _mm_add_ps(fjx1,tx);
369             fjy1             = _mm_add_ps(fjy1,ty);
370             fjz1             = _mm_add_ps(fjz1,tz);
371             
372             /**************************
373              * CALCULATE INTERACTIONS *
374              **************************/
375
376             /* REACTION-FIELD ELECTROSTATICS */
377             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
378             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
379
380             /* Update potential sum for this i atom from the interaction with this j atom. */
381             velecsum         = _mm_add_ps(velecsum,velec);
382
383             fscal            = felec;
384
385             /* Calculate temporary vectorial force */
386             tx               = _mm_mul_ps(fscal,dx12);
387             ty               = _mm_mul_ps(fscal,dy12);
388             tz               = _mm_mul_ps(fscal,dz12);
389
390             /* Update vectorial force */
391             fix1             = _mm_add_ps(fix1,tx);
392             fiy1             = _mm_add_ps(fiy1,ty);
393             fiz1             = _mm_add_ps(fiz1,tz);
394
395             fjx2             = _mm_add_ps(fjx2,tx);
396             fjy2             = _mm_add_ps(fjy2,ty);
397             fjz2             = _mm_add_ps(fjz2,tz);
398             
399             /**************************
400              * CALCULATE INTERACTIONS *
401              **************************/
402
403             /* REACTION-FIELD ELECTROSTATICS */
404             velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
405             felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
406
407             /* Update potential sum for this i atom from the interaction with this j atom. */
408             velecsum         = _mm_add_ps(velecsum,velec);
409
410             fscal            = felec;
411
412             /* Calculate temporary vectorial force */
413             tx               = _mm_mul_ps(fscal,dx13);
414             ty               = _mm_mul_ps(fscal,dy13);
415             tz               = _mm_mul_ps(fscal,dz13);
416
417             /* Update vectorial force */
418             fix1             = _mm_add_ps(fix1,tx);
419             fiy1             = _mm_add_ps(fiy1,ty);
420             fiz1             = _mm_add_ps(fiz1,tz);
421
422             fjx3             = _mm_add_ps(fjx3,tx);
423             fjy3             = _mm_add_ps(fjy3,ty);
424             fjz3             = _mm_add_ps(fjz3,tz);
425             
426             /**************************
427              * CALCULATE INTERACTIONS *
428              **************************/
429
430             /* REACTION-FIELD ELECTROSTATICS */
431             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
432             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
433
434             /* Update potential sum for this i atom from the interaction with this j atom. */
435             velecsum         = _mm_add_ps(velecsum,velec);
436
437             fscal            = felec;
438
439             /* Calculate temporary vectorial force */
440             tx               = _mm_mul_ps(fscal,dx21);
441             ty               = _mm_mul_ps(fscal,dy21);
442             tz               = _mm_mul_ps(fscal,dz21);
443
444             /* Update vectorial force */
445             fix2             = _mm_add_ps(fix2,tx);
446             fiy2             = _mm_add_ps(fiy2,ty);
447             fiz2             = _mm_add_ps(fiz2,tz);
448
449             fjx1             = _mm_add_ps(fjx1,tx);
450             fjy1             = _mm_add_ps(fjy1,ty);
451             fjz1             = _mm_add_ps(fjz1,tz);
452             
453             /**************************
454              * CALCULATE INTERACTIONS *
455              **************************/
456
457             /* REACTION-FIELD ELECTROSTATICS */
458             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
459             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
460
461             /* Update potential sum for this i atom from the interaction with this j atom. */
462             velecsum         = _mm_add_ps(velecsum,velec);
463
464             fscal            = felec;
465
466             /* Calculate temporary vectorial force */
467             tx               = _mm_mul_ps(fscal,dx22);
468             ty               = _mm_mul_ps(fscal,dy22);
469             tz               = _mm_mul_ps(fscal,dz22);
470
471             /* Update vectorial force */
472             fix2             = _mm_add_ps(fix2,tx);
473             fiy2             = _mm_add_ps(fiy2,ty);
474             fiz2             = _mm_add_ps(fiz2,tz);
475
476             fjx2             = _mm_add_ps(fjx2,tx);
477             fjy2             = _mm_add_ps(fjy2,ty);
478             fjz2             = _mm_add_ps(fjz2,tz);
479             
480             /**************************
481              * CALCULATE INTERACTIONS *
482              **************************/
483
484             /* REACTION-FIELD ELECTROSTATICS */
485             velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
486             felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
487
488             /* Update potential sum for this i atom from the interaction with this j atom. */
489             velecsum         = _mm_add_ps(velecsum,velec);
490
491             fscal            = felec;
492
493             /* Calculate temporary vectorial force */
494             tx               = _mm_mul_ps(fscal,dx23);
495             ty               = _mm_mul_ps(fscal,dy23);
496             tz               = _mm_mul_ps(fscal,dz23);
497
498             /* Update vectorial force */
499             fix2             = _mm_add_ps(fix2,tx);
500             fiy2             = _mm_add_ps(fiy2,ty);
501             fiz2             = _mm_add_ps(fiz2,tz);
502
503             fjx3             = _mm_add_ps(fjx3,tx);
504             fjy3             = _mm_add_ps(fjy3,ty);
505             fjz3             = _mm_add_ps(fjz3,tz);
506             
507             /**************************
508              * CALCULATE INTERACTIONS *
509              **************************/
510
511             /* REACTION-FIELD ELECTROSTATICS */
512             velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
513             felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
514
515             /* Update potential sum for this i atom from the interaction with this j atom. */
516             velecsum         = _mm_add_ps(velecsum,velec);
517
518             fscal            = felec;
519
520             /* Calculate temporary vectorial force */
521             tx               = _mm_mul_ps(fscal,dx31);
522             ty               = _mm_mul_ps(fscal,dy31);
523             tz               = _mm_mul_ps(fscal,dz31);
524
525             /* Update vectorial force */
526             fix3             = _mm_add_ps(fix3,tx);
527             fiy3             = _mm_add_ps(fiy3,ty);
528             fiz3             = _mm_add_ps(fiz3,tz);
529
530             fjx1             = _mm_add_ps(fjx1,tx);
531             fjy1             = _mm_add_ps(fjy1,ty);
532             fjz1             = _mm_add_ps(fjz1,tz);
533             
534             /**************************
535              * CALCULATE INTERACTIONS *
536              **************************/
537
538             /* REACTION-FIELD ELECTROSTATICS */
539             velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
540             felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
541
542             /* Update potential sum for this i atom from the interaction with this j atom. */
543             velecsum         = _mm_add_ps(velecsum,velec);
544
545             fscal            = felec;
546
547             /* Calculate temporary vectorial force */
548             tx               = _mm_mul_ps(fscal,dx32);
549             ty               = _mm_mul_ps(fscal,dy32);
550             tz               = _mm_mul_ps(fscal,dz32);
551
552             /* Update vectorial force */
553             fix3             = _mm_add_ps(fix3,tx);
554             fiy3             = _mm_add_ps(fiy3,ty);
555             fiz3             = _mm_add_ps(fiz3,tz);
556
557             fjx2             = _mm_add_ps(fjx2,tx);
558             fjy2             = _mm_add_ps(fjy2,ty);
559             fjz2             = _mm_add_ps(fjz2,tz);
560             
561             /**************************
562              * CALCULATE INTERACTIONS *
563              **************************/
564
565             /* REACTION-FIELD ELECTROSTATICS */
566             velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
567             felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
568
569             /* Update potential sum for this i atom from the interaction with this j atom. */
570             velecsum         = _mm_add_ps(velecsum,velec);
571
572             fscal            = felec;
573
574             /* Calculate temporary vectorial force */
575             tx               = _mm_mul_ps(fscal,dx33);
576             ty               = _mm_mul_ps(fscal,dy33);
577             tz               = _mm_mul_ps(fscal,dz33);
578
579             /* Update vectorial force */
580             fix3             = _mm_add_ps(fix3,tx);
581             fiy3             = _mm_add_ps(fiy3,ty);
582             fiz3             = _mm_add_ps(fiz3,tz);
583
584             fjx3             = _mm_add_ps(fjx3,tx);
585             fjy3             = _mm_add_ps(fjy3,ty);
586             fjz3             = _mm_add_ps(fjz3,tz);
587             
588             fjptrA             = f+j_coord_offsetA;
589             fjptrB             = f+j_coord_offsetB;
590             fjptrC             = f+j_coord_offsetC;
591             fjptrD             = f+j_coord_offsetD;
592
593             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
594                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
595                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
596
597             /* Inner loop uses 323 flops */
598         }
599
600         if(jidx<j_index_end)
601         {
602
603             /* Get j neighbor index, and coordinate index */
604             jnrlistA         = jjnr[jidx];
605             jnrlistB         = jjnr[jidx+1];
606             jnrlistC         = jjnr[jidx+2];
607             jnrlistD         = jjnr[jidx+3];
608             /* Sign of each element will be negative for non-real atoms.
609              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
610              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
611              */
612             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
613             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
614             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
615             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
616             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
617             j_coord_offsetA  = DIM*jnrA;
618             j_coord_offsetB  = DIM*jnrB;
619             j_coord_offsetC  = DIM*jnrC;
620             j_coord_offsetD  = DIM*jnrD;
621
622             /* load j atom coordinates */
623             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
624                                               x+j_coord_offsetC,x+j_coord_offsetD,
625                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
626                                               &jy2,&jz2,&jx3,&jy3,&jz3);
627
628             /* Calculate displacement vector */
629             dx00             = _mm_sub_ps(ix0,jx0);
630             dy00             = _mm_sub_ps(iy0,jy0);
631             dz00             = _mm_sub_ps(iz0,jz0);
632             dx11             = _mm_sub_ps(ix1,jx1);
633             dy11             = _mm_sub_ps(iy1,jy1);
634             dz11             = _mm_sub_ps(iz1,jz1);
635             dx12             = _mm_sub_ps(ix1,jx2);
636             dy12             = _mm_sub_ps(iy1,jy2);
637             dz12             = _mm_sub_ps(iz1,jz2);
638             dx13             = _mm_sub_ps(ix1,jx3);
639             dy13             = _mm_sub_ps(iy1,jy3);
640             dz13             = _mm_sub_ps(iz1,jz3);
641             dx21             = _mm_sub_ps(ix2,jx1);
642             dy21             = _mm_sub_ps(iy2,jy1);
643             dz21             = _mm_sub_ps(iz2,jz1);
644             dx22             = _mm_sub_ps(ix2,jx2);
645             dy22             = _mm_sub_ps(iy2,jy2);
646             dz22             = _mm_sub_ps(iz2,jz2);
647             dx23             = _mm_sub_ps(ix2,jx3);
648             dy23             = _mm_sub_ps(iy2,jy3);
649             dz23             = _mm_sub_ps(iz2,jz3);
650             dx31             = _mm_sub_ps(ix3,jx1);
651             dy31             = _mm_sub_ps(iy3,jy1);
652             dz31             = _mm_sub_ps(iz3,jz1);
653             dx32             = _mm_sub_ps(ix3,jx2);
654             dy32             = _mm_sub_ps(iy3,jy2);
655             dz32             = _mm_sub_ps(iz3,jz2);
656             dx33             = _mm_sub_ps(ix3,jx3);
657             dy33             = _mm_sub_ps(iy3,jy3);
658             dz33             = _mm_sub_ps(iz3,jz3);
659
660             /* Calculate squared distance and things based on it */
661             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
662             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
663             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
664             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
665             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
666             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
667             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
668             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
669             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
670             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
671
672             rinv11           = gmx_mm_invsqrt_ps(rsq11);
673             rinv12           = gmx_mm_invsqrt_ps(rsq12);
674             rinv13           = gmx_mm_invsqrt_ps(rsq13);
675             rinv21           = gmx_mm_invsqrt_ps(rsq21);
676             rinv22           = gmx_mm_invsqrt_ps(rsq22);
677             rinv23           = gmx_mm_invsqrt_ps(rsq23);
678             rinv31           = gmx_mm_invsqrt_ps(rsq31);
679             rinv32           = gmx_mm_invsqrt_ps(rsq32);
680             rinv33           = gmx_mm_invsqrt_ps(rsq33);
681
682             rinvsq00         = gmx_mm_inv_ps(rsq00);
683             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
684             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
685             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
686             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
687             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
688             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
689             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
690             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
691             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
692
693             fjx0             = _mm_setzero_ps();
694             fjy0             = _mm_setzero_ps();
695             fjz0             = _mm_setzero_ps();
696             fjx1             = _mm_setzero_ps();
697             fjy1             = _mm_setzero_ps();
698             fjz1             = _mm_setzero_ps();
699             fjx2             = _mm_setzero_ps();
700             fjy2             = _mm_setzero_ps();
701             fjz2             = _mm_setzero_ps();
702             fjx3             = _mm_setzero_ps();
703             fjy3             = _mm_setzero_ps();
704             fjz3             = _mm_setzero_ps();
705
706             /**************************
707              * CALCULATE INTERACTIONS *
708              **************************/
709
710             /* LENNARD-JONES DISPERSION/REPULSION */
711
712             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
713             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
714             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
715             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
716             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
717
718             /* Update potential sum for this i atom from the interaction with this j atom. */
719             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
720             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
721
722             fscal            = fvdw;
723
724             fscal            = _mm_andnot_ps(dummy_mask,fscal);
725
726             /* Calculate temporary vectorial force */
727             tx               = _mm_mul_ps(fscal,dx00);
728             ty               = _mm_mul_ps(fscal,dy00);
729             tz               = _mm_mul_ps(fscal,dz00);
730
731             /* Update vectorial force */
732             fix0             = _mm_add_ps(fix0,tx);
733             fiy0             = _mm_add_ps(fiy0,ty);
734             fiz0             = _mm_add_ps(fiz0,tz);
735
736             fjx0             = _mm_add_ps(fjx0,tx);
737             fjy0             = _mm_add_ps(fjy0,ty);
738             fjz0             = _mm_add_ps(fjz0,tz);
739             
740             /**************************
741              * CALCULATE INTERACTIONS *
742              **************************/
743
744             /* REACTION-FIELD ELECTROSTATICS */
745             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
746             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
747
748             /* Update potential sum for this i atom from the interaction with this j atom. */
749             velec            = _mm_andnot_ps(dummy_mask,velec);
750             velecsum         = _mm_add_ps(velecsum,velec);
751
752             fscal            = felec;
753
754             fscal            = _mm_andnot_ps(dummy_mask,fscal);
755
756             /* Calculate temporary vectorial force */
757             tx               = _mm_mul_ps(fscal,dx11);
758             ty               = _mm_mul_ps(fscal,dy11);
759             tz               = _mm_mul_ps(fscal,dz11);
760
761             /* Update vectorial force */
762             fix1             = _mm_add_ps(fix1,tx);
763             fiy1             = _mm_add_ps(fiy1,ty);
764             fiz1             = _mm_add_ps(fiz1,tz);
765
766             fjx1             = _mm_add_ps(fjx1,tx);
767             fjy1             = _mm_add_ps(fjy1,ty);
768             fjz1             = _mm_add_ps(fjz1,tz);
769             
770             /**************************
771              * CALCULATE INTERACTIONS *
772              **************************/
773
774             /* REACTION-FIELD ELECTROSTATICS */
775             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
776             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
777
778             /* Update potential sum for this i atom from the interaction with this j atom. */
779             velec            = _mm_andnot_ps(dummy_mask,velec);
780             velecsum         = _mm_add_ps(velecsum,velec);
781
782             fscal            = felec;
783
784             fscal            = _mm_andnot_ps(dummy_mask,fscal);
785
786             /* Calculate temporary vectorial force */
787             tx               = _mm_mul_ps(fscal,dx12);
788             ty               = _mm_mul_ps(fscal,dy12);
789             tz               = _mm_mul_ps(fscal,dz12);
790
791             /* Update vectorial force */
792             fix1             = _mm_add_ps(fix1,tx);
793             fiy1             = _mm_add_ps(fiy1,ty);
794             fiz1             = _mm_add_ps(fiz1,tz);
795
796             fjx2             = _mm_add_ps(fjx2,tx);
797             fjy2             = _mm_add_ps(fjy2,ty);
798             fjz2             = _mm_add_ps(fjz2,tz);
799             
800             /**************************
801              * CALCULATE INTERACTIONS *
802              **************************/
803
804             /* REACTION-FIELD ELECTROSTATICS */
805             velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
806             felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
807
808             /* Update potential sum for this i atom from the interaction with this j atom. */
809             velec            = _mm_andnot_ps(dummy_mask,velec);
810             velecsum         = _mm_add_ps(velecsum,velec);
811
812             fscal            = felec;
813
814             fscal            = _mm_andnot_ps(dummy_mask,fscal);
815
816             /* Calculate temporary vectorial force */
817             tx               = _mm_mul_ps(fscal,dx13);
818             ty               = _mm_mul_ps(fscal,dy13);
819             tz               = _mm_mul_ps(fscal,dz13);
820
821             /* Update vectorial force */
822             fix1             = _mm_add_ps(fix1,tx);
823             fiy1             = _mm_add_ps(fiy1,ty);
824             fiz1             = _mm_add_ps(fiz1,tz);
825
826             fjx3             = _mm_add_ps(fjx3,tx);
827             fjy3             = _mm_add_ps(fjy3,ty);
828             fjz3             = _mm_add_ps(fjz3,tz);
829             
830             /**************************
831              * CALCULATE INTERACTIONS *
832              **************************/
833
834             /* REACTION-FIELD ELECTROSTATICS */
835             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
836             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
837
838             /* Update potential sum for this i atom from the interaction with this j atom. */
839             velec            = _mm_andnot_ps(dummy_mask,velec);
840             velecsum         = _mm_add_ps(velecsum,velec);
841
842             fscal            = felec;
843
844             fscal            = _mm_andnot_ps(dummy_mask,fscal);
845
846             /* Calculate temporary vectorial force */
847             tx               = _mm_mul_ps(fscal,dx21);
848             ty               = _mm_mul_ps(fscal,dy21);
849             tz               = _mm_mul_ps(fscal,dz21);
850
851             /* Update vectorial force */
852             fix2             = _mm_add_ps(fix2,tx);
853             fiy2             = _mm_add_ps(fiy2,ty);
854             fiz2             = _mm_add_ps(fiz2,tz);
855
856             fjx1             = _mm_add_ps(fjx1,tx);
857             fjy1             = _mm_add_ps(fjy1,ty);
858             fjz1             = _mm_add_ps(fjz1,tz);
859             
860             /**************************
861              * CALCULATE INTERACTIONS *
862              **************************/
863
864             /* REACTION-FIELD ELECTROSTATICS */
865             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
866             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
867
868             /* Update potential sum for this i atom from the interaction with this j atom. */
869             velec            = _mm_andnot_ps(dummy_mask,velec);
870             velecsum         = _mm_add_ps(velecsum,velec);
871
872             fscal            = felec;
873
874             fscal            = _mm_andnot_ps(dummy_mask,fscal);
875
876             /* Calculate temporary vectorial force */
877             tx               = _mm_mul_ps(fscal,dx22);
878             ty               = _mm_mul_ps(fscal,dy22);
879             tz               = _mm_mul_ps(fscal,dz22);
880
881             /* Update vectorial force */
882             fix2             = _mm_add_ps(fix2,tx);
883             fiy2             = _mm_add_ps(fiy2,ty);
884             fiz2             = _mm_add_ps(fiz2,tz);
885
886             fjx2             = _mm_add_ps(fjx2,tx);
887             fjy2             = _mm_add_ps(fjy2,ty);
888             fjz2             = _mm_add_ps(fjz2,tz);
889             
890             /**************************
891              * CALCULATE INTERACTIONS *
892              **************************/
893
894             /* REACTION-FIELD ELECTROSTATICS */
895             velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
896             felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
897
898             /* Update potential sum for this i atom from the interaction with this j atom. */
899             velec            = _mm_andnot_ps(dummy_mask,velec);
900             velecsum         = _mm_add_ps(velecsum,velec);
901
902             fscal            = felec;
903
904             fscal            = _mm_andnot_ps(dummy_mask,fscal);
905
906             /* Calculate temporary vectorial force */
907             tx               = _mm_mul_ps(fscal,dx23);
908             ty               = _mm_mul_ps(fscal,dy23);
909             tz               = _mm_mul_ps(fscal,dz23);
910
911             /* Update vectorial force */
912             fix2             = _mm_add_ps(fix2,tx);
913             fiy2             = _mm_add_ps(fiy2,ty);
914             fiz2             = _mm_add_ps(fiz2,tz);
915
916             fjx3             = _mm_add_ps(fjx3,tx);
917             fjy3             = _mm_add_ps(fjy3,ty);
918             fjz3             = _mm_add_ps(fjz3,tz);
919             
920             /**************************
921              * CALCULATE INTERACTIONS *
922              **************************/
923
924             /* REACTION-FIELD ELECTROSTATICS */
925             velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
926             felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
927
928             /* Update potential sum for this i atom from the interaction with this j atom. */
929             velec            = _mm_andnot_ps(dummy_mask,velec);
930             velecsum         = _mm_add_ps(velecsum,velec);
931
932             fscal            = felec;
933
934             fscal            = _mm_andnot_ps(dummy_mask,fscal);
935
936             /* Calculate temporary vectorial force */
937             tx               = _mm_mul_ps(fscal,dx31);
938             ty               = _mm_mul_ps(fscal,dy31);
939             tz               = _mm_mul_ps(fscal,dz31);
940
941             /* Update vectorial force */
942             fix3             = _mm_add_ps(fix3,tx);
943             fiy3             = _mm_add_ps(fiy3,ty);
944             fiz3             = _mm_add_ps(fiz3,tz);
945
946             fjx1             = _mm_add_ps(fjx1,tx);
947             fjy1             = _mm_add_ps(fjy1,ty);
948             fjz1             = _mm_add_ps(fjz1,tz);
949             
950             /**************************
951              * CALCULATE INTERACTIONS *
952              **************************/
953
954             /* REACTION-FIELD ELECTROSTATICS */
955             velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
956             felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
957
958             /* Update potential sum for this i atom from the interaction with this j atom. */
959             velec            = _mm_andnot_ps(dummy_mask,velec);
960             velecsum         = _mm_add_ps(velecsum,velec);
961
962             fscal            = felec;
963
964             fscal            = _mm_andnot_ps(dummy_mask,fscal);
965
966             /* Calculate temporary vectorial force */
967             tx               = _mm_mul_ps(fscal,dx32);
968             ty               = _mm_mul_ps(fscal,dy32);
969             tz               = _mm_mul_ps(fscal,dz32);
970
971             /* Update vectorial force */
972             fix3             = _mm_add_ps(fix3,tx);
973             fiy3             = _mm_add_ps(fiy3,ty);
974             fiz3             = _mm_add_ps(fiz3,tz);
975
976             fjx2             = _mm_add_ps(fjx2,tx);
977             fjy2             = _mm_add_ps(fjy2,ty);
978             fjz2             = _mm_add_ps(fjz2,tz);
979             
980             /**************************
981              * CALCULATE INTERACTIONS *
982              **************************/
983
984             /* REACTION-FIELD ELECTROSTATICS */
985             velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
986             felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
987
988             /* Update potential sum for this i atom from the interaction with this j atom. */
989             velec            = _mm_andnot_ps(dummy_mask,velec);
990             velecsum         = _mm_add_ps(velecsum,velec);
991
992             fscal            = felec;
993
994             fscal            = _mm_andnot_ps(dummy_mask,fscal);
995
996             /* Calculate temporary vectorial force */
997             tx               = _mm_mul_ps(fscal,dx33);
998             ty               = _mm_mul_ps(fscal,dy33);
999             tz               = _mm_mul_ps(fscal,dz33);
1000
1001             /* Update vectorial force */
1002             fix3             = _mm_add_ps(fix3,tx);
1003             fiy3             = _mm_add_ps(fiy3,ty);
1004             fiz3             = _mm_add_ps(fiz3,tz);
1005
1006             fjx3             = _mm_add_ps(fjx3,tx);
1007             fjy3             = _mm_add_ps(fjy3,ty);
1008             fjz3             = _mm_add_ps(fjz3,tz);
1009             
1010             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1011             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1012             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1013             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1014
1015             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1016                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1017                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1018
1019             /* Inner loop uses 323 flops */
1020         }
1021
1022         /* End of innermost loop */
1023
1024         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1025                                               f+i_coord_offset,fshift+i_shift_offset);
1026
1027         ggid                        = gid[iidx];
1028         /* Update potential energies */
1029         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1030         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1031
1032         /* Increment number of inner iterations */
1033         inneriter                  += j_index_end - j_index_start;
1034
1035         /* Outer loop uses 26 flops */
1036     }
1037
1038     /* Increment number of outer iterations */
1039     outeriter        += nri;
1040
1041     /* Update outer/inner flops */
1042
1043     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*323);
1044 }
1045 /*
1046  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sse2_single
1047  * Electrostatics interaction: ReactionField
1048  * VdW interaction:            LennardJones
1049  * Geometry:                   Water4-Water4
1050  * Calculate force/pot:        Force
1051  */
1052 void
1053 nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sse2_single
1054                     (t_nblist                    * gmx_restrict       nlist,
1055                      rvec                        * gmx_restrict          xx,
1056                      rvec                        * gmx_restrict          ff,
1057                      t_forcerec                  * gmx_restrict          fr,
1058                      t_mdatoms                   * gmx_restrict     mdatoms,
1059                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1060                      t_nrnb                      * gmx_restrict        nrnb)
1061 {
1062     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1063      * just 0 for non-waters.
1064      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1065      * jnr indices corresponding to data put in the four positions in the SIMD register.
1066      */
1067     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1068     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1069     int              jnrA,jnrB,jnrC,jnrD;
1070     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1071     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1072     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1073     real             rcutoff_scalar;
1074     real             *shiftvec,*fshift,*x,*f;
1075     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1076     real             scratch[4*DIM];
1077     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1078     int              vdwioffset0;
1079     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1080     int              vdwioffset1;
1081     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1082     int              vdwioffset2;
1083     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1084     int              vdwioffset3;
1085     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1086     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1087     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1088     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1089     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1090     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1091     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1092     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1093     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1094     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1095     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1096     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1097     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1098     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1099     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1100     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1101     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1102     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1103     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1104     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1105     real             *charge;
1106     int              nvdwtype;
1107     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1108     int              *vdwtype;
1109     real             *vdwparam;
1110     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1111     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1112     __m128           dummy_mask,cutoff_mask;
1113     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1114     __m128           one     = _mm_set1_ps(1.0);
1115     __m128           two     = _mm_set1_ps(2.0);
1116     x                = xx[0];
1117     f                = ff[0];
1118
1119     nri              = nlist->nri;
1120     iinr             = nlist->iinr;
1121     jindex           = nlist->jindex;
1122     jjnr             = nlist->jjnr;
1123     shiftidx         = nlist->shift;
1124     gid              = nlist->gid;
1125     shiftvec         = fr->shift_vec[0];
1126     fshift           = fr->fshift[0];
1127     facel            = _mm_set1_ps(fr->epsfac);
1128     charge           = mdatoms->chargeA;
1129     krf              = _mm_set1_ps(fr->ic->k_rf);
1130     krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
1131     crf              = _mm_set1_ps(fr->ic->c_rf);
1132     nvdwtype         = fr->ntype;
1133     vdwparam         = fr->nbfp;
1134     vdwtype          = mdatoms->typeA;
1135
1136     /* Setup water-specific parameters */
1137     inr              = nlist->iinr[0];
1138     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1139     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1140     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1141     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1142
1143     jq1              = _mm_set1_ps(charge[inr+1]);
1144     jq2              = _mm_set1_ps(charge[inr+2]);
1145     jq3              = _mm_set1_ps(charge[inr+3]);
1146     vdwjidx0A        = 2*vdwtype[inr+0];
1147     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1148     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1149     qq11             = _mm_mul_ps(iq1,jq1);
1150     qq12             = _mm_mul_ps(iq1,jq2);
1151     qq13             = _mm_mul_ps(iq1,jq3);
1152     qq21             = _mm_mul_ps(iq2,jq1);
1153     qq22             = _mm_mul_ps(iq2,jq2);
1154     qq23             = _mm_mul_ps(iq2,jq3);
1155     qq31             = _mm_mul_ps(iq3,jq1);
1156     qq32             = _mm_mul_ps(iq3,jq2);
1157     qq33             = _mm_mul_ps(iq3,jq3);
1158
1159     /* Avoid stupid compiler warnings */
1160     jnrA = jnrB = jnrC = jnrD = 0;
1161     j_coord_offsetA = 0;
1162     j_coord_offsetB = 0;
1163     j_coord_offsetC = 0;
1164     j_coord_offsetD = 0;
1165
1166     outeriter        = 0;
1167     inneriter        = 0;
1168
1169     for(iidx=0;iidx<4*DIM;iidx++)
1170     {
1171         scratch[iidx] = 0.0;
1172     }  
1173
1174     /* Start outer loop over neighborlists */
1175     for(iidx=0; iidx<nri; iidx++)
1176     {
1177         /* Load shift vector for this list */
1178         i_shift_offset   = DIM*shiftidx[iidx];
1179
1180         /* Load limits for loop over neighbors */
1181         j_index_start    = jindex[iidx];
1182         j_index_end      = jindex[iidx+1];
1183
1184         /* Get outer coordinate index */
1185         inr              = iinr[iidx];
1186         i_coord_offset   = DIM*inr;
1187
1188         /* Load i particle coords and add shift vector */
1189         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1190                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1191         
1192         fix0             = _mm_setzero_ps();
1193         fiy0             = _mm_setzero_ps();
1194         fiz0             = _mm_setzero_ps();
1195         fix1             = _mm_setzero_ps();
1196         fiy1             = _mm_setzero_ps();
1197         fiz1             = _mm_setzero_ps();
1198         fix2             = _mm_setzero_ps();
1199         fiy2             = _mm_setzero_ps();
1200         fiz2             = _mm_setzero_ps();
1201         fix3             = _mm_setzero_ps();
1202         fiy3             = _mm_setzero_ps();
1203         fiz3             = _mm_setzero_ps();
1204
1205         /* Start inner kernel loop */
1206         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1207         {
1208
1209             /* Get j neighbor index, and coordinate index */
1210             jnrA             = jjnr[jidx];
1211             jnrB             = jjnr[jidx+1];
1212             jnrC             = jjnr[jidx+2];
1213             jnrD             = jjnr[jidx+3];
1214             j_coord_offsetA  = DIM*jnrA;
1215             j_coord_offsetB  = DIM*jnrB;
1216             j_coord_offsetC  = DIM*jnrC;
1217             j_coord_offsetD  = DIM*jnrD;
1218
1219             /* load j atom coordinates */
1220             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1221                                               x+j_coord_offsetC,x+j_coord_offsetD,
1222                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1223                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1224
1225             /* Calculate displacement vector */
1226             dx00             = _mm_sub_ps(ix0,jx0);
1227             dy00             = _mm_sub_ps(iy0,jy0);
1228             dz00             = _mm_sub_ps(iz0,jz0);
1229             dx11             = _mm_sub_ps(ix1,jx1);
1230             dy11             = _mm_sub_ps(iy1,jy1);
1231             dz11             = _mm_sub_ps(iz1,jz1);
1232             dx12             = _mm_sub_ps(ix1,jx2);
1233             dy12             = _mm_sub_ps(iy1,jy2);
1234             dz12             = _mm_sub_ps(iz1,jz2);
1235             dx13             = _mm_sub_ps(ix1,jx3);
1236             dy13             = _mm_sub_ps(iy1,jy3);
1237             dz13             = _mm_sub_ps(iz1,jz3);
1238             dx21             = _mm_sub_ps(ix2,jx1);
1239             dy21             = _mm_sub_ps(iy2,jy1);
1240             dz21             = _mm_sub_ps(iz2,jz1);
1241             dx22             = _mm_sub_ps(ix2,jx2);
1242             dy22             = _mm_sub_ps(iy2,jy2);
1243             dz22             = _mm_sub_ps(iz2,jz2);
1244             dx23             = _mm_sub_ps(ix2,jx3);
1245             dy23             = _mm_sub_ps(iy2,jy3);
1246             dz23             = _mm_sub_ps(iz2,jz3);
1247             dx31             = _mm_sub_ps(ix3,jx1);
1248             dy31             = _mm_sub_ps(iy3,jy1);
1249             dz31             = _mm_sub_ps(iz3,jz1);
1250             dx32             = _mm_sub_ps(ix3,jx2);
1251             dy32             = _mm_sub_ps(iy3,jy2);
1252             dz32             = _mm_sub_ps(iz3,jz2);
1253             dx33             = _mm_sub_ps(ix3,jx3);
1254             dy33             = _mm_sub_ps(iy3,jy3);
1255             dz33             = _mm_sub_ps(iz3,jz3);
1256
1257             /* Calculate squared distance and things based on it */
1258             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1259             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1260             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1261             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1262             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1263             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1264             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1265             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1266             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1267             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1268
1269             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1270             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1271             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1272             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1273             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1274             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1275             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1276             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1277             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1278
1279             rinvsq00         = gmx_mm_inv_ps(rsq00);
1280             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1281             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1282             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1283             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1284             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1285             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1286             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1287             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1288             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1289
1290             fjx0             = _mm_setzero_ps();
1291             fjy0             = _mm_setzero_ps();
1292             fjz0             = _mm_setzero_ps();
1293             fjx1             = _mm_setzero_ps();
1294             fjy1             = _mm_setzero_ps();
1295             fjz1             = _mm_setzero_ps();
1296             fjx2             = _mm_setzero_ps();
1297             fjy2             = _mm_setzero_ps();
1298             fjz2             = _mm_setzero_ps();
1299             fjx3             = _mm_setzero_ps();
1300             fjy3             = _mm_setzero_ps();
1301             fjz3             = _mm_setzero_ps();
1302
1303             /**************************
1304              * CALCULATE INTERACTIONS *
1305              **************************/
1306
1307             /* LENNARD-JONES DISPERSION/REPULSION */
1308
1309             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1310             fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1311
1312             fscal            = fvdw;
1313
1314             /* Calculate temporary vectorial force */
1315             tx               = _mm_mul_ps(fscal,dx00);
1316             ty               = _mm_mul_ps(fscal,dy00);
1317             tz               = _mm_mul_ps(fscal,dz00);
1318
1319             /* Update vectorial force */
1320             fix0             = _mm_add_ps(fix0,tx);
1321             fiy0             = _mm_add_ps(fiy0,ty);
1322             fiz0             = _mm_add_ps(fiz0,tz);
1323
1324             fjx0             = _mm_add_ps(fjx0,tx);
1325             fjy0             = _mm_add_ps(fjy0,ty);
1326             fjz0             = _mm_add_ps(fjz0,tz);
1327             
1328             /**************************
1329              * CALCULATE INTERACTIONS *
1330              **************************/
1331
1332             /* REACTION-FIELD ELECTROSTATICS */
1333             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1334
1335             fscal            = felec;
1336
1337             /* Calculate temporary vectorial force */
1338             tx               = _mm_mul_ps(fscal,dx11);
1339             ty               = _mm_mul_ps(fscal,dy11);
1340             tz               = _mm_mul_ps(fscal,dz11);
1341
1342             /* Update vectorial force */
1343             fix1             = _mm_add_ps(fix1,tx);
1344             fiy1             = _mm_add_ps(fiy1,ty);
1345             fiz1             = _mm_add_ps(fiz1,tz);
1346
1347             fjx1             = _mm_add_ps(fjx1,tx);
1348             fjy1             = _mm_add_ps(fjy1,ty);
1349             fjz1             = _mm_add_ps(fjz1,tz);
1350             
1351             /**************************
1352              * CALCULATE INTERACTIONS *
1353              **************************/
1354
1355             /* REACTION-FIELD ELECTROSTATICS */
1356             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1357
1358             fscal            = felec;
1359
1360             /* Calculate temporary vectorial force */
1361             tx               = _mm_mul_ps(fscal,dx12);
1362             ty               = _mm_mul_ps(fscal,dy12);
1363             tz               = _mm_mul_ps(fscal,dz12);
1364
1365             /* Update vectorial force */
1366             fix1             = _mm_add_ps(fix1,tx);
1367             fiy1             = _mm_add_ps(fiy1,ty);
1368             fiz1             = _mm_add_ps(fiz1,tz);
1369
1370             fjx2             = _mm_add_ps(fjx2,tx);
1371             fjy2             = _mm_add_ps(fjy2,ty);
1372             fjz2             = _mm_add_ps(fjz2,tz);
1373             
1374             /**************************
1375              * CALCULATE INTERACTIONS *
1376              **************************/
1377
1378             /* REACTION-FIELD ELECTROSTATICS */
1379             felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
1380
1381             fscal            = felec;
1382
1383             /* Calculate temporary vectorial force */
1384             tx               = _mm_mul_ps(fscal,dx13);
1385             ty               = _mm_mul_ps(fscal,dy13);
1386             tz               = _mm_mul_ps(fscal,dz13);
1387
1388             /* Update vectorial force */
1389             fix1             = _mm_add_ps(fix1,tx);
1390             fiy1             = _mm_add_ps(fiy1,ty);
1391             fiz1             = _mm_add_ps(fiz1,tz);
1392
1393             fjx3             = _mm_add_ps(fjx3,tx);
1394             fjy3             = _mm_add_ps(fjy3,ty);
1395             fjz3             = _mm_add_ps(fjz3,tz);
1396             
1397             /**************************
1398              * CALCULATE INTERACTIONS *
1399              **************************/
1400
1401             /* REACTION-FIELD ELECTROSTATICS */
1402             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1403
1404             fscal            = felec;
1405
1406             /* Calculate temporary vectorial force */
1407             tx               = _mm_mul_ps(fscal,dx21);
1408             ty               = _mm_mul_ps(fscal,dy21);
1409             tz               = _mm_mul_ps(fscal,dz21);
1410
1411             /* Update vectorial force */
1412             fix2             = _mm_add_ps(fix2,tx);
1413             fiy2             = _mm_add_ps(fiy2,ty);
1414             fiz2             = _mm_add_ps(fiz2,tz);
1415
1416             fjx1             = _mm_add_ps(fjx1,tx);
1417             fjy1             = _mm_add_ps(fjy1,ty);
1418             fjz1             = _mm_add_ps(fjz1,tz);
1419             
1420             /**************************
1421              * CALCULATE INTERACTIONS *
1422              **************************/
1423
1424             /* REACTION-FIELD ELECTROSTATICS */
1425             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1426
1427             fscal            = felec;
1428
1429             /* Calculate temporary vectorial force */
1430             tx               = _mm_mul_ps(fscal,dx22);
1431             ty               = _mm_mul_ps(fscal,dy22);
1432             tz               = _mm_mul_ps(fscal,dz22);
1433
1434             /* Update vectorial force */
1435             fix2             = _mm_add_ps(fix2,tx);
1436             fiy2             = _mm_add_ps(fiy2,ty);
1437             fiz2             = _mm_add_ps(fiz2,tz);
1438
1439             fjx2             = _mm_add_ps(fjx2,tx);
1440             fjy2             = _mm_add_ps(fjy2,ty);
1441             fjz2             = _mm_add_ps(fjz2,tz);
1442             
1443             /**************************
1444              * CALCULATE INTERACTIONS *
1445              **************************/
1446
1447             /* REACTION-FIELD ELECTROSTATICS */
1448             felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
1449
1450             fscal            = felec;
1451
1452             /* Calculate temporary vectorial force */
1453             tx               = _mm_mul_ps(fscal,dx23);
1454             ty               = _mm_mul_ps(fscal,dy23);
1455             tz               = _mm_mul_ps(fscal,dz23);
1456
1457             /* Update vectorial force */
1458             fix2             = _mm_add_ps(fix2,tx);
1459             fiy2             = _mm_add_ps(fiy2,ty);
1460             fiz2             = _mm_add_ps(fiz2,tz);
1461
1462             fjx3             = _mm_add_ps(fjx3,tx);
1463             fjy3             = _mm_add_ps(fjy3,ty);
1464             fjz3             = _mm_add_ps(fjz3,tz);
1465             
1466             /**************************
1467              * CALCULATE INTERACTIONS *
1468              **************************/
1469
1470             /* REACTION-FIELD ELECTROSTATICS */
1471             felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
1472
1473             fscal            = felec;
1474
1475             /* Calculate temporary vectorial force */
1476             tx               = _mm_mul_ps(fscal,dx31);
1477             ty               = _mm_mul_ps(fscal,dy31);
1478             tz               = _mm_mul_ps(fscal,dz31);
1479
1480             /* Update vectorial force */
1481             fix3             = _mm_add_ps(fix3,tx);
1482             fiy3             = _mm_add_ps(fiy3,ty);
1483             fiz3             = _mm_add_ps(fiz3,tz);
1484
1485             fjx1             = _mm_add_ps(fjx1,tx);
1486             fjy1             = _mm_add_ps(fjy1,ty);
1487             fjz1             = _mm_add_ps(fjz1,tz);
1488             
1489             /**************************
1490              * CALCULATE INTERACTIONS *
1491              **************************/
1492
1493             /* REACTION-FIELD ELECTROSTATICS */
1494             felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
1495
1496             fscal            = felec;
1497
1498             /* Calculate temporary vectorial force */
1499             tx               = _mm_mul_ps(fscal,dx32);
1500             ty               = _mm_mul_ps(fscal,dy32);
1501             tz               = _mm_mul_ps(fscal,dz32);
1502
1503             /* Update vectorial force */
1504             fix3             = _mm_add_ps(fix3,tx);
1505             fiy3             = _mm_add_ps(fiy3,ty);
1506             fiz3             = _mm_add_ps(fiz3,tz);
1507
1508             fjx2             = _mm_add_ps(fjx2,tx);
1509             fjy2             = _mm_add_ps(fjy2,ty);
1510             fjz2             = _mm_add_ps(fjz2,tz);
1511             
1512             /**************************
1513              * CALCULATE INTERACTIONS *
1514              **************************/
1515
1516             /* REACTION-FIELD ELECTROSTATICS */
1517             felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
1518
1519             fscal            = felec;
1520
1521             /* Calculate temporary vectorial force */
1522             tx               = _mm_mul_ps(fscal,dx33);
1523             ty               = _mm_mul_ps(fscal,dy33);
1524             tz               = _mm_mul_ps(fscal,dz33);
1525
1526             /* Update vectorial force */
1527             fix3             = _mm_add_ps(fix3,tx);
1528             fiy3             = _mm_add_ps(fiy3,ty);
1529             fiz3             = _mm_add_ps(fiz3,tz);
1530
1531             fjx3             = _mm_add_ps(fjx3,tx);
1532             fjy3             = _mm_add_ps(fjy3,ty);
1533             fjz3             = _mm_add_ps(fjz3,tz);
1534             
1535             fjptrA             = f+j_coord_offsetA;
1536             fjptrB             = f+j_coord_offsetB;
1537             fjptrC             = f+j_coord_offsetC;
1538             fjptrD             = f+j_coord_offsetD;
1539
1540             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1541                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1542                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1543
1544             /* Inner loop uses 273 flops */
1545         }
1546
1547         if(jidx<j_index_end)
1548         {
1549
1550             /* Get j neighbor index, and coordinate index */
1551             jnrlistA         = jjnr[jidx];
1552             jnrlistB         = jjnr[jidx+1];
1553             jnrlistC         = jjnr[jidx+2];
1554             jnrlistD         = jjnr[jidx+3];
1555             /* Sign of each element will be negative for non-real atoms.
1556              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1557              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1558              */
1559             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1560             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1561             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1562             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1563             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1564             j_coord_offsetA  = DIM*jnrA;
1565             j_coord_offsetB  = DIM*jnrB;
1566             j_coord_offsetC  = DIM*jnrC;
1567             j_coord_offsetD  = DIM*jnrD;
1568
1569             /* load j atom coordinates */
1570             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1571                                               x+j_coord_offsetC,x+j_coord_offsetD,
1572                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1573                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1574
1575             /* Calculate displacement vector */
1576             dx00             = _mm_sub_ps(ix0,jx0);
1577             dy00             = _mm_sub_ps(iy0,jy0);
1578             dz00             = _mm_sub_ps(iz0,jz0);
1579             dx11             = _mm_sub_ps(ix1,jx1);
1580             dy11             = _mm_sub_ps(iy1,jy1);
1581             dz11             = _mm_sub_ps(iz1,jz1);
1582             dx12             = _mm_sub_ps(ix1,jx2);
1583             dy12             = _mm_sub_ps(iy1,jy2);
1584             dz12             = _mm_sub_ps(iz1,jz2);
1585             dx13             = _mm_sub_ps(ix1,jx3);
1586             dy13             = _mm_sub_ps(iy1,jy3);
1587             dz13             = _mm_sub_ps(iz1,jz3);
1588             dx21             = _mm_sub_ps(ix2,jx1);
1589             dy21             = _mm_sub_ps(iy2,jy1);
1590             dz21             = _mm_sub_ps(iz2,jz1);
1591             dx22             = _mm_sub_ps(ix2,jx2);
1592             dy22             = _mm_sub_ps(iy2,jy2);
1593             dz22             = _mm_sub_ps(iz2,jz2);
1594             dx23             = _mm_sub_ps(ix2,jx3);
1595             dy23             = _mm_sub_ps(iy2,jy3);
1596             dz23             = _mm_sub_ps(iz2,jz3);
1597             dx31             = _mm_sub_ps(ix3,jx1);
1598             dy31             = _mm_sub_ps(iy3,jy1);
1599             dz31             = _mm_sub_ps(iz3,jz1);
1600             dx32             = _mm_sub_ps(ix3,jx2);
1601             dy32             = _mm_sub_ps(iy3,jy2);
1602             dz32             = _mm_sub_ps(iz3,jz2);
1603             dx33             = _mm_sub_ps(ix3,jx3);
1604             dy33             = _mm_sub_ps(iy3,jy3);
1605             dz33             = _mm_sub_ps(iz3,jz3);
1606
1607             /* Calculate squared distance and things based on it */
1608             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1609             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1610             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1611             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1612             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1613             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1614             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1615             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1616             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1617             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1618
1619             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1620             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1621             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1622             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1623             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1624             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1625             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1626             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1627             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1628
1629             rinvsq00         = gmx_mm_inv_ps(rsq00);
1630             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1631             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1632             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1633             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1634             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1635             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1636             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1637             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1638             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1639
1640             fjx0             = _mm_setzero_ps();
1641             fjy0             = _mm_setzero_ps();
1642             fjz0             = _mm_setzero_ps();
1643             fjx1             = _mm_setzero_ps();
1644             fjy1             = _mm_setzero_ps();
1645             fjz1             = _mm_setzero_ps();
1646             fjx2             = _mm_setzero_ps();
1647             fjy2             = _mm_setzero_ps();
1648             fjz2             = _mm_setzero_ps();
1649             fjx3             = _mm_setzero_ps();
1650             fjy3             = _mm_setzero_ps();
1651             fjz3             = _mm_setzero_ps();
1652
1653             /**************************
1654              * CALCULATE INTERACTIONS *
1655              **************************/
1656
1657             /* LENNARD-JONES DISPERSION/REPULSION */
1658
1659             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1660             fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1661
1662             fscal            = fvdw;
1663
1664             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1665
1666             /* Calculate temporary vectorial force */
1667             tx               = _mm_mul_ps(fscal,dx00);
1668             ty               = _mm_mul_ps(fscal,dy00);
1669             tz               = _mm_mul_ps(fscal,dz00);
1670
1671             /* Update vectorial force */
1672             fix0             = _mm_add_ps(fix0,tx);
1673             fiy0             = _mm_add_ps(fiy0,ty);
1674             fiz0             = _mm_add_ps(fiz0,tz);
1675
1676             fjx0             = _mm_add_ps(fjx0,tx);
1677             fjy0             = _mm_add_ps(fjy0,ty);
1678             fjz0             = _mm_add_ps(fjz0,tz);
1679             
1680             /**************************
1681              * CALCULATE INTERACTIONS *
1682              **************************/
1683
1684             /* REACTION-FIELD ELECTROSTATICS */
1685             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1686
1687             fscal            = felec;
1688
1689             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1690
1691             /* Calculate temporary vectorial force */
1692             tx               = _mm_mul_ps(fscal,dx11);
1693             ty               = _mm_mul_ps(fscal,dy11);
1694             tz               = _mm_mul_ps(fscal,dz11);
1695
1696             /* Update vectorial force */
1697             fix1             = _mm_add_ps(fix1,tx);
1698             fiy1             = _mm_add_ps(fiy1,ty);
1699             fiz1             = _mm_add_ps(fiz1,tz);
1700
1701             fjx1             = _mm_add_ps(fjx1,tx);
1702             fjy1             = _mm_add_ps(fjy1,ty);
1703             fjz1             = _mm_add_ps(fjz1,tz);
1704             
1705             /**************************
1706              * CALCULATE INTERACTIONS *
1707              **************************/
1708
1709             /* REACTION-FIELD ELECTROSTATICS */
1710             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1711
1712             fscal            = felec;
1713
1714             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1715
1716             /* Calculate temporary vectorial force */
1717             tx               = _mm_mul_ps(fscal,dx12);
1718             ty               = _mm_mul_ps(fscal,dy12);
1719             tz               = _mm_mul_ps(fscal,dz12);
1720
1721             /* Update vectorial force */
1722             fix1             = _mm_add_ps(fix1,tx);
1723             fiy1             = _mm_add_ps(fiy1,ty);
1724             fiz1             = _mm_add_ps(fiz1,tz);
1725
1726             fjx2             = _mm_add_ps(fjx2,tx);
1727             fjy2             = _mm_add_ps(fjy2,ty);
1728             fjz2             = _mm_add_ps(fjz2,tz);
1729             
1730             /**************************
1731              * CALCULATE INTERACTIONS *
1732              **************************/
1733
1734             /* REACTION-FIELD ELECTROSTATICS */
1735             felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
1736
1737             fscal            = felec;
1738
1739             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1740
1741             /* Calculate temporary vectorial force */
1742             tx               = _mm_mul_ps(fscal,dx13);
1743             ty               = _mm_mul_ps(fscal,dy13);
1744             tz               = _mm_mul_ps(fscal,dz13);
1745
1746             /* Update vectorial force */
1747             fix1             = _mm_add_ps(fix1,tx);
1748             fiy1             = _mm_add_ps(fiy1,ty);
1749             fiz1             = _mm_add_ps(fiz1,tz);
1750
1751             fjx3             = _mm_add_ps(fjx3,tx);
1752             fjy3             = _mm_add_ps(fjy3,ty);
1753             fjz3             = _mm_add_ps(fjz3,tz);
1754             
1755             /**************************
1756              * CALCULATE INTERACTIONS *
1757              **************************/
1758
1759             /* REACTION-FIELD ELECTROSTATICS */
1760             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1761
1762             fscal            = felec;
1763
1764             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1765
1766             /* Calculate temporary vectorial force */
1767             tx               = _mm_mul_ps(fscal,dx21);
1768             ty               = _mm_mul_ps(fscal,dy21);
1769             tz               = _mm_mul_ps(fscal,dz21);
1770
1771             /* Update vectorial force */
1772             fix2             = _mm_add_ps(fix2,tx);
1773             fiy2             = _mm_add_ps(fiy2,ty);
1774             fiz2             = _mm_add_ps(fiz2,tz);
1775
1776             fjx1             = _mm_add_ps(fjx1,tx);
1777             fjy1             = _mm_add_ps(fjy1,ty);
1778             fjz1             = _mm_add_ps(fjz1,tz);
1779             
1780             /**************************
1781              * CALCULATE INTERACTIONS *
1782              **************************/
1783
1784             /* REACTION-FIELD ELECTROSTATICS */
1785             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1786
1787             fscal            = felec;
1788
1789             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1790
1791             /* Calculate temporary vectorial force */
1792             tx               = _mm_mul_ps(fscal,dx22);
1793             ty               = _mm_mul_ps(fscal,dy22);
1794             tz               = _mm_mul_ps(fscal,dz22);
1795
1796             /* Update vectorial force */
1797             fix2             = _mm_add_ps(fix2,tx);
1798             fiy2             = _mm_add_ps(fiy2,ty);
1799             fiz2             = _mm_add_ps(fiz2,tz);
1800
1801             fjx2             = _mm_add_ps(fjx2,tx);
1802             fjy2             = _mm_add_ps(fjy2,ty);
1803             fjz2             = _mm_add_ps(fjz2,tz);
1804             
1805             /**************************
1806              * CALCULATE INTERACTIONS *
1807              **************************/
1808
1809             /* REACTION-FIELD ELECTROSTATICS */
1810             felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
1811
1812             fscal            = felec;
1813
1814             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1815
1816             /* Calculate temporary vectorial force */
1817             tx               = _mm_mul_ps(fscal,dx23);
1818             ty               = _mm_mul_ps(fscal,dy23);
1819             tz               = _mm_mul_ps(fscal,dz23);
1820
1821             /* Update vectorial force */
1822             fix2             = _mm_add_ps(fix2,tx);
1823             fiy2             = _mm_add_ps(fiy2,ty);
1824             fiz2             = _mm_add_ps(fiz2,tz);
1825
1826             fjx3             = _mm_add_ps(fjx3,tx);
1827             fjy3             = _mm_add_ps(fjy3,ty);
1828             fjz3             = _mm_add_ps(fjz3,tz);
1829             
1830             /**************************
1831              * CALCULATE INTERACTIONS *
1832              **************************/
1833
1834             /* REACTION-FIELD ELECTROSTATICS */
1835             felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
1836
1837             fscal            = felec;
1838
1839             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1840
1841             /* Calculate temporary vectorial force */
1842             tx               = _mm_mul_ps(fscal,dx31);
1843             ty               = _mm_mul_ps(fscal,dy31);
1844             tz               = _mm_mul_ps(fscal,dz31);
1845
1846             /* Update vectorial force */
1847             fix3             = _mm_add_ps(fix3,tx);
1848             fiy3             = _mm_add_ps(fiy3,ty);
1849             fiz3             = _mm_add_ps(fiz3,tz);
1850
1851             fjx1             = _mm_add_ps(fjx1,tx);
1852             fjy1             = _mm_add_ps(fjy1,ty);
1853             fjz1             = _mm_add_ps(fjz1,tz);
1854             
1855             /**************************
1856              * CALCULATE INTERACTIONS *
1857              **************************/
1858
1859             /* REACTION-FIELD ELECTROSTATICS */
1860             felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
1861
1862             fscal            = felec;
1863
1864             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1865
1866             /* Calculate temporary vectorial force */
1867             tx               = _mm_mul_ps(fscal,dx32);
1868             ty               = _mm_mul_ps(fscal,dy32);
1869             tz               = _mm_mul_ps(fscal,dz32);
1870
1871             /* Update vectorial force */
1872             fix3             = _mm_add_ps(fix3,tx);
1873             fiy3             = _mm_add_ps(fiy3,ty);
1874             fiz3             = _mm_add_ps(fiz3,tz);
1875
1876             fjx2             = _mm_add_ps(fjx2,tx);
1877             fjy2             = _mm_add_ps(fjy2,ty);
1878             fjz2             = _mm_add_ps(fjz2,tz);
1879             
1880             /**************************
1881              * CALCULATE INTERACTIONS *
1882              **************************/
1883
1884             /* REACTION-FIELD ELECTROSTATICS */
1885             felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
1886
1887             fscal            = felec;
1888
1889             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1890
1891             /* Calculate temporary vectorial force */
1892             tx               = _mm_mul_ps(fscal,dx33);
1893             ty               = _mm_mul_ps(fscal,dy33);
1894             tz               = _mm_mul_ps(fscal,dz33);
1895
1896             /* Update vectorial force */
1897             fix3             = _mm_add_ps(fix3,tx);
1898             fiy3             = _mm_add_ps(fiy3,ty);
1899             fiz3             = _mm_add_ps(fiz3,tz);
1900
1901             fjx3             = _mm_add_ps(fjx3,tx);
1902             fjy3             = _mm_add_ps(fjy3,ty);
1903             fjz3             = _mm_add_ps(fjz3,tz);
1904             
1905             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1906             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1907             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1908             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1909
1910             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1911                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1912                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1913
1914             /* Inner loop uses 273 flops */
1915         }
1916
1917         /* End of innermost loop */
1918
1919         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1920                                               f+i_coord_offset,fshift+i_shift_offset);
1921
1922         /* Increment number of inner iterations */
1923         inneriter                  += j_index_end - j_index_start;
1924
1925         /* Outer loop uses 24 flops */
1926     }
1927
1928     /* Increment number of outer iterations */
1929     outeriter        += nri;
1930
1931     /* Update outer/inner flops */
1932
1933     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*273);
1934 }