Remove no-inline-max-size and suppress remark
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse4_1_single / nb_kernel_ElecRFCut_VdwNone_GeomW4W4_sse4_1_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37  */
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "vec.h"
47 #include "nrnb.h"
48
49 #include "gromacs/simd/math_x86_sse4_1_single.h"
50 #include "kernelutil_x86_sse4_1_single.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sse4_1_single
54  * Electrostatics interaction: ReactionField
55  * VdW interaction:            None
56  * Geometry:                   Water4-Water4
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sse4_1_single
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
70      * just 0 for non-waters.
71      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB,jnrC,jnrD;
77     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
80     real             rcutoff_scalar;
81     real             *shiftvec,*fshift,*x,*f;
82     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83     real             scratch[4*DIM];
84     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85     int              vdwioffset1;
86     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87     int              vdwioffset2;
88     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89     int              vdwioffset3;
90     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
92     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
94     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
96     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
97     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
98     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
99     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
100     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
101     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
102     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
103     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
104     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
105     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
106     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
107     real             *charge;
108     __m128           dummy_mask,cutoff_mask;
109     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
110     __m128           one     = _mm_set1_ps(1.0);
111     __m128           two     = _mm_set1_ps(2.0);
112     x                = xx[0];
113     f                = ff[0];
114
115     nri              = nlist->nri;
116     iinr             = nlist->iinr;
117     jindex           = nlist->jindex;
118     jjnr             = nlist->jjnr;
119     shiftidx         = nlist->shift;
120     gid              = nlist->gid;
121     shiftvec         = fr->shift_vec[0];
122     fshift           = fr->fshift[0];
123     facel            = _mm_set1_ps(fr->epsfac);
124     charge           = mdatoms->chargeA;
125     krf              = _mm_set1_ps(fr->ic->k_rf);
126     krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
127     crf              = _mm_set1_ps(fr->ic->c_rf);
128
129     /* Setup water-specific parameters */
130     inr              = nlist->iinr[0];
131     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
132     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
133     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
134
135     jq1              = _mm_set1_ps(charge[inr+1]);
136     jq2              = _mm_set1_ps(charge[inr+2]);
137     jq3              = _mm_set1_ps(charge[inr+3]);
138     qq11             = _mm_mul_ps(iq1,jq1);
139     qq12             = _mm_mul_ps(iq1,jq2);
140     qq13             = _mm_mul_ps(iq1,jq3);
141     qq21             = _mm_mul_ps(iq2,jq1);
142     qq22             = _mm_mul_ps(iq2,jq2);
143     qq23             = _mm_mul_ps(iq2,jq3);
144     qq31             = _mm_mul_ps(iq3,jq1);
145     qq32             = _mm_mul_ps(iq3,jq2);
146     qq33             = _mm_mul_ps(iq3,jq3);
147
148     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
149     rcutoff_scalar   = fr->rcoulomb;
150     rcutoff          = _mm_set1_ps(rcutoff_scalar);
151     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
152
153     /* Avoid stupid compiler warnings */
154     jnrA = jnrB = jnrC = jnrD = 0;
155     j_coord_offsetA = 0;
156     j_coord_offsetB = 0;
157     j_coord_offsetC = 0;
158     j_coord_offsetD = 0;
159
160     outeriter        = 0;
161     inneriter        = 0;
162
163     for(iidx=0;iidx<4*DIM;iidx++)
164     {
165         scratch[iidx] = 0.0;
166     }
167
168     /* Start outer loop over neighborlists */
169     for(iidx=0; iidx<nri; iidx++)
170     {
171         /* Load shift vector for this list */
172         i_shift_offset   = DIM*shiftidx[iidx];
173
174         /* Load limits for loop over neighbors */
175         j_index_start    = jindex[iidx];
176         j_index_end      = jindex[iidx+1];
177
178         /* Get outer coordinate index */
179         inr              = iinr[iidx];
180         i_coord_offset   = DIM*inr;
181
182         /* Load i particle coords and add shift vector */
183         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
184                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
185
186         fix1             = _mm_setzero_ps();
187         fiy1             = _mm_setzero_ps();
188         fiz1             = _mm_setzero_ps();
189         fix2             = _mm_setzero_ps();
190         fiy2             = _mm_setzero_ps();
191         fiz2             = _mm_setzero_ps();
192         fix3             = _mm_setzero_ps();
193         fiy3             = _mm_setzero_ps();
194         fiz3             = _mm_setzero_ps();
195
196         /* Reset potential sums */
197         velecsum         = _mm_setzero_ps();
198
199         /* Start inner kernel loop */
200         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
201         {
202
203             /* Get j neighbor index, and coordinate index */
204             jnrA             = jjnr[jidx];
205             jnrB             = jjnr[jidx+1];
206             jnrC             = jjnr[jidx+2];
207             jnrD             = jjnr[jidx+3];
208             j_coord_offsetA  = DIM*jnrA;
209             j_coord_offsetB  = DIM*jnrB;
210             j_coord_offsetC  = DIM*jnrC;
211             j_coord_offsetD  = DIM*jnrD;
212
213             /* load j atom coordinates */
214             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
215                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
216                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
217
218             /* Calculate displacement vector */
219             dx11             = _mm_sub_ps(ix1,jx1);
220             dy11             = _mm_sub_ps(iy1,jy1);
221             dz11             = _mm_sub_ps(iz1,jz1);
222             dx12             = _mm_sub_ps(ix1,jx2);
223             dy12             = _mm_sub_ps(iy1,jy2);
224             dz12             = _mm_sub_ps(iz1,jz2);
225             dx13             = _mm_sub_ps(ix1,jx3);
226             dy13             = _mm_sub_ps(iy1,jy3);
227             dz13             = _mm_sub_ps(iz1,jz3);
228             dx21             = _mm_sub_ps(ix2,jx1);
229             dy21             = _mm_sub_ps(iy2,jy1);
230             dz21             = _mm_sub_ps(iz2,jz1);
231             dx22             = _mm_sub_ps(ix2,jx2);
232             dy22             = _mm_sub_ps(iy2,jy2);
233             dz22             = _mm_sub_ps(iz2,jz2);
234             dx23             = _mm_sub_ps(ix2,jx3);
235             dy23             = _mm_sub_ps(iy2,jy3);
236             dz23             = _mm_sub_ps(iz2,jz3);
237             dx31             = _mm_sub_ps(ix3,jx1);
238             dy31             = _mm_sub_ps(iy3,jy1);
239             dz31             = _mm_sub_ps(iz3,jz1);
240             dx32             = _mm_sub_ps(ix3,jx2);
241             dy32             = _mm_sub_ps(iy3,jy2);
242             dz32             = _mm_sub_ps(iz3,jz2);
243             dx33             = _mm_sub_ps(ix3,jx3);
244             dy33             = _mm_sub_ps(iy3,jy3);
245             dz33             = _mm_sub_ps(iz3,jz3);
246
247             /* Calculate squared distance and things based on it */
248             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
249             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
250             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
251             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
252             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
253             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
254             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
255             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
256             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
257
258             rinv11           = gmx_mm_invsqrt_ps(rsq11);
259             rinv12           = gmx_mm_invsqrt_ps(rsq12);
260             rinv13           = gmx_mm_invsqrt_ps(rsq13);
261             rinv21           = gmx_mm_invsqrt_ps(rsq21);
262             rinv22           = gmx_mm_invsqrt_ps(rsq22);
263             rinv23           = gmx_mm_invsqrt_ps(rsq23);
264             rinv31           = gmx_mm_invsqrt_ps(rsq31);
265             rinv32           = gmx_mm_invsqrt_ps(rsq32);
266             rinv33           = gmx_mm_invsqrt_ps(rsq33);
267
268             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
269             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
270             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
271             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
272             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
273             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
274             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
275             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
276             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
277
278             fjx1             = _mm_setzero_ps();
279             fjy1             = _mm_setzero_ps();
280             fjz1             = _mm_setzero_ps();
281             fjx2             = _mm_setzero_ps();
282             fjy2             = _mm_setzero_ps();
283             fjz2             = _mm_setzero_ps();
284             fjx3             = _mm_setzero_ps();
285             fjy3             = _mm_setzero_ps();
286             fjz3             = _mm_setzero_ps();
287
288             /**************************
289              * CALCULATE INTERACTIONS *
290              **************************/
291
292             if (gmx_mm_any_lt(rsq11,rcutoff2))
293             {
294
295             /* REACTION-FIELD ELECTROSTATICS */
296             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
297             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
298
299             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
300
301             /* Update potential sum for this i atom from the interaction with this j atom. */
302             velec            = _mm_and_ps(velec,cutoff_mask);
303             velecsum         = _mm_add_ps(velecsum,velec);
304
305             fscal            = felec;
306
307             fscal            = _mm_and_ps(fscal,cutoff_mask);
308
309             /* Calculate temporary vectorial force */
310             tx               = _mm_mul_ps(fscal,dx11);
311             ty               = _mm_mul_ps(fscal,dy11);
312             tz               = _mm_mul_ps(fscal,dz11);
313
314             /* Update vectorial force */
315             fix1             = _mm_add_ps(fix1,tx);
316             fiy1             = _mm_add_ps(fiy1,ty);
317             fiz1             = _mm_add_ps(fiz1,tz);
318
319             fjx1             = _mm_add_ps(fjx1,tx);
320             fjy1             = _mm_add_ps(fjy1,ty);
321             fjz1             = _mm_add_ps(fjz1,tz);
322
323             }
324
325             /**************************
326              * CALCULATE INTERACTIONS *
327              **************************/
328
329             if (gmx_mm_any_lt(rsq12,rcutoff2))
330             {
331
332             /* REACTION-FIELD ELECTROSTATICS */
333             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
334             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
335
336             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
337
338             /* Update potential sum for this i atom from the interaction with this j atom. */
339             velec            = _mm_and_ps(velec,cutoff_mask);
340             velecsum         = _mm_add_ps(velecsum,velec);
341
342             fscal            = felec;
343
344             fscal            = _mm_and_ps(fscal,cutoff_mask);
345
346             /* Calculate temporary vectorial force */
347             tx               = _mm_mul_ps(fscal,dx12);
348             ty               = _mm_mul_ps(fscal,dy12);
349             tz               = _mm_mul_ps(fscal,dz12);
350
351             /* Update vectorial force */
352             fix1             = _mm_add_ps(fix1,tx);
353             fiy1             = _mm_add_ps(fiy1,ty);
354             fiz1             = _mm_add_ps(fiz1,tz);
355
356             fjx2             = _mm_add_ps(fjx2,tx);
357             fjy2             = _mm_add_ps(fjy2,ty);
358             fjz2             = _mm_add_ps(fjz2,tz);
359
360             }
361
362             /**************************
363              * CALCULATE INTERACTIONS *
364              **************************/
365
366             if (gmx_mm_any_lt(rsq13,rcutoff2))
367             {
368
369             /* REACTION-FIELD ELECTROSTATICS */
370             velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
371             felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
372
373             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
374
375             /* Update potential sum for this i atom from the interaction with this j atom. */
376             velec            = _mm_and_ps(velec,cutoff_mask);
377             velecsum         = _mm_add_ps(velecsum,velec);
378
379             fscal            = felec;
380
381             fscal            = _mm_and_ps(fscal,cutoff_mask);
382
383             /* Calculate temporary vectorial force */
384             tx               = _mm_mul_ps(fscal,dx13);
385             ty               = _mm_mul_ps(fscal,dy13);
386             tz               = _mm_mul_ps(fscal,dz13);
387
388             /* Update vectorial force */
389             fix1             = _mm_add_ps(fix1,tx);
390             fiy1             = _mm_add_ps(fiy1,ty);
391             fiz1             = _mm_add_ps(fiz1,tz);
392
393             fjx3             = _mm_add_ps(fjx3,tx);
394             fjy3             = _mm_add_ps(fjy3,ty);
395             fjz3             = _mm_add_ps(fjz3,tz);
396
397             }
398
399             /**************************
400              * CALCULATE INTERACTIONS *
401              **************************/
402
403             if (gmx_mm_any_lt(rsq21,rcutoff2))
404             {
405
406             /* REACTION-FIELD ELECTROSTATICS */
407             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
408             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
409
410             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
411
412             /* Update potential sum for this i atom from the interaction with this j atom. */
413             velec            = _mm_and_ps(velec,cutoff_mask);
414             velecsum         = _mm_add_ps(velecsum,velec);
415
416             fscal            = felec;
417
418             fscal            = _mm_and_ps(fscal,cutoff_mask);
419
420             /* Calculate temporary vectorial force */
421             tx               = _mm_mul_ps(fscal,dx21);
422             ty               = _mm_mul_ps(fscal,dy21);
423             tz               = _mm_mul_ps(fscal,dz21);
424
425             /* Update vectorial force */
426             fix2             = _mm_add_ps(fix2,tx);
427             fiy2             = _mm_add_ps(fiy2,ty);
428             fiz2             = _mm_add_ps(fiz2,tz);
429
430             fjx1             = _mm_add_ps(fjx1,tx);
431             fjy1             = _mm_add_ps(fjy1,ty);
432             fjz1             = _mm_add_ps(fjz1,tz);
433
434             }
435
436             /**************************
437              * CALCULATE INTERACTIONS *
438              **************************/
439
440             if (gmx_mm_any_lt(rsq22,rcutoff2))
441             {
442
443             /* REACTION-FIELD ELECTROSTATICS */
444             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
445             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
446
447             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
448
449             /* Update potential sum for this i atom from the interaction with this j atom. */
450             velec            = _mm_and_ps(velec,cutoff_mask);
451             velecsum         = _mm_add_ps(velecsum,velec);
452
453             fscal            = felec;
454
455             fscal            = _mm_and_ps(fscal,cutoff_mask);
456
457             /* Calculate temporary vectorial force */
458             tx               = _mm_mul_ps(fscal,dx22);
459             ty               = _mm_mul_ps(fscal,dy22);
460             tz               = _mm_mul_ps(fscal,dz22);
461
462             /* Update vectorial force */
463             fix2             = _mm_add_ps(fix2,tx);
464             fiy2             = _mm_add_ps(fiy2,ty);
465             fiz2             = _mm_add_ps(fiz2,tz);
466
467             fjx2             = _mm_add_ps(fjx2,tx);
468             fjy2             = _mm_add_ps(fjy2,ty);
469             fjz2             = _mm_add_ps(fjz2,tz);
470
471             }
472
473             /**************************
474              * CALCULATE INTERACTIONS *
475              **************************/
476
477             if (gmx_mm_any_lt(rsq23,rcutoff2))
478             {
479
480             /* REACTION-FIELD ELECTROSTATICS */
481             velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
482             felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
483
484             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
485
486             /* Update potential sum for this i atom from the interaction with this j atom. */
487             velec            = _mm_and_ps(velec,cutoff_mask);
488             velecsum         = _mm_add_ps(velecsum,velec);
489
490             fscal            = felec;
491
492             fscal            = _mm_and_ps(fscal,cutoff_mask);
493
494             /* Calculate temporary vectorial force */
495             tx               = _mm_mul_ps(fscal,dx23);
496             ty               = _mm_mul_ps(fscal,dy23);
497             tz               = _mm_mul_ps(fscal,dz23);
498
499             /* Update vectorial force */
500             fix2             = _mm_add_ps(fix2,tx);
501             fiy2             = _mm_add_ps(fiy2,ty);
502             fiz2             = _mm_add_ps(fiz2,tz);
503
504             fjx3             = _mm_add_ps(fjx3,tx);
505             fjy3             = _mm_add_ps(fjy3,ty);
506             fjz3             = _mm_add_ps(fjz3,tz);
507
508             }
509
510             /**************************
511              * CALCULATE INTERACTIONS *
512              **************************/
513
514             if (gmx_mm_any_lt(rsq31,rcutoff2))
515             {
516
517             /* REACTION-FIELD ELECTROSTATICS */
518             velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
519             felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
520
521             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
522
523             /* Update potential sum for this i atom from the interaction with this j atom. */
524             velec            = _mm_and_ps(velec,cutoff_mask);
525             velecsum         = _mm_add_ps(velecsum,velec);
526
527             fscal            = felec;
528
529             fscal            = _mm_and_ps(fscal,cutoff_mask);
530
531             /* Calculate temporary vectorial force */
532             tx               = _mm_mul_ps(fscal,dx31);
533             ty               = _mm_mul_ps(fscal,dy31);
534             tz               = _mm_mul_ps(fscal,dz31);
535
536             /* Update vectorial force */
537             fix3             = _mm_add_ps(fix3,tx);
538             fiy3             = _mm_add_ps(fiy3,ty);
539             fiz3             = _mm_add_ps(fiz3,tz);
540
541             fjx1             = _mm_add_ps(fjx1,tx);
542             fjy1             = _mm_add_ps(fjy1,ty);
543             fjz1             = _mm_add_ps(fjz1,tz);
544
545             }
546
547             /**************************
548              * CALCULATE INTERACTIONS *
549              **************************/
550
551             if (gmx_mm_any_lt(rsq32,rcutoff2))
552             {
553
554             /* REACTION-FIELD ELECTROSTATICS */
555             velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
556             felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
557
558             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
559
560             /* Update potential sum for this i atom from the interaction with this j atom. */
561             velec            = _mm_and_ps(velec,cutoff_mask);
562             velecsum         = _mm_add_ps(velecsum,velec);
563
564             fscal            = felec;
565
566             fscal            = _mm_and_ps(fscal,cutoff_mask);
567
568             /* Calculate temporary vectorial force */
569             tx               = _mm_mul_ps(fscal,dx32);
570             ty               = _mm_mul_ps(fscal,dy32);
571             tz               = _mm_mul_ps(fscal,dz32);
572
573             /* Update vectorial force */
574             fix3             = _mm_add_ps(fix3,tx);
575             fiy3             = _mm_add_ps(fiy3,ty);
576             fiz3             = _mm_add_ps(fiz3,tz);
577
578             fjx2             = _mm_add_ps(fjx2,tx);
579             fjy2             = _mm_add_ps(fjy2,ty);
580             fjz2             = _mm_add_ps(fjz2,tz);
581
582             }
583
584             /**************************
585              * CALCULATE INTERACTIONS *
586              **************************/
587
588             if (gmx_mm_any_lt(rsq33,rcutoff2))
589             {
590
591             /* REACTION-FIELD ELECTROSTATICS */
592             velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
593             felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
594
595             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
596
597             /* Update potential sum for this i atom from the interaction with this j atom. */
598             velec            = _mm_and_ps(velec,cutoff_mask);
599             velecsum         = _mm_add_ps(velecsum,velec);
600
601             fscal            = felec;
602
603             fscal            = _mm_and_ps(fscal,cutoff_mask);
604
605             /* Calculate temporary vectorial force */
606             tx               = _mm_mul_ps(fscal,dx33);
607             ty               = _mm_mul_ps(fscal,dy33);
608             tz               = _mm_mul_ps(fscal,dz33);
609
610             /* Update vectorial force */
611             fix3             = _mm_add_ps(fix3,tx);
612             fiy3             = _mm_add_ps(fiy3,ty);
613             fiz3             = _mm_add_ps(fiz3,tz);
614
615             fjx3             = _mm_add_ps(fjx3,tx);
616             fjy3             = _mm_add_ps(fjy3,ty);
617             fjz3             = _mm_add_ps(fjz3,tz);
618
619             }
620
621             fjptrA             = f+j_coord_offsetA;
622             fjptrB             = f+j_coord_offsetB;
623             fjptrC             = f+j_coord_offsetC;
624             fjptrD             = f+j_coord_offsetD;
625
626             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
627                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
628
629             /* Inner loop uses 324 flops */
630         }
631
632         if(jidx<j_index_end)
633         {
634
635             /* Get j neighbor index, and coordinate index */
636             jnrlistA         = jjnr[jidx];
637             jnrlistB         = jjnr[jidx+1];
638             jnrlistC         = jjnr[jidx+2];
639             jnrlistD         = jjnr[jidx+3];
640             /* Sign of each element will be negative for non-real atoms.
641              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
642              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
643              */
644             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
645             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
646             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
647             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
648             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
649             j_coord_offsetA  = DIM*jnrA;
650             j_coord_offsetB  = DIM*jnrB;
651             j_coord_offsetC  = DIM*jnrC;
652             j_coord_offsetD  = DIM*jnrD;
653
654             /* load j atom coordinates */
655             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
656                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
657                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
658
659             /* Calculate displacement vector */
660             dx11             = _mm_sub_ps(ix1,jx1);
661             dy11             = _mm_sub_ps(iy1,jy1);
662             dz11             = _mm_sub_ps(iz1,jz1);
663             dx12             = _mm_sub_ps(ix1,jx2);
664             dy12             = _mm_sub_ps(iy1,jy2);
665             dz12             = _mm_sub_ps(iz1,jz2);
666             dx13             = _mm_sub_ps(ix1,jx3);
667             dy13             = _mm_sub_ps(iy1,jy3);
668             dz13             = _mm_sub_ps(iz1,jz3);
669             dx21             = _mm_sub_ps(ix2,jx1);
670             dy21             = _mm_sub_ps(iy2,jy1);
671             dz21             = _mm_sub_ps(iz2,jz1);
672             dx22             = _mm_sub_ps(ix2,jx2);
673             dy22             = _mm_sub_ps(iy2,jy2);
674             dz22             = _mm_sub_ps(iz2,jz2);
675             dx23             = _mm_sub_ps(ix2,jx3);
676             dy23             = _mm_sub_ps(iy2,jy3);
677             dz23             = _mm_sub_ps(iz2,jz3);
678             dx31             = _mm_sub_ps(ix3,jx1);
679             dy31             = _mm_sub_ps(iy3,jy1);
680             dz31             = _mm_sub_ps(iz3,jz1);
681             dx32             = _mm_sub_ps(ix3,jx2);
682             dy32             = _mm_sub_ps(iy3,jy2);
683             dz32             = _mm_sub_ps(iz3,jz2);
684             dx33             = _mm_sub_ps(ix3,jx3);
685             dy33             = _mm_sub_ps(iy3,jy3);
686             dz33             = _mm_sub_ps(iz3,jz3);
687
688             /* Calculate squared distance and things based on it */
689             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
690             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
691             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
692             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
693             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
694             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
695             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
696             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
697             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
698
699             rinv11           = gmx_mm_invsqrt_ps(rsq11);
700             rinv12           = gmx_mm_invsqrt_ps(rsq12);
701             rinv13           = gmx_mm_invsqrt_ps(rsq13);
702             rinv21           = gmx_mm_invsqrt_ps(rsq21);
703             rinv22           = gmx_mm_invsqrt_ps(rsq22);
704             rinv23           = gmx_mm_invsqrt_ps(rsq23);
705             rinv31           = gmx_mm_invsqrt_ps(rsq31);
706             rinv32           = gmx_mm_invsqrt_ps(rsq32);
707             rinv33           = gmx_mm_invsqrt_ps(rsq33);
708
709             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
710             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
711             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
712             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
713             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
714             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
715             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
716             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
717             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
718
719             fjx1             = _mm_setzero_ps();
720             fjy1             = _mm_setzero_ps();
721             fjz1             = _mm_setzero_ps();
722             fjx2             = _mm_setzero_ps();
723             fjy2             = _mm_setzero_ps();
724             fjz2             = _mm_setzero_ps();
725             fjx3             = _mm_setzero_ps();
726             fjy3             = _mm_setzero_ps();
727             fjz3             = _mm_setzero_ps();
728
729             /**************************
730              * CALCULATE INTERACTIONS *
731              **************************/
732
733             if (gmx_mm_any_lt(rsq11,rcutoff2))
734             {
735
736             /* REACTION-FIELD ELECTROSTATICS */
737             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
738             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
739
740             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
741
742             /* Update potential sum for this i atom from the interaction with this j atom. */
743             velec            = _mm_and_ps(velec,cutoff_mask);
744             velec            = _mm_andnot_ps(dummy_mask,velec);
745             velecsum         = _mm_add_ps(velecsum,velec);
746
747             fscal            = felec;
748
749             fscal            = _mm_and_ps(fscal,cutoff_mask);
750
751             fscal            = _mm_andnot_ps(dummy_mask,fscal);
752
753             /* Calculate temporary vectorial force */
754             tx               = _mm_mul_ps(fscal,dx11);
755             ty               = _mm_mul_ps(fscal,dy11);
756             tz               = _mm_mul_ps(fscal,dz11);
757
758             /* Update vectorial force */
759             fix1             = _mm_add_ps(fix1,tx);
760             fiy1             = _mm_add_ps(fiy1,ty);
761             fiz1             = _mm_add_ps(fiz1,tz);
762
763             fjx1             = _mm_add_ps(fjx1,tx);
764             fjy1             = _mm_add_ps(fjy1,ty);
765             fjz1             = _mm_add_ps(fjz1,tz);
766
767             }
768
769             /**************************
770              * CALCULATE INTERACTIONS *
771              **************************/
772
773             if (gmx_mm_any_lt(rsq12,rcutoff2))
774             {
775
776             /* REACTION-FIELD ELECTROSTATICS */
777             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
778             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
779
780             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
781
782             /* Update potential sum for this i atom from the interaction with this j atom. */
783             velec            = _mm_and_ps(velec,cutoff_mask);
784             velec            = _mm_andnot_ps(dummy_mask,velec);
785             velecsum         = _mm_add_ps(velecsum,velec);
786
787             fscal            = felec;
788
789             fscal            = _mm_and_ps(fscal,cutoff_mask);
790
791             fscal            = _mm_andnot_ps(dummy_mask,fscal);
792
793             /* Calculate temporary vectorial force */
794             tx               = _mm_mul_ps(fscal,dx12);
795             ty               = _mm_mul_ps(fscal,dy12);
796             tz               = _mm_mul_ps(fscal,dz12);
797
798             /* Update vectorial force */
799             fix1             = _mm_add_ps(fix1,tx);
800             fiy1             = _mm_add_ps(fiy1,ty);
801             fiz1             = _mm_add_ps(fiz1,tz);
802
803             fjx2             = _mm_add_ps(fjx2,tx);
804             fjy2             = _mm_add_ps(fjy2,ty);
805             fjz2             = _mm_add_ps(fjz2,tz);
806
807             }
808
809             /**************************
810              * CALCULATE INTERACTIONS *
811              **************************/
812
813             if (gmx_mm_any_lt(rsq13,rcutoff2))
814             {
815
816             /* REACTION-FIELD ELECTROSTATICS */
817             velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
818             felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
819
820             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
821
822             /* Update potential sum for this i atom from the interaction with this j atom. */
823             velec            = _mm_and_ps(velec,cutoff_mask);
824             velec            = _mm_andnot_ps(dummy_mask,velec);
825             velecsum         = _mm_add_ps(velecsum,velec);
826
827             fscal            = felec;
828
829             fscal            = _mm_and_ps(fscal,cutoff_mask);
830
831             fscal            = _mm_andnot_ps(dummy_mask,fscal);
832
833             /* Calculate temporary vectorial force */
834             tx               = _mm_mul_ps(fscal,dx13);
835             ty               = _mm_mul_ps(fscal,dy13);
836             tz               = _mm_mul_ps(fscal,dz13);
837
838             /* Update vectorial force */
839             fix1             = _mm_add_ps(fix1,tx);
840             fiy1             = _mm_add_ps(fiy1,ty);
841             fiz1             = _mm_add_ps(fiz1,tz);
842
843             fjx3             = _mm_add_ps(fjx3,tx);
844             fjy3             = _mm_add_ps(fjy3,ty);
845             fjz3             = _mm_add_ps(fjz3,tz);
846
847             }
848
849             /**************************
850              * CALCULATE INTERACTIONS *
851              **************************/
852
853             if (gmx_mm_any_lt(rsq21,rcutoff2))
854             {
855
856             /* REACTION-FIELD ELECTROSTATICS */
857             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
858             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
859
860             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
861
862             /* Update potential sum for this i atom from the interaction with this j atom. */
863             velec            = _mm_and_ps(velec,cutoff_mask);
864             velec            = _mm_andnot_ps(dummy_mask,velec);
865             velecsum         = _mm_add_ps(velecsum,velec);
866
867             fscal            = felec;
868
869             fscal            = _mm_and_ps(fscal,cutoff_mask);
870
871             fscal            = _mm_andnot_ps(dummy_mask,fscal);
872
873             /* Calculate temporary vectorial force */
874             tx               = _mm_mul_ps(fscal,dx21);
875             ty               = _mm_mul_ps(fscal,dy21);
876             tz               = _mm_mul_ps(fscal,dz21);
877
878             /* Update vectorial force */
879             fix2             = _mm_add_ps(fix2,tx);
880             fiy2             = _mm_add_ps(fiy2,ty);
881             fiz2             = _mm_add_ps(fiz2,tz);
882
883             fjx1             = _mm_add_ps(fjx1,tx);
884             fjy1             = _mm_add_ps(fjy1,ty);
885             fjz1             = _mm_add_ps(fjz1,tz);
886
887             }
888
889             /**************************
890              * CALCULATE INTERACTIONS *
891              **************************/
892
893             if (gmx_mm_any_lt(rsq22,rcutoff2))
894             {
895
896             /* REACTION-FIELD ELECTROSTATICS */
897             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
898             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
899
900             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
901
902             /* Update potential sum for this i atom from the interaction with this j atom. */
903             velec            = _mm_and_ps(velec,cutoff_mask);
904             velec            = _mm_andnot_ps(dummy_mask,velec);
905             velecsum         = _mm_add_ps(velecsum,velec);
906
907             fscal            = felec;
908
909             fscal            = _mm_and_ps(fscal,cutoff_mask);
910
911             fscal            = _mm_andnot_ps(dummy_mask,fscal);
912
913             /* Calculate temporary vectorial force */
914             tx               = _mm_mul_ps(fscal,dx22);
915             ty               = _mm_mul_ps(fscal,dy22);
916             tz               = _mm_mul_ps(fscal,dz22);
917
918             /* Update vectorial force */
919             fix2             = _mm_add_ps(fix2,tx);
920             fiy2             = _mm_add_ps(fiy2,ty);
921             fiz2             = _mm_add_ps(fiz2,tz);
922
923             fjx2             = _mm_add_ps(fjx2,tx);
924             fjy2             = _mm_add_ps(fjy2,ty);
925             fjz2             = _mm_add_ps(fjz2,tz);
926
927             }
928
929             /**************************
930              * CALCULATE INTERACTIONS *
931              **************************/
932
933             if (gmx_mm_any_lt(rsq23,rcutoff2))
934             {
935
936             /* REACTION-FIELD ELECTROSTATICS */
937             velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
938             felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
939
940             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
941
942             /* Update potential sum for this i atom from the interaction with this j atom. */
943             velec            = _mm_and_ps(velec,cutoff_mask);
944             velec            = _mm_andnot_ps(dummy_mask,velec);
945             velecsum         = _mm_add_ps(velecsum,velec);
946
947             fscal            = felec;
948
949             fscal            = _mm_and_ps(fscal,cutoff_mask);
950
951             fscal            = _mm_andnot_ps(dummy_mask,fscal);
952
953             /* Calculate temporary vectorial force */
954             tx               = _mm_mul_ps(fscal,dx23);
955             ty               = _mm_mul_ps(fscal,dy23);
956             tz               = _mm_mul_ps(fscal,dz23);
957
958             /* Update vectorial force */
959             fix2             = _mm_add_ps(fix2,tx);
960             fiy2             = _mm_add_ps(fiy2,ty);
961             fiz2             = _mm_add_ps(fiz2,tz);
962
963             fjx3             = _mm_add_ps(fjx3,tx);
964             fjy3             = _mm_add_ps(fjy3,ty);
965             fjz3             = _mm_add_ps(fjz3,tz);
966
967             }
968
969             /**************************
970              * CALCULATE INTERACTIONS *
971              **************************/
972
973             if (gmx_mm_any_lt(rsq31,rcutoff2))
974             {
975
976             /* REACTION-FIELD ELECTROSTATICS */
977             velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
978             felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
979
980             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
981
982             /* Update potential sum for this i atom from the interaction with this j atom. */
983             velec            = _mm_and_ps(velec,cutoff_mask);
984             velec            = _mm_andnot_ps(dummy_mask,velec);
985             velecsum         = _mm_add_ps(velecsum,velec);
986
987             fscal            = felec;
988
989             fscal            = _mm_and_ps(fscal,cutoff_mask);
990
991             fscal            = _mm_andnot_ps(dummy_mask,fscal);
992
993             /* Calculate temporary vectorial force */
994             tx               = _mm_mul_ps(fscal,dx31);
995             ty               = _mm_mul_ps(fscal,dy31);
996             tz               = _mm_mul_ps(fscal,dz31);
997
998             /* Update vectorial force */
999             fix3             = _mm_add_ps(fix3,tx);
1000             fiy3             = _mm_add_ps(fiy3,ty);
1001             fiz3             = _mm_add_ps(fiz3,tz);
1002
1003             fjx1             = _mm_add_ps(fjx1,tx);
1004             fjy1             = _mm_add_ps(fjy1,ty);
1005             fjz1             = _mm_add_ps(fjz1,tz);
1006
1007             }
1008
1009             /**************************
1010              * CALCULATE INTERACTIONS *
1011              **************************/
1012
1013             if (gmx_mm_any_lt(rsq32,rcutoff2))
1014             {
1015
1016             /* REACTION-FIELD ELECTROSTATICS */
1017             velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
1018             felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
1019
1020             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
1021
1022             /* Update potential sum for this i atom from the interaction with this j atom. */
1023             velec            = _mm_and_ps(velec,cutoff_mask);
1024             velec            = _mm_andnot_ps(dummy_mask,velec);
1025             velecsum         = _mm_add_ps(velecsum,velec);
1026
1027             fscal            = felec;
1028
1029             fscal            = _mm_and_ps(fscal,cutoff_mask);
1030
1031             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1032
1033             /* Calculate temporary vectorial force */
1034             tx               = _mm_mul_ps(fscal,dx32);
1035             ty               = _mm_mul_ps(fscal,dy32);
1036             tz               = _mm_mul_ps(fscal,dz32);
1037
1038             /* Update vectorial force */
1039             fix3             = _mm_add_ps(fix3,tx);
1040             fiy3             = _mm_add_ps(fiy3,ty);
1041             fiz3             = _mm_add_ps(fiz3,tz);
1042
1043             fjx2             = _mm_add_ps(fjx2,tx);
1044             fjy2             = _mm_add_ps(fjy2,ty);
1045             fjz2             = _mm_add_ps(fjz2,tz);
1046
1047             }
1048
1049             /**************************
1050              * CALCULATE INTERACTIONS *
1051              **************************/
1052
1053             if (gmx_mm_any_lt(rsq33,rcutoff2))
1054             {
1055
1056             /* REACTION-FIELD ELECTROSTATICS */
1057             velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
1058             felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
1059
1060             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
1061
1062             /* Update potential sum for this i atom from the interaction with this j atom. */
1063             velec            = _mm_and_ps(velec,cutoff_mask);
1064             velec            = _mm_andnot_ps(dummy_mask,velec);
1065             velecsum         = _mm_add_ps(velecsum,velec);
1066
1067             fscal            = felec;
1068
1069             fscal            = _mm_and_ps(fscal,cutoff_mask);
1070
1071             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1072
1073             /* Calculate temporary vectorial force */
1074             tx               = _mm_mul_ps(fscal,dx33);
1075             ty               = _mm_mul_ps(fscal,dy33);
1076             tz               = _mm_mul_ps(fscal,dz33);
1077
1078             /* Update vectorial force */
1079             fix3             = _mm_add_ps(fix3,tx);
1080             fiy3             = _mm_add_ps(fiy3,ty);
1081             fiz3             = _mm_add_ps(fiz3,tz);
1082
1083             fjx3             = _mm_add_ps(fjx3,tx);
1084             fjy3             = _mm_add_ps(fjy3,ty);
1085             fjz3             = _mm_add_ps(fjz3,tz);
1086
1087             }
1088
1089             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1090             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1091             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1092             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1093
1094             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1095                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1096
1097             /* Inner loop uses 324 flops */
1098         }
1099
1100         /* End of innermost loop */
1101
1102         gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1103                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
1104
1105         ggid                        = gid[iidx];
1106         /* Update potential energies */
1107         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1108
1109         /* Increment number of inner iterations */
1110         inneriter                  += j_index_end - j_index_start;
1111
1112         /* Outer loop uses 19 flops */
1113     }
1114
1115     /* Increment number of outer iterations */
1116     outeriter        += nri;
1117
1118     /* Update outer/inner flops */
1119
1120     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*324);
1121 }
1122 /*
1123  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sse4_1_single
1124  * Electrostatics interaction: ReactionField
1125  * VdW interaction:            None
1126  * Geometry:                   Water4-Water4
1127  * Calculate force/pot:        Force
1128  */
1129 void
1130 nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sse4_1_single
1131                     (t_nblist                    * gmx_restrict       nlist,
1132                      rvec                        * gmx_restrict          xx,
1133                      rvec                        * gmx_restrict          ff,
1134                      t_forcerec                  * gmx_restrict          fr,
1135                      t_mdatoms                   * gmx_restrict     mdatoms,
1136                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1137                      t_nrnb                      * gmx_restrict        nrnb)
1138 {
1139     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1140      * just 0 for non-waters.
1141      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1142      * jnr indices corresponding to data put in the four positions in the SIMD register.
1143      */
1144     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1145     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1146     int              jnrA,jnrB,jnrC,jnrD;
1147     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1148     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1149     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1150     real             rcutoff_scalar;
1151     real             *shiftvec,*fshift,*x,*f;
1152     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1153     real             scratch[4*DIM];
1154     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1155     int              vdwioffset1;
1156     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1157     int              vdwioffset2;
1158     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1159     int              vdwioffset3;
1160     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1161     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1162     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1163     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1164     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1165     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1166     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1167     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1168     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1169     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1170     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1171     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1172     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1173     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1174     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1175     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1176     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1177     real             *charge;
1178     __m128           dummy_mask,cutoff_mask;
1179     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1180     __m128           one     = _mm_set1_ps(1.0);
1181     __m128           two     = _mm_set1_ps(2.0);
1182     x                = xx[0];
1183     f                = ff[0];
1184
1185     nri              = nlist->nri;
1186     iinr             = nlist->iinr;
1187     jindex           = nlist->jindex;
1188     jjnr             = nlist->jjnr;
1189     shiftidx         = nlist->shift;
1190     gid              = nlist->gid;
1191     shiftvec         = fr->shift_vec[0];
1192     fshift           = fr->fshift[0];
1193     facel            = _mm_set1_ps(fr->epsfac);
1194     charge           = mdatoms->chargeA;
1195     krf              = _mm_set1_ps(fr->ic->k_rf);
1196     krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
1197     crf              = _mm_set1_ps(fr->ic->c_rf);
1198
1199     /* Setup water-specific parameters */
1200     inr              = nlist->iinr[0];
1201     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1202     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1203     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1204
1205     jq1              = _mm_set1_ps(charge[inr+1]);
1206     jq2              = _mm_set1_ps(charge[inr+2]);
1207     jq3              = _mm_set1_ps(charge[inr+3]);
1208     qq11             = _mm_mul_ps(iq1,jq1);
1209     qq12             = _mm_mul_ps(iq1,jq2);
1210     qq13             = _mm_mul_ps(iq1,jq3);
1211     qq21             = _mm_mul_ps(iq2,jq1);
1212     qq22             = _mm_mul_ps(iq2,jq2);
1213     qq23             = _mm_mul_ps(iq2,jq3);
1214     qq31             = _mm_mul_ps(iq3,jq1);
1215     qq32             = _mm_mul_ps(iq3,jq2);
1216     qq33             = _mm_mul_ps(iq3,jq3);
1217
1218     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1219     rcutoff_scalar   = fr->rcoulomb;
1220     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1221     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1222
1223     /* Avoid stupid compiler warnings */
1224     jnrA = jnrB = jnrC = jnrD = 0;
1225     j_coord_offsetA = 0;
1226     j_coord_offsetB = 0;
1227     j_coord_offsetC = 0;
1228     j_coord_offsetD = 0;
1229
1230     outeriter        = 0;
1231     inneriter        = 0;
1232
1233     for(iidx=0;iidx<4*DIM;iidx++)
1234     {
1235         scratch[iidx] = 0.0;
1236     }
1237
1238     /* Start outer loop over neighborlists */
1239     for(iidx=0; iidx<nri; iidx++)
1240     {
1241         /* Load shift vector for this list */
1242         i_shift_offset   = DIM*shiftidx[iidx];
1243
1244         /* Load limits for loop over neighbors */
1245         j_index_start    = jindex[iidx];
1246         j_index_end      = jindex[iidx+1];
1247
1248         /* Get outer coordinate index */
1249         inr              = iinr[iidx];
1250         i_coord_offset   = DIM*inr;
1251
1252         /* Load i particle coords and add shift vector */
1253         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1254                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1255
1256         fix1             = _mm_setzero_ps();
1257         fiy1             = _mm_setzero_ps();
1258         fiz1             = _mm_setzero_ps();
1259         fix2             = _mm_setzero_ps();
1260         fiy2             = _mm_setzero_ps();
1261         fiz2             = _mm_setzero_ps();
1262         fix3             = _mm_setzero_ps();
1263         fiy3             = _mm_setzero_ps();
1264         fiz3             = _mm_setzero_ps();
1265
1266         /* Start inner kernel loop */
1267         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1268         {
1269
1270             /* Get j neighbor index, and coordinate index */
1271             jnrA             = jjnr[jidx];
1272             jnrB             = jjnr[jidx+1];
1273             jnrC             = jjnr[jidx+2];
1274             jnrD             = jjnr[jidx+3];
1275             j_coord_offsetA  = DIM*jnrA;
1276             j_coord_offsetB  = DIM*jnrB;
1277             j_coord_offsetC  = DIM*jnrC;
1278             j_coord_offsetD  = DIM*jnrD;
1279
1280             /* load j atom coordinates */
1281             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1282                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1283                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1284
1285             /* Calculate displacement vector */
1286             dx11             = _mm_sub_ps(ix1,jx1);
1287             dy11             = _mm_sub_ps(iy1,jy1);
1288             dz11             = _mm_sub_ps(iz1,jz1);
1289             dx12             = _mm_sub_ps(ix1,jx2);
1290             dy12             = _mm_sub_ps(iy1,jy2);
1291             dz12             = _mm_sub_ps(iz1,jz2);
1292             dx13             = _mm_sub_ps(ix1,jx3);
1293             dy13             = _mm_sub_ps(iy1,jy3);
1294             dz13             = _mm_sub_ps(iz1,jz3);
1295             dx21             = _mm_sub_ps(ix2,jx1);
1296             dy21             = _mm_sub_ps(iy2,jy1);
1297             dz21             = _mm_sub_ps(iz2,jz1);
1298             dx22             = _mm_sub_ps(ix2,jx2);
1299             dy22             = _mm_sub_ps(iy2,jy2);
1300             dz22             = _mm_sub_ps(iz2,jz2);
1301             dx23             = _mm_sub_ps(ix2,jx3);
1302             dy23             = _mm_sub_ps(iy2,jy3);
1303             dz23             = _mm_sub_ps(iz2,jz3);
1304             dx31             = _mm_sub_ps(ix3,jx1);
1305             dy31             = _mm_sub_ps(iy3,jy1);
1306             dz31             = _mm_sub_ps(iz3,jz1);
1307             dx32             = _mm_sub_ps(ix3,jx2);
1308             dy32             = _mm_sub_ps(iy3,jy2);
1309             dz32             = _mm_sub_ps(iz3,jz2);
1310             dx33             = _mm_sub_ps(ix3,jx3);
1311             dy33             = _mm_sub_ps(iy3,jy3);
1312             dz33             = _mm_sub_ps(iz3,jz3);
1313
1314             /* Calculate squared distance and things based on it */
1315             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1316             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1317             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1318             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1319             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1320             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1321             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1322             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1323             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1324
1325             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1326             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1327             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1328             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1329             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1330             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1331             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1332             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1333             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1334
1335             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1336             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1337             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1338             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1339             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1340             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1341             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1342             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1343             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1344
1345             fjx1             = _mm_setzero_ps();
1346             fjy1             = _mm_setzero_ps();
1347             fjz1             = _mm_setzero_ps();
1348             fjx2             = _mm_setzero_ps();
1349             fjy2             = _mm_setzero_ps();
1350             fjz2             = _mm_setzero_ps();
1351             fjx3             = _mm_setzero_ps();
1352             fjy3             = _mm_setzero_ps();
1353             fjz3             = _mm_setzero_ps();
1354
1355             /**************************
1356              * CALCULATE INTERACTIONS *
1357              **************************/
1358
1359             if (gmx_mm_any_lt(rsq11,rcutoff2))
1360             {
1361
1362             /* REACTION-FIELD ELECTROSTATICS */
1363             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1364
1365             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1366
1367             fscal            = felec;
1368
1369             fscal            = _mm_and_ps(fscal,cutoff_mask);
1370
1371             /* Calculate temporary vectorial force */
1372             tx               = _mm_mul_ps(fscal,dx11);
1373             ty               = _mm_mul_ps(fscal,dy11);
1374             tz               = _mm_mul_ps(fscal,dz11);
1375
1376             /* Update vectorial force */
1377             fix1             = _mm_add_ps(fix1,tx);
1378             fiy1             = _mm_add_ps(fiy1,ty);
1379             fiz1             = _mm_add_ps(fiz1,tz);
1380
1381             fjx1             = _mm_add_ps(fjx1,tx);
1382             fjy1             = _mm_add_ps(fjy1,ty);
1383             fjz1             = _mm_add_ps(fjz1,tz);
1384
1385             }
1386
1387             /**************************
1388              * CALCULATE INTERACTIONS *
1389              **************************/
1390
1391             if (gmx_mm_any_lt(rsq12,rcutoff2))
1392             {
1393
1394             /* REACTION-FIELD ELECTROSTATICS */
1395             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1396
1397             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1398
1399             fscal            = felec;
1400
1401             fscal            = _mm_and_ps(fscal,cutoff_mask);
1402
1403             /* Calculate temporary vectorial force */
1404             tx               = _mm_mul_ps(fscal,dx12);
1405             ty               = _mm_mul_ps(fscal,dy12);
1406             tz               = _mm_mul_ps(fscal,dz12);
1407
1408             /* Update vectorial force */
1409             fix1             = _mm_add_ps(fix1,tx);
1410             fiy1             = _mm_add_ps(fiy1,ty);
1411             fiz1             = _mm_add_ps(fiz1,tz);
1412
1413             fjx2             = _mm_add_ps(fjx2,tx);
1414             fjy2             = _mm_add_ps(fjy2,ty);
1415             fjz2             = _mm_add_ps(fjz2,tz);
1416
1417             }
1418
1419             /**************************
1420              * CALCULATE INTERACTIONS *
1421              **************************/
1422
1423             if (gmx_mm_any_lt(rsq13,rcutoff2))
1424             {
1425
1426             /* REACTION-FIELD ELECTROSTATICS */
1427             felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
1428
1429             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
1430
1431             fscal            = felec;
1432
1433             fscal            = _mm_and_ps(fscal,cutoff_mask);
1434
1435             /* Calculate temporary vectorial force */
1436             tx               = _mm_mul_ps(fscal,dx13);
1437             ty               = _mm_mul_ps(fscal,dy13);
1438             tz               = _mm_mul_ps(fscal,dz13);
1439
1440             /* Update vectorial force */
1441             fix1             = _mm_add_ps(fix1,tx);
1442             fiy1             = _mm_add_ps(fiy1,ty);
1443             fiz1             = _mm_add_ps(fiz1,tz);
1444
1445             fjx3             = _mm_add_ps(fjx3,tx);
1446             fjy3             = _mm_add_ps(fjy3,ty);
1447             fjz3             = _mm_add_ps(fjz3,tz);
1448
1449             }
1450
1451             /**************************
1452              * CALCULATE INTERACTIONS *
1453              **************************/
1454
1455             if (gmx_mm_any_lt(rsq21,rcutoff2))
1456             {
1457
1458             /* REACTION-FIELD ELECTROSTATICS */
1459             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1460
1461             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1462
1463             fscal            = felec;
1464
1465             fscal            = _mm_and_ps(fscal,cutoff_mask);
1466
1467             /* Calculate temporary vectorial force */
1468             tx               = _mm_mul_ps(fscal,dx21);
1469             ty               = _mm_mul_ps(fscal,dy21);
1470             tz               = _mm_mul_ps(fscal,dz21);
1471
1472             /* Update vectorial force */
1473             fix2             = _mm_add_ps(fix2,tx);
1474             fiy2             = _mm_add_ps(fiy2,ty);
1475             fiz2             = _mm_add_ps(fiz2,tz);
1476
1477             fjx1             = _mm_add_ps(fjx1,tx);
1478             fjy1             = _mm_add_ps(fjy1,ty);
1479             fjz1             = _mm_add_ps(fjz1,tz);
1480
1481             }
1482
1483             /**************************
1484              * CALCULATE INTERACTIONS *
1485              **************************/
1486
1487             if (gmx_mm_any_lt(rsq22,rcutoff2))
1488             {
1489
1490             /* REACTION-FIELD ELECTROSTATICS */
1491             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1492
1493             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1494
1495             fscal            = felec;
1496
1497             fscal            = _mm_and_ps(fscal,cutoff_mask);
1498
1499             /* Calculate temporary vectorial force */
1500             tx               = _mm_mul_ps(fscal,dx22);
1501             ty               = _mm_mul_ps(fscal,dy22);
1502             tz               = _mm_mul_ps(fscal,dz22);
1503
1504             /* Update vectorial force */
1505             fix2             = _mm_add_ps(fix2,tx);
1506             fiy2             = _mm_add_ps(fiy2,ty);
1507             fiz2             = _mm_add_ps(fiz2,tz);
1508
1509             fjx2             = _mm_add_ps(fjx2,tx);
1510             fjy2             = _mm_add_ps(fjy2,ty);
1511             fjz2             = _mm_add_ps(fjz2,tz);
1512
1513             }
1514
1515             /**************************
1516              * CALCULATE INTERACTIONS *
1517              **************************/
1518
1519             if (gmx_mm_any_lt(rsq23,rcutoff2))
1520             {
1521
1522             /* REACTION-FIELD ELECTROSTATICS */
1523             felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
1524
1525             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
1526
1527             fscal            = felec;
1528
1529             fscal            = _mm_and_ps(fscal,cutoff_mask);
1530
1531             /* Calculate temporary vectorial force */
1532             tx               = _mm_mul_ps(fscal,dx23);
1533             ty               = _mm_mul_ps(fscal,dy23);
1534             tz               = _mm_mul_ps(fscal,dz23);
1535
1536             /* Update vectorial force */
1537             fix2             = _mm_add_ps(fix2,tx);
1538             fiy2             = _mm_add_ps(fiy2,ty);
1539             fiz2             = _mm_add_ps(fiz2,tz);
1540
1541             fjx3             = _mm_add_ps(fjx3,tx);
1542             fjy3             = _mm_add_ps(fjy3,ty);
1543             fjz3             = _mm_add_ps(fjz3,tz);
1544
1545             }
1546
1547             /**************************
1548              * CALCULATE INTERACTIONS *
1549              **************************/
1550
1551             if (gmx_mm_any_lt(rsq31,rcutoff2))
1552             {
1553
1554             /* REACTION-FIELD ELECTROSTATICS */
1555             felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
1556
1557             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
1558
1559             fscal            = felec;
1560
1561             fscal            = _mm_and_ps(fscal,cutoff_mask);
1562
1563             /* Calculate temporary vectorial force */
1564             tx               = _mm_mul_ps(fscal,dx31);
1565             ty               = _mm_mul_ps(fscal,dy31);
1566             tz               = _mm_mul_ps(fscal,dz31);
1567
1568             /* Update vectorial force */
1569             fix3             = _mm_add_ps(fix3,tx);
1570             fiy3             = _mm_add_ps(fiy3,ty);
1571             fiz3             = _mm_add_ps(fiz3,tz);
1572
1573             fjx1             = _mm_add_ps(fjx1,tx);
1574             fjy1             = _mm_add_ps(fjy1,ty);
1575             fjz1             = _mm_add_ps(fjz1,tz);
1576
1577             }
1578
1579             /**************************
1580              * CALCULATE INTERACTIONS *
1581              **************************/
1582
1583             if (gmx_mm_any_lt(rsq32,rcutoff2))
1584             {
1585
1586             /* REACTION-FIELD ELECTROSTATICS */
1587             felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
1588
1589             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
1590
1591             fscal            = felec;
1592
1593             fscal            = _mm_and_ps(fscal,cutoff_mask);
1594
1595             /* Calculate temporary vectorial force */
1596             tx               = _mm_mul_ps(fscal,dx32);
1597             ty               = _mm_mul_ps(fscal,dy32);
1598             tz               = _mm_mul_ps(fscal,dz32);
1599
1600             /* Update vectorial force */
1601             fix3             = _mm_add_ps(fix3,tx);
1602             fiy3             = _mm_add_ps(fiy3,ty);
1603             fiz3             = _mm_add_ps(fiz3,tz);
1604
1605             fjx2             = _mm_add_ps(fjx2,tx);
1606             fjy2             = _mm_add_ps(fjy2,ty);
1607             fjz2             = _mm_add_ps(fjz2,tz);
1608
1609             }
1610
1611             /**************************
1612              * CALCULATE INTERACTIONS *
1613              **************************/
1614
1615             if (gmx_mm_any_lt(rsq33,rcutoff2))
1616             {
1617
1618             /* REACTION-FIELD ELECTROSTATICS */
1619             felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
1620
1621             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
1622
1623             fscal            = felec;
1624
1625             fscal            = _mm_and_ps(fscal,cutoff_mask);
1626
1627             /* Calculate temporary vectorial force */
1628             tx               = _mm_mul_ps(fscal,dx33);
1629             ty               = _mm_mul_ps(fscal,dy33);
1630             tz               = _mm_mul_ps(fscal,dz33);
1631
1632             /* Update vectorial force */
1633             fix3             = _mm_add_ps(fix3,tx);
1634             fiy3             = _mm_add_ps(fiy3,ty);
1635             fiz3             = _mm_add_ps(fiz3,tz);
1636
1637             fjx3             = _mm_add_ps(fjx3,tx);
1638             fjy3             = _mm_add_ps(fjy3,ty);
1639             fjz3             = _mm_add_ps(fjz3,tz);
1640
1641             }
1642
1643             fjptrA             = f+j_coord_offsetA;
1644             fjptrB             = f+j_coord_offsetB;
1645             fjptrC             = f+j_coord_offsetC;
1646             fjptrD             = f+j_coord_offsetD;
1647
1648             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1649                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1650
1651             /* Inner loop uses 270 flops */
1652         }
1653
1654         if(jidx<j_index_end)
1655         {
1656
1657             /* Get j neighbor index, and coordinate index */
1658             jnrlistA         = jjnr[jidx];
1659             jnrlistB         = jjnr[jidx+1];
1660             jnrlistC         = jjnr[jidx+2];
1661             jnrlistD         = jjnr[jidx+3];
1662             /* Sign of each element will be negative for non-real atoms.
1663              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1664              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1665              */
1666             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1667             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1668             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1669             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1670             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1671             j_coord_offsetA  = DIM*jnrA;
1672             j_coord_offsetB  = DIM*jnrB;
1673             j_coord_offsetC  = DIM*jnrC;
1674             j_coord_offsetD  = DIM*jnrD;
1675
1676             /* load j atom coordinates */
1677             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1678                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1679                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1680
1681             /* Calculate displacement vector */
1682             dx11             = _mm_sub_ps(ix1,jx1);
1683             dy11             = _mm_sub_ps(iy1,jy1);
1684             dz11             = _mm_sub_ps(iz1,jz1);
1685             dx12             = _mm_sub_ps(ix1,jx2);
1686             dy12             = _mm_sub_ps(iy1,jy2);
1687             dz12             = _mm_sub_ps(iz1,jz2);
1688             dx13             = _mm_sub_ps(ix1,jx3);
1689             dy13             = _mm_sub_ps(iy1,jy3);
1690             dz13             = _mm_sub_ps(iz1,jz3);
1691             dx21             = _mm_sub_ps(ix2,jx1);
1692             dy21             = _mm_sub_ps(iy2,jy1);
1693             dz21             = _mm_sub_ps(iz2,jz1);
1694             dx22             = _mm_sub_ps(ix2,jx2);
1695             dy22             = _mm_sub_ps(iy2,jy2);
1696             dz22             = _mm_sub_ps(iz2,jz2);
1697             dx23             = _mm_sub_ps(ix2,jx3);
1698             dy23             = _mm_sub_ps(iy2,jy3);
1699             dz23             = _mm_sub_ps(iz2,jz3);
1700             dx31             = _mm_sub_ps(ix3,jx1);
1701             dy31             = _mm_sub_ps(iy3,jy1);
1702             dz31             = _mm_sub_ps(iz3,jz1);
1703             dx32             = _mm_sub_ps(ix3,jx2);
1704             dy32             = _mm_sub_ps(iy3,jy2);
1705             dz32             = _mm_sub_ps(iz3,jz2);
1706             dx33             = _mm_sub_ps(ix3,jx3);
1707             dy33             = _mm_sub_ps(iy3,jy3);
1708             dz33             = _mm_sub_ps(iz3,jz3);
1709
1710             /* Calculate squared distance and things based on it */
1711             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1712             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1713             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1714             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1715             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1716             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1717             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1718             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1719             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1720
1721             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1722             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1723             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1724             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1725             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1726             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1727             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1728             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1729             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1730
1731             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1732             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1733             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1734             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1735             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1736             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1737             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1738             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1739             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1740
1741             fjx1             = _mm_setzero_ps();
1742             fjy1             = _mm_setzero_ps();
1743             fjz1             = _mm_setzero_ps();
1744             fjx2             = _mm_setzero_ps();
1745             fjy2             = _mm_setzero_ps();
1746             fjz2             = _mm_setzero_ps();
1747             fjx3             = _mm_setzero_ps();
1748             fjy3             = _mm_setzero_ps();
1749             fjz3             = _mm_setzero_ps();
1750
1751             /**************************
1752              * CALCULATE INTERACTIONS *
1753              **************************/
1754
1755             if (gmx_mm_any_lt(rsq11,rcutoff2))
1756             {
1757
1758             /* REACTION-FIELD ELECTROSTATICS */
1759             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1760
1761             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1762
1763             fscal            = felec;
1764
1765             fscal            = _mm_and_ps(fscal,cutoff_mask);
1766
1767             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1768
1769             /* Calculate temporary vectorial force */
1770             tx               = _mm_mul_ps(fscal,dx11);
1771             ty               = _mm_mul_ps(fscal,dy11);
1772             tz               = _mm_mul_ps(fscal,dz11);
1773
1774             /* Update vectorial force */
1775             fix1             = _mm_add_ps(fix1,tx);
1776             fiy1             = _mm_add_ps(fiy1,ty);
1777             fiz1             = _mm_add_ps(fiz1,tz);
1778
1779             fjx1             = _mm_add_ps(fjx1,tx);
1780             fjy1             = _mm_add_ps(fjy1,ty);
1781             fjz1             = _mm_add_ps(fjz1,tz);
1782
1783             }
1784
1785             /**************************
1786              * CALCULATE INTERACTIONS *
1787              **************************/
1788
1789             if (gmx_mm_any_lt(rsq12,rcutoff2))
1790             {
1791
1792             /* REACTION-FIELD ELECTROSTATICS */
1793             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1794
1795             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1796
1797             fscal            = felec;
1798
1799             fscal            = _mm_and_ps(fscal,cutoff_mask);
1800
1801             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1802
1803             /* Calculate temporary vectorial force */
1804             tx               = _mm_mul_ps(fscal,dx12);
1805             ty               = _mm_mul_ps(fscal,dy12);
1806             tz               = _mm_mul_ps(fscal,dz12);
1807
1808             /* Update vectorial force */
1809             fix1             = _mm_add_ps(fix1,tx);
1810             fiy1             = _mm_add_ps(fiy1,ty);
1811             fiz1             = _mm_add_ps(fiz1,tz);
1812
1813             fjx2             = _mm_add_ps(fjx2,tx);
1814             fjy2             = _mm_add_ps(fjy2,ty);
1815             fjz2             = _mm_add_ps(fjz2,tz);
1816
1817             }
1818
1819             /**************************
1820              * CALCULATE INTERACTIONS *
1821              **************************/
1822
1823             if (gmx_mm_any_lt(rsq13,rcutoff2))
1824             {
1825
1826             /* REACTION-FIELD ELECTROSTATICS */
1827             felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
1828
1829             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
1830
1831             fscal            = felec;
1832
1833             fscal            = _mm_and_ps(fscal,cutoff_mask);
1834
1835             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1836
1837             /* Calculate temporary vectorial force */
1838             tx               = _mm_mul_ps(fscal,dx13);
1839             ty               = _mm_mul_ps(fscal,dy13);
1840             tz               = _mm_mul_ps(fscal,dz13);
1841
1842             /* Update vectorial force */
1843             fix1             = _mm_add_ps(fix1,tx);
1844             fiy1             = _mm_add_ps(fiy1,ty);
1845             fiz1             = _mm_add_ps(fiz1,tz);
1846
1847             fjx3             = _mm_add_ps(fjx3,tx);
1848             fjy3             = _mm_add_ps(fjy3,ty);
1849             fjz3             = _mm_add_ps(fjz3,tz);
1850
1851             }
1852
1853             /**************************
1854              * CALCULATE INTERACTIONS *
1855              **************************/
1856
1857             if (gmx_mm_any_lt(rsq21,rcutoff2))
1858             {
1859
1860             /* REACTION-FIELD ELECTROSTATICS */
1861             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1862
1863             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1864
1865             fscal            = felec;
1866
1867             fscal            = _mm_and_ps(fscal,cutoff_mask);
1868
1869             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1870
1871             /* Calculate temporary vectorial force */
1872             tx               = _mm_mul_ps(fscal,dx21);
1873             ty               = _mm_mul_ps(fscal,dy21);
1874             tz               = _mm_mul_ps(fscal,dz21);
1875
1876             /* Update vectorial force */
1877             fix2             = _mm_add_ps(fix2,tx);
1878             fiy2             = _mm_add_ps(fiy2,ty);
1879             fiz2             = _mm_add_ps(fiz2,tz);
1880
1881             fjx1             = _mm_add_ps(fjx1,tx);
1882             fjy1             = _mm_add_ps(fjy1,ty);
1883             fjz1             = _mm_add_ps(fjz1,tz);
1884
1885             }
1886
1887             /**************************
1888              * CALCULATE INTERACTIONS *
1889              **************************/
1890
1891             if (gmx_mm_any_lt(rsq22,rcutoff2))
1892             {
1893
1894             /* REACTION-FIELD ELECTROSTATICS */
1895             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1896
1897             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1898
1899             fscal            = felec;
1900
1901             fscal            = _mm_and_ps(fscal,cutoff_mask);
1902
1903             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1904
1905             /* Calculate temporary vectorial force */
1906             tx               = _mm_mul_ps(fscal,dx22);
1907             ty               = _mm_mul_ps(fscal,dy22);
1908             tz               = _mm_mul_ps(fscal,dz22);
1909
1910             /* Update vectorial force */
1911             fix2             = _mm_add_ps(fix2,tx);
1912             fiy2             = _mm_add_ps(fiy2,ty);
1913             fiz2             = _mm_add_ps(fiz2,tz);
1914
1915             fjx2             = _mm_add_ps(fjx2,tx);
1916             fjy2             = _mm_add_ps(fjy2,ty);
1917             fjz2             = _mm_add_ps(fjz2,tz);
1918
1919             }
1920
1921             /**************************
1922              * CALCULATE INTERACTIONS *
1923              **************************/
1924
1925             if (gmx_mm_any_lt(rsq23,rcutoff2))
1926             {
1927
1928             /* REACTION-FIELD ELECTROSTATICS */
1929             felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
1930
1931             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
1932
1933             fscal            = felec;
1934
1935             fscal            = _mm_and_ps(fscal,cutoff_mask);
1936
1937             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1938
1939             /* Calculate temporary vectorial force */
1940             tx               = _mm_mul_ps(fscal,dx23);
1941             ty               = _mm_mul_ps(fscal,dy23);
1942             tz               = _mm_mul_ps(fscal,dz23);
1943
1944             /* Update vectorial force */
1945             fix2             = _mm_add_ps(fix2,tx);
1946             fiy2             = _mm_add_ps(fiy2,ty);
1947             fiz2             = _mm_add_ps(fiz2,tz);
1948
1949             fjx3             = _mm_add_ps(fjx3,tx);
1950             fjy3             = _mm_add_ps(fjy3,ty);
1951             fjz3             = _mm_add_ps(fjz3,tz);
1952
1953             }
1954
1955             /**************************
1956              * CALCULATE INTERACTIONS *
1957              **************************/
1958
1959             if (gmx_mm_any_lt(rsq31,rcutoff2))
1960             {
1961
1962             /* REACTION-FIELD ELECTROSTATICS */
1963             felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
1964
1965             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
1966
1967             fscal            = felec;
1968
1969             fscal            = _mm_and_ps(fscal,cutoff_mask);
1970
1971             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1972
1973             /* Calculate temporary vectorial force */
1974             tx               = _mm_mul_ps(fscal,dx31);
1975             ty               = _mm_mul_ps(fscal,dy31);
1976             tz               = _mm_mul_ps(fscal,dz31);
1977
1978             /* Update vectorial force */
1979             fix3             = _mm_add_ps(fix3,tx);
1980             fiy3             = _mm_add_ps(fiy3,ty);
1981             fiz3             = _mm_add_ps(fiz3,tz);
1982
1983             fjx1             = _mm_add_ps(fjx1,tx);
1984             fjy1             = _mm_add_ps(fjy1,ty);
1985             fjz1             = _mm_add_ps(fjz1,tz);
1986
1987             }
1988
1989             /**************************
1990              * CALCULATE INTERACTIONS *
1991              **************************/
1992
1993             if (gmx_mm_any_lt(rsq32,rcutoff2))
1994             {
1995
1996             /* REACTION-FIELD ELECTROSTATICS */
1997             felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
1998
1999             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
2000
2001             fscal            = felec;
2002
2003             fscal            = _mm_and_ps(fscal,cutoff_mask);
2004
2005             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2006
2007             /* Calculate temporary vectorial force */
2008             tx               = _mm_mul_ps(fscal,dx32);
2009             ty               = _mm_mul_ps(fscal,dy32);
2010             tz               = _mm_mul_ps(fscal,dz32);
2011
2012             /* Update vectorial force */
2013             fix3             = _mm_add_ps(fix3,tx);
2014             fiy3             = _mm_add_ps(fiy3,ty);
2015             fiz3             = _mm_add_ps(fiz3,tz);
2016
2017             fjx2             = _mm_add_ps(fjx2,tx);
2018             fjy2             = _mm_add_ps(fjy2,ty);
2019             fjz2             = _mm_add_ps(fjz2,tz);
2020
2021             }
2022
2023             /**************************
2024              * CALCULATE INTERACTIONS *
2025              **************************/
2026
2027             if (gmx_mm_any_lt(rsq33,rcutoff2))
2028             {
2029
2030             /* REACTION-FIELD ELECTROSTATICS */
2031             felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
2032
2033             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
2034
2035             fscal            = felec;
2036
2037             fscal            = _mm_and_ps(fscal,cutoff_mask);
2038
2039             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2040
2041             /* Calculate temporary vectorial force */
2042             tx               = _mm_mul_ps(fscal,dx33);
2043             ty               = _mm_mul_ps(fscal,dy33);
2044             tz               = _mm_mul_ps(fscal,dz33);
2045
2046             /* Update vectorial force */
2047             fix3             = _mm_add_ps(fix3,tx);
2048             fiy3             = _mm_add_ps(fiy3,ty);
2049             fiz3             = _mm_add_ps(fiz3,tz);
2050
2051             fjx3             = _mm_add_ps(fjx3,tx);
2052             fjy3             = _mm_add_ps(fjy3,ty);
2053             fjz3             = _mm_add_ps(fjz3,tz);
2054
2055             }
2056
2057             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2058             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2059             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2060             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2061
2062             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2063                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2064
2065             /* Inner loop uses 270 flops */
2066         }
2067
2068         /* End of innermost loop */
2069
2070         gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2071                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
2072
2073         /* Increment number of inner iterations */
2074         inneriter                  += j_index_end - j_index_start;
2075
2076         /* Outer loop uses 18 flops */
2077     }
2078
2079     /* Increment number of outer iterations */
2080     outeriter        += nri;
2081
2082     /* Update outer/inner flops */
2083
2084     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*270);
2085 }