Created SIMD module
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse4_1_single / nb_kernel_ElecRFCut_VdwNone_GeomW3W3_sse4_1_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37  */
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "vec.h"
47 #include "nrnb.h"
48
49 #include "gromacs/simd/math_x86_sse4_1_single.h"
50 #include "kernelutil_x86_sse4_1_single.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sse4_1_single
54  * Electrostatics interaction: ReactionField
55  * VdW interaction:            None
56  * Geometry:                   Water3-Water3
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sse4_1_single
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
70      * just 0 for non-waters.
71      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB,jnrC,jnrD;
77     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
80     real             rcutoff_scalar;
81     real             *shiftvec,*fshift,*x,*f;
82     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83     real             scratch[4*DIM];
84     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85     int              vdwioffset0;
86     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87     int              vdwioffset1;
88     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89     int              vdwioffset2;
90     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
98     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
99     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
100     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
101     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
102     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
103     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
104     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
105     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
106     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
107     real             *charge;
108     __m128           dummy_mask,cutoff_mask;
109     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
110     __m128           one     = _mm_set1_ps(1.0);
111     __m128           two     = _mm_set1_ps(2.0);
112     x                = xx[0];
113     f                = ff[0];
114
115     nri              = nlist->nri;
116     iinr             = nlist->iinr;
117     jindex           = nlist->jindex;
118     jjnr             = nlist->jjnr;
119     shiftidx         = nlist->shift;
120     gid              = nlist->gid;
121     shiftvec         = fr->shift_vec[0];
122     fshift           = fr->fshift[0];
123     facel            = _mm_set1_ps(fr->epsfac);
124     charge           = mdatoms->chargeA;
125     krf              = _mm_set1_ps(fr->ic->k_rf);
126     krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
127     crf              = _mm_set1_ps(fr->ic->c_rf);
128
129     /* Setup water-specific parameters */
130     inr              = nlist->iinr[0];
131     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
132     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
133     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
134
135     jq0              = _mm_set1_ps(charge[inr+0]);
136     jq1              = _mm_set1_ps(charge[inr+1]);
137     jq2              = _mm_set1_ps(charge[inr+2]);
138     qq00             = _mm_mul_ps(iq0,jq0);
139     qq01             = _mm_mul_ps(iq0,jq1);
140     qq02             = _mm_mul_ps(iq0,jq2);
141     qq10             = _mm_mul_ps(iq1,jq0);
142     qq11             = _mm_mul_ps(iq1,jq1);
143     qq12             = _mm_mul_ps(iq1,jq2);
144     qq20             = _mm_mul_ps(iq2,jq0);
145     qq21             = _mm_mul_ps(iq2,jq1);
146     qq22             = _mm_mul_ps(iq2,jq2);
147
148     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
149     rcutoff_scalar   = fr->rcoulomb;
150     rcutoff          = _mm_set1_ps(rcutoff_scalar);
151     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
152
153     /* Avoid stupid compiler warnings */
154     jnrA = jnrB = jnrC = jnrD = 0;
155     j_coord_offsetA = 0;
156     j_coord_offsetB = 0;
157     j_coord_offsetC = 0;
158     j_coord_offsetD = 0;
159
160     outeriter        = 0;
161     inneriter        = 0;
162
163     for(iidx=0;iidx<4*DIM;iidx++)
164     {
165         scratch[iidx] = 0.0;
166     }
167
168     /* Start outer loop over neighborlists */
169     for(iidx=0; iidx<nri; iidx++)
170     {
171         /* Load shift vector for this list */
172         i_shift_offset   = DIM*shiftidx[iidx];
173
174         /* Load limits for loop over neighbors */
175         j_index_start    = jindex[iidx];
176         j_index_end      = jindex[iidx+1];
177
178         /* Get outer coordinate index */
179         inr              = iinr[iidx];
180         i_coord_offset   = DIM*inr;
181
182         /* Load i particle coords and add shift vector */
183         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
184                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
185
186         fix0             = _mm_setzero_ps();
187         fiy0             = _mm_setzero_ps();
188         fiz0             = _mm_setzero_ps();
189         fix1             = _mm_setzero_ps();
190         fiy1             = _mm_setzero_ps();
191         fiz1             = _mm_setzero_ps();
192         fix2             = _mm_setzero_ps();
193         fiy2             = _mm_setzero_ps();
194         fiz2             = _mm_setzero_ps();
195
196         /* Reset potential sums */
197         velecsum         = _mm_setzero_ps();
198
199         /* Start inner kernel loop */
200         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
201         {
202
203             /* Get j neighbor index, and coordinate index */
204             jnrA             = jjnr[jidx];
205             jnrB             = jjnr[jidx+1];
206             jnrC             = jjnr[jidx+2];
207             jnrD             = jjnr[jidx+3];
208             j_coord_offsetA  = DIM*jnrA;
209             j_coord_offsetB  = DIM*jnrB;
210             j_coord_offsetC  = DIM*jnrC;
211             j_coord_offsetD  = DIM*jnrD;
212
213             /* load j atom coordinates */
214             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
215                                               x+j_coord_offsetC,x+j_coord_offsetD,
216                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
217
218             /* Calculate displacement vector */
219             dx00             = _mm_sub_ps(ix0,jx0);
220             dy00             = _mm_sub_ps(iy0,jy0);
221             dz00             = _mm_sub_ps(iz0,jz0);
222             dx01             = _mm_sub_ps(ix0,jx1);
223             dy01             = _mm_sub_ps(iy0,jy1);
224             dz01             = _mm_sub_ps(iz0,jz1);
225             dx02             = _mm_sub_ps(ix0,jx2);
226             dy02             = _mm_sub_ps(iy0,jy2);
227             dz02             = _mm_sub_ps(iz0,jz2);
228             dx10             = _mm_sub_ps(ix1,jx0);
229             dy10             = _mm_sub_ps(iy1,jy0);
230             dz10             = _mm_sub_ps(iz1,jz0);
231             dx11             = _mm_sub_ps(ix1,jx1);
232             dy11             = _mm_sub_ps(iy1,jy1);
233             dz11             = _mm_sub_ps(iz1,jz1);
234             dx12             = _mm_sub_ps(ix1,jx2);
235             dy12             = _mm_sub_ps(iy1,jy2);
236             dz12             = _mm_sub_ps(iz1,jz2);
237             dx20             = _mm_sub_ps(ix2,jx0);
238             dy20             = _mm_sub_ps(iy2,jy0);
239             dz20             = _mm_sub_ps(iz2,jz0);
240             dx21             = _mm_sub_ps(ix2,jx1);
241             dy21             = _mm_sub_ps(iy2,jy1);
242             dz21             = _mm_sub_ps(iz2,jz1);
243             dx22             = _mm_sub_ps(ix2,jx2);
244             dy22             = _mm_sub_ps(iy2,jy2);
245             dz22             = _mm_sub_ps(iz2,jz2);
246
247             /* Calculate squared distance and things based on it */
248             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
249             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
250             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
251             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
252             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
253             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
254             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
255             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
256             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
257
258             rinv00           = gmx_mm_invsqrt_ps(rsq00);
259             rinv01           = gmx_mm_invsqrt_ps(rsq01);
260             rinv02           = gmx_mm_invsqrt_ps(rsq02);
261             rinv10           = gmx_mm_invsqrt_ps(rsq10);
262             rinv11           = gmx_mm_invsqrt_ps(rsq11);
263             rinv12           = gmx_mm_invsqrt_ps(rsq12);
264             rinv20           = gmx_mm_invsqrt_ps(rsq20);
265             rinv21           = gmx_mm_invsqrt_ps(rsq21);
266             rinv22           = gmx_mm_invsqrt_ps(rsq22);
267
268             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
269             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
270             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
271             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
272             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
273             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
274             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
275             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
276             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
277
278             fjx0             = _mm_setzero_ps();
279             fjy0             = _mm_setzero_ps();
280             fjz0             = _mm_setzero_ps();
281             fjx1             = _mm_setzero_ps();
282             fjy1             = _mm_setzero_ps();
283             fjz1             = _mm_setzero_ps();
284             fjx2             = _mm_setzero_ps();
285             fjy2             = _mm_setzero_ps();
286             fjz2             = _mm_setzero_ps();
287
288             /**************************
289              * CALCULATE INTERACTIONS *
290              **************************/
291
292             if (gmx_mm_any_lt(rsq00,rcutoff2))
293             {
294
295             /* REACTION-FIELD ELECTROSTATICS */
296             velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
297             felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
298
299             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
300
301             /* Update potential sum for this i atom from the interaction with this j atom. */
302             velec            = _mm_and_ps(velec,cutoff_mask);
303             velecsum         = _mm_add_ps(velecsum,velec);
304
305             fscal            = felec;
306
307             fscal            = _mm_and_ps(fscal,cutoff_mask);
308
309             /* Calculate temporary vectorial force */
310             tx               = _mm_mul_ps(fscal,dx00);
311             ty               = _mm_mul_ps(fscal,dy00);
312             tz               = _mm_mul_ps(fscal,dz00);
313
314             /* Update vectorial force */
315             fix0             = _mm_add_ps(fix0,tx);
316             fiy0             = _mm_add_ps(fiy0,ty);
317             fiz0             = _mm_add_ps(fiz0,tz);
318
319             fjx0             = _mm_add_ps(fjx0,tx);
320             fjy0             = _mm_add_ps(fjy0,ty);
321             fjz0             = _mm_add_ps(fjz0,tz);
322
323             }
324
325             /**************************
326              * CALCULATE INTERACTIONS *
327              **************************/
328
329             if (gmx_mm_any_lt(rsq01,rcutoff2))
330             {
331
332             /* REACTION-FIELD ELECTROSTATICS */
333             velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
334             felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
335
336             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
337
338             /* Update potential sum for this i atom from the interaction with this j atom. */
339             velec            = _mm_and_ps(velec,cutoff_mask);
340             velecsum         = _mm_add_ps(velecsum,velec);
341
342             fscal            = felec;
343
344             fscal            = _mm_and_ps(fscal,cutoff_mask);
345
346             /* Calculate temporary vectorial force */
347             tx               = _mm_mul_ps(fscal,dx01);
348             ty               = _mm_mul_ps(fscal,dy01);
349             tz               = _mm_mul_ps(fscal,dz01);
350
351             /* Update vectorial force */
352             fix0             = _mm_add_ps(fix0,tx);
353             fiy0             = _mm_add_ps(fiy0,ty);
354             fiz0             = _mm_add_ps(fiz0,tz);
355
356             fjx1             = _mm_add_ps(fjx1,tx);
357             fjy1             = _mm_add_ps(fjy1,ty);
358             fjz1             = _mm_add_ps(fjz1,tz);
359
360             }
361
362             /**************************
363              * CALCULATE INTERACTIONS *
364              **************************/
365
366             if (gmx_mm_any_lt(rsq02,rcutoff2))
367             {
368
369             /* REACTION-FIELD ELECTROSTATICS */
370             velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
371             felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
372
373             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
374
375             /* Update potential sum for this i atom from the interaction with this j atom. */
376             velec            = _mm_and_ps(velec,cutoff_mask);
377             velecsum         = _mm_add_ps(velecsum,velec);
378
379             fscal            = felec;
380
381             fscal            = _mm_and_ps(fscal,cutoff_mask);
382
383             /* Calculate temporary vectorial force */
384             tx               = _mm_mul_ps(fscal,dx02);
385             ty               = _mm_mul_ps(fscal,dy02);
386             tz               = _mm_mul_ps(fscal,dz02);
387
388             /* Update vectorial force */
389             fix0             = _mm_add_ps(fix0,tx);
390             fiy0             = _mm_add_ps(fiy0,ty);
391             fiz0             = _mm_add_ps(fiz0,tz);
392
393             fjx2             = _mm_add_ps(fjx2,tx);
394             fjy2             = _mm_add_ps(fjy2,ty);
395             fjz2             = _mm_add_ps(fjz2,tz);
396
397             }
398
399             /**************************
400              * CALCULATE INTERACTIONS *
401              **************************/
402
403             if (gmx_mm_any_lt(rsq10,rcutoff2))
404             {
405
406             /* REACTION-FIELD ELECTROSTATICS */
407             velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
408             felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
409
410             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
411
412             /* Update potential sum for this i atom from the interaction with this j atom. */
413             velec            = _mm_and_ps(velec,cutoff_mask);
414             velecsum         = _mm_add_ps(velecsum,velec);
415
416             fscal            = felec;
417
418             fscal            = _mm_and_ps(fscal,cutoff_mask);
419
420             /* Calculate temporary vectorial force */
421             tx               = _mm_mul_ps(fscal,dx10);
422             ty               = _mm_mul_ps(fscal,dy10);
423             tz               = _mm_mul_ps(fscal,dz10);
424
425             /* Update vectorial force */
426             fix1             = _mm_add_ps(fix1,tx);
427             fiy1             = _mm_add_ps(fiy1,ty);
428             fiz1             = _mm_add_ps(fiz1,tz);
429
430             fjx0             = _mm_add_ps(fjx0,tx);
431             fjy0             = _mm_add_ps(fjy0,ty);
432             fjz0             = _mm_add_ps(fjz0,tz);
433
434             }
435
436             /**************************
437              * CALCULATE INTERACTIONS *
438              **************************/
439
440             if (gmx_mm_any_lt(rsq11,rcutoff2))
441             {
442
443             /* REACTION-FIELD ELECTROSTATICS */
444             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
445             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
446
447             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
448
449             /* Update potential sum for this i atom from the interaction with this j atom. */
450             velec            = _mm_and_ps(velec,cutoff_mask);
451             velecsum         = _mm_add_ps(velecsum,velec);
452
453             fscal            = felec;
454
455             fscal            = _mm_and_ps(fscal,cutoff_mask);
456
457             /* Calculate temporary vectorial force */
458             tx               = _mm_mul_ps(fscal,dx11);
459             ty               = _mm_mul_ps(fscal,dy11);
460             tz               = _mm_mul_ps(fscal,dz11);
461
462             /* Update vectorial force */
463             fix1             = _mm_add_ps(fix1,tx);
464             fiy1             = _mm_add_ps(fiy1,ty);
465             fiz1             = _mm_add_ps(fiz1,tz);
466
467             fjx1             = _mm_add_ps(fjx1,tx);
468             fjy1             = _mm_add_ps(fjy1,ty);
469             fjz1             = _mm_add_ps(fjz1,tz);
470
471             }
472
473             /**************************
474              * CALCULATE INTERACTIONS *
475              **************************/
476
477             if (gmx_mm_any_lt(rsq12,rcutoff2))
478             {
479
480             /* REACTION-FIELD ELECTROSTATICS */
481             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
482             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
483
484             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
485
486             /* Update potential sum for this i atom from the interaction with this j atom. */
487             velec            = _mm_and_ps(velec,cutoff_mask);
488             velecsum         = _mm_add_ps(velecsum,velec);
489
490             fscal            = felec;
491
492             fscal            = _mm_and_ps(fscal,cutoff_mask);
493
494             /* Calculate temporary vectorial force */
495             tx               = _mm_mul_ps(fscal,dx12);
496             ty               = _mm_mul_ps(fscal,dy12);
497             tz               = _mm_mul_ps(fscal,dz12);
498
499             /* Update vectorial force */
500             fix1             = _mm_add_ps(fix1,tx);
501             fiy1             = _mm_add_ps(fiy1,ty);
502             fiz1             = _mm_add_ps(fiz1,tz);
503
504             fjx2             = _mm_add_ps(fjx2,tx);
505             fjy2             = _mm_add_ps(fjy2,ty);
506             fjz2             = _mm_add_ps(fjz2,tz);
507
508             }
509
510             /**************************
511              * CALCULATE INTERACTIONS *
512              **************************/
513
514             if (gmx_mm_any_lt(rsq20,rcutoff2))
515             {
516
517             /* REACTION-FIELD ELECTROSTATICS */
518             velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
519             felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
520
521             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
522
523             /* Update potential sum for this i atom from the interaction with this j atom. */
524             velec            = _mm_and_ps(velec,cutoff_mask);
525             velecsum         = _mm_add_ps(velecsum,velec);
526
527             fscal            = felec;
528
529             fscal            = _mm_and_ps(fscal,cutoff_mask);
530
531             /* Calculate temporary vectorial force */
532             tx               = _mm_mul_ps(fscal,dx20);
533             ty               = _mm_mul_ps(fscal,dy20);
534             tz               = _mm_mul_ps(fscal,dz20);
535
536             /* Update vectorial force */
537             fix2             = _mm_add_ps(fix2,tx);
538             fiy2             = _mm_add_ps(fiy2,ty);
539             fiz2             = _mm_add_ps(fiz2,tz);
540
541             fjx0             = _mm_add_ps(fjx0,tx);
542             fjy0             = _mm_add_ps(fjy0,ty);
543             fjz0             = _mm_add_ps(fjz0,tz);
544
545             }
546
547             /**************************
548              * CALCULATE INTERACTIONS *
549              **************************/
550
551             if (gmx_mm_any_lt(rsq21,rcutoff2))
552             {
553
554             /* REACTION-FIELD ELECTROSTATICS */
555             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
556             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
557
558             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
559
560             /* Update potential sum for this i atom from the interaction with this j atom. */
561             velec            = _mm_and_ps(velec,cutoff_mask);
562             velecsum         = _mm_add_ps(velecsum,velec);
563
564             fscal            = felec;
565
566             fscal            = _mm_and_ps(fscal,cutoff_mask);
567
568             /* Calculate temporary vectorial force */
569             tx               = _mm_mul_ps(fscal,dx21);
570             ty               = _mm_mul_ps(fscal,dy21);
571             tz               = _mm_mul_ps(fscal,dz21);
572
573             /* Update vectorial force */
574             fix2             = _mm_add_ps(fix2,tx);
575             fiy2             = _mm_add_ps(fiy2,ty);
576             fiz2             = _mm_add_ps(fiz2,tz);
577
578             fjx1             = _mm_add_ps(fjx1,tx);
579             fjy1             = _mm_add_ps(fjy1,ty);
580             fjz1             = _mm_add_ps(fjz1,tz);
581
582             }
583
584             /**************************
585              * CALCULATE INTERACTIONS *
586              **************************/
587
588             if (gmx_mm_any_lt(rsq22,rcutoff2))
589             {
590
591             /* REACTION-FIELD ELECTROSTATICS */
592             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
593             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
594
595             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
596
597             /* Update potential sum for this i atom from the interaction with this j atom. */
598             velec            = _mm_and_ps(velec,cutoff_mask);
599             velecsum         = _mm_add_ps(velecsum,velec);
600
601             fscal            = felec;
602
603             fscal            = _mm_and_ps(fscal,cutoff_mask);
604
605             /* Calculate temporary vectorial force */
606             tx               = _mm_mul_ps(fscal,dx22);
607             ty               = _mm_mul_ps(fscal,dy22);
608             tz               = _mm_mul_ps(fscal,dz22);
609
610             /* Update vectorial force */
611             fix2             = _mm_add_ps(fix2,tx);
612             fiy2             = _mm_add_ps(fiy2,ty);
613             fiz2             = _mm_add_ps(fiz2,tz);
614
615             fjx2             = _mm_add_ps(fjx2,tx);
616             fjy2             = _mm_add_ps(fjy2,ty);
617             fjz2             = _mm_add_ps(fjz2,tz);
618
619             }
620
621             fjptrA             = f+j_coord_offsetA;
622             fjptrB             = f+j_coord_offsetB;
623             fjptrC             = f+j_coord_offsetC;
624             fjptrD             = f+j_coord_offsetD;
625
626             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
627                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
628
629             /* Inner loop uses 324 flops */
630         }
631
632         if(jidx<j_index_end)
633         {
634
635             /* Get j neighbor index, and coordinate index */
636             jnrlistA         = jjnr[jidx];
637             jnrlistB         = jjnr[jidx+1];
638             jnrlistC         = jjnr[jidx+2];
639             jnrlistD         = jjnr[jidx+3];
640             /* Sign of each element will be negative for non-real atoms.
641              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
642              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
643              */
644             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
645             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
646             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
647             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
648             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
649             j_coord_offsetA  = DIM*jnrA;
650             j_coord_offsetB  = DIM*jnrB;
651             j_coord_offsetC  = DIM*jnrC;
652             j_coord_offsetD  = DIM*jnrD;
653
654             /* load j atom coordinates */
655             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
656                                               x+j_coord_offsetC,x+j_coord_offsetD,
657                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
658
659             /* Calculate displacement vector */
660             dx00             = _mm_sub_ps(ix0,jx0);
661             dy00             = _mm_sub_ps(iy0,jy0);
662             dz00             = _mm_sub_ps(iz0,jz0);
663             dx01             = _mm_sub_ps(ix0,jx1);
664             dy01             = _mm_sub_ps(iy0,jy1);
665             dz01             = _mm_sub_ps(iz0,jz1);
666             dx02             = _mm_sub_ps(ix0,jx2);
667             dy02             = _mm_sub_ps(iy0,jy2);
668             dz02             = _mm_sub_ps(iz0,jz2);
669             dx10             = _mm_sub_ps(ix1,jx0);
670             dy10             = _mm_sub_ps(iy1,jy0);
671             dz10             = _mm_sub_ps(iz1,jz0);
672             dx11             = _mm_sub_ps(ix1,jx1);
673             dy11             = _mm_sub_ps(iy1,jy1);
674             dz11             = _mm_sub_ps(iz1,jz1);
675             dx12             = _mm_sub_ps(ix1,jx2);
676             dy12             = _mm_sub_ps(iy1,jy2);
677             dz12             = _mm_sub_ps(iz1,jz2);
678             dx20             = _mm_sub_ps(ix2,jx0);
679             dy20             = _mm_sub_ps(iy2,jy0);
680             dz20             = _mm_sub_ps(iz2,jz0);
681             dx21             = _mm_sub_ps(ix2,jx1);
682             dy21             = _mm_sub_ps(iy2,jy1);
683             dz21             = _mm_sub_ps(iz2,jz1);
684             dx22             = _mm_sub_ps(ix2,jx2);
685             dy22             = _mm_sub_ps(iy2,jy2);
686             dz22             = _mm_sub_ps(iz2,jz2);
687
688             /* Calculate squared distance and things based on it */
689             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
690             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
691             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
692             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
693             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
694             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
695             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
696             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
697             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
698
699             rinv00           = gmx_mm_invsqrt_ps(rsq00);
700             rinv01           = gmx_mm_invsqrt_ps(rsq01);
701             rinv02           = gmx_mm_invsqrt_ps(rsq02);
702             rinv10           = gmx_mm_invsqrt_ps(rsq10);
703             rinv11           = gmx_mm_invsqrt_ps(rsq11);
704             rinv12           = gmx_mm_invsqrt_ps(rsq12);
705             rinv20           = gmx_mm_invsqrt_ps(rsq20);
706             rinv21           = gmx_mm_invsqrt_ps(rsq21);
707             rinv22           = gmx_mm_invsqrt_ps(rsq22);
708
709             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
710             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
711             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
712             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
713             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
714             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
715             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
716             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
717             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
718
719             fjx0             = _mm_setzero_ps();
720             fjy0             = _mm_setzero_ps();
721             fjz0             = _mm_setzero_ps();
722             fjx1             = _mm_setzero_ps();
723             fjy1             = _mm_setzero_ps();
724             fjz1             = _mm_setzero_ps();
725             fjx2             = _mm_setzero_ps();
726             fjy2             = _mm_setzero_ps();
727             fjz2             = _mm_setzero_ps();
728
729             /**************************
730              * CALCULATE INTERACTIONS *
731              **************************/
732
733             if (gmx_mm_any_lt(rsq00,rcutoff2))
734             {
735
736             /* REACTION-FIELD ELECTROSTATICS */
737             velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
738             felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
739
740             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
741
742             /* Update potential sum for this i atom from the interaction with this j atom. */
743             velec            = _mm_and_ps(velec,cutoff_mask);
744             velec            = _mm_andnot_ps(dummy_mask,velec);
745             velecsum         = _mm_add_ps(velecsum,velec);
746
747             fscal            = felec;
748
749             fscal            = _mm_and_ps(fscal,cutoff_mask);
750
751             fscal            = _mm_andnot_ps(dummy_mask,fscal);
752
753             /* Calculate temporary vectorial force */
754             tx               = _mm_mul_ps(fscal,dx00);
755             ty               = _mm_mul_ps(fscal,dy00);
756             tz               = _mm_mul_ps(fscal,dz00);
757
758             /* Update vectorial force */
759             fix0             = _mm_add_ps(fix0,tx);
760             fiy0             = _mm_add_ps(fiy0,ty);
761             fiz0             = _mm_add_ps(fiz0,tz);
762
763             fjx0             = _mm_add_ps(fjx0,tx);
764             fjy0             = _mm_add_ps(fjy0,ty);
765             fjz0             = _mm_add_ps(fjz0,tz);
766
767             }
768
769             /**************************
770              * CALCULATE INTERACTIONS *
771              **************************/
772
773             if (gmx_mm_any_lt(rsq01,rcutoff2))
774             {
775
776             /* REACTION-FIELD ELECTROSTATICS */
777             velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
778             felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
779
780             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
781
782             /* Update potential sum for this i atom from the interaction with this j atom. */
783             velec            = _mm_and_ps(velec,cutoff_mask);
784             velec            = _mm_andnot_ps(dummy_mask,velec);
785             velecsum         = _mm_add_ps(velecsum,velec);
786
787             fscal            = felec;
788
789             fscal            = _mm_and_ps(fscal,cutoff_mask);
790
791             fscal            = _mm_andnot_ps(dummy_mask,fscal);
792
793             /* Calculate temporary vectorial force */
794             tx               = _mm_mul_ps(fscal,dx01);
795             ty               = _mm_mul_ps(fscal,dy01);
796             tz               = _mm_mul_ps(fscal,dz01);
797
798             /* Update vectorial force */
799             fix0             = _mm_add_ps(fix0,tx);
800             fiy0             = _mm_add_ps(fiy0,ty);
801             fiz0             = _mm_add_ps(fiz0,tz);
802
803             fjx1             = _mm_add_ps(fjx1,tx);
804             fjy1             = _mm_add_ps(fjy1,ty);
805             fjz1             = _mm_add_ps(fjz1,tz);
806
807             }
808
809             /**************************
810              * CALCULATE INTERACTIONS *
811              **************************/
812
813             if (gmx_mm_any_lt(rsq02,rcutoff2))
814             {
815
816             /* REACTION-FIELD ELECTROSTATICS */
817             velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
818             felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
819
820             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
821
822             /* Update potential sum for this i atom from the interaction with this j atom. */
823             velec            = _mm_and_ps(velec,cutoff_mask);
824             velec            = _mm_andnot_ps(dummy_mask,velec);
825             velecsum         = _mm_add_ps(velecsum,velec);
826
827             fscal            = felec;
828
829             fscal            = _mm_and_ps(fscal,cutoff_mask);
830
831             fscal            = _mm_andnot_ps(dummy_mask,fscal);
832
833             /* Calculate temporary vectorial force */
834             tx               = _mm_mul_ps(fscal,dx02);
835             ty               = _mm_mul_ps(fscal,dy02);
836             tz               = _mm_mul_ps(fscal,dz02);
837
838             /* Update vectorial force */
839             fix0             = _mm_add_ps(fix0,tx);
840             fiy0             = _mm_add_ps(fiy0,ty);
841             fiz0             = _mm_add_ps(fiz0,tz);
842
843             fjx2             = _mm_add_ps(fjx2,tx);
844             fjy2             = _mm_add_ps(fjy2,ty);
845             fjz2             = _mm_add_ps(fjz2,tz);
846
847             }
848
849             /**************************
850              * CALCULATE INTERACTIONS *
851              **************************/
852
853             if (gmx_mm_any_lt(rsq10,rcutoff2))
854             {
855
856             /* REACTION-FIELD ELECTROSTATICS */
857             velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
858             felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
859
860             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
861
862             /* Update potential sum for this i atom from the interaction with this j atom. */
863             velec            = _mm_and_ps(velec,cutoff_mask);
864             velec            = _mm_andnot_ps(dummy_mask,velec);
865             velecsum         = _mm_add_ps(velecsum,velec);
866
867             fscal            = felec;
868
869             fscal            = _mm_and_ps(fscal,cutoff_mask);
870
871             fscal            = _mm_andnot_ps(dummy_mask,fscal);
872
873             /* Calculate temporary vectorial force */
874             tx               = _mm_mul_ps(fscal,dx10);
875             ty               = _mm_mul_ps(fscal,dy10);
876             tz               = _mm_mul_ps(fscal,dz10);
877
878             /* Update vectorial force */
879             fix1             = _mm_add_ps(fix1,tx);
880             fiy1             = _mm_add_ps(fiy1,ty);
881             fiz1             = _mm_add_ps(fiz1,tz);
882
883             fjx0             = _mm_add_ps(fjx0,tx);
884             fjy0             = _mm_add_ps(fjy0,ty);
885             fjz0             = _mm_add_ps(fjz0,tz);
886
887             }
888
889             /**************************
890              * CALCULATE INTERACTIONS *
891              **************************/
892
893             if (gmx_mm_any_lt(rsq11,rcutoff2))
894             {
895
896             /* REACTION-FIELD ELECTROSTATICS */
897             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
898             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
899
900             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
901
902             /* Update potential sum for this i atom from the interaction with this j atom. */
903             velec            = _mm_and_ps(velec,cutoff_mask);
904             velec            = _mm_andnot_ps(dummy_mask,velec);
905             velecsum         = _mm_add_ps(velecsum,velec);
906
907             fscal            = felec;
908
909             fscal            = _mm_and_ps(fscal,cutoff_mask);
910
911             fscal            = _mm_andnot_ps(dummy_mask,fscal);
912
913             /* Calculate temporary vectorial force */
914             tx               = _mm_mul_ps(fscal,dx11);
915             ty               = _mm_mul_ps(fscal,dy11);
916             tz               = _mm_mul_ps(fscal,dz11);
917
918             /* Update vectorial force */
919             fix1             = _mm_add_ps(fix1,tx);
920             fiy1             = _mm_add_ps(fiy1,ty);
921             fiz1             = _mm_add_ps(fiz1,tz);
922
923             fjx1             = _mm_add_ps(fjx1,tx);
924             fjy1             = _mm_add_ps(fjy1,ty);
925             fjz1             = _mm_add_ps(fjz1,tz);
926
927             }
928
929             /**************************
930              * CALCULATE INTERACTIONS *
931              **************************/
932
933             if (gmx_mm_any_lt(rsq12,rcutoff2))
934             {
935
936             /* REACTION-FIELD ELECTROSTATICS */
937             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
938             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
939
940             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
941
942             /* Update potential sum for this i atom from the interaction with this j atom. */
943             velec            = _mm_and_ps(velec,cutoff_mask);
944             velec            = _mm_andnot_ps(dummy_mask,velec);
945             velecsum         = _mm_add_ps(velecsum,velec);
946
947             fscal            = felec;
948
949             fscal            = _mm_and_ps(fscal,cutoff_mask);
950
951             fscal            = _mm_andnot_ps(dummy_mask,fscal);
952
953             /* Calculate temporary vectorial force */
954             tx               = _mm_mul_ps(fscal,dx12);
955             ty               = _mm_mul_ps(fscal,dy12);
956             tz               = _mm_mul_ps(fscal,dz12);
957
958             /* Update vectorial force */
959             fix1             = _mm_add_ps(fix1,tx);
960             fiy1             = _mm_add_ps(fiy1,ty);
961             fiz1             = _mm_add_ps(fiz1,tz);
962
963             fjx2             = _mm_add_ps(fjx2,tx);
964             fjy2             = _mm_add_ps(fjy2,ty);
965             fjz2             = _mm_add_ps(fjz2,tz);
966
967             }
968
969             /**************************
970              * CALCULATE INTERACTIONS *
971              **************************/
972
973             if (gmx_mm_any_lt(rsq20,rcutoff2))
974             {
975
976             /* REACTION-FIELD ELECTROSTATICS */
977             velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
978             felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
979
980             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
981
982             /* Update potential sum for this i atom from the interaction with this j atom. */
983             velec            = _mm_and_ps(velec,cutoff_mask);
984             velec            = _mm_andnot_ps(dummy_mask,velec);
985             velecsum         = _mm_add_ps(velecsum,velec);
986
987             fscal            = felec;
988
989             fscal            = _mm_and_ps(fscal,cutoff_mask);
990
991             fscal            = _mm_andnot_ps(dummy_mask,fscal);
992
993             /* Calculate temporary vectorial force */
994             tx               = _mm_mul_ps(fscal,dx20);
995             ty               = _mm_mul_ps(fscal,dy20);
996             tz               = _mm_mul_ps(fscal,dz20);
997
998             /* Update vectorial force */
999             fix2             = _mm_add_ps(fix2,tx);
1000             fiy2             = _mm_add_ps(fiy2,ty);
1001             fiz2             = _mm_add_ps(fiz2,tz);
1002
1003             fjx0             = _mm_add_ps(fjx0,tx);
1004             fjy0             = _mm_add_ps(fjy0,ty);
1005             fjz0             = _mm_add_ps(fjz0,tz);
1006
1007             }
1008
1009             /**************************
1010              * CALCULATE INTERACTIONS *
1011              **************************/
1012
1013             if (gmx_mm_any_lt(rsq21,rcutoff2))
1014             {
1015
1016             /* REACTION-FIELD ELECTROSTATICS */
1017             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
1018             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1019
1020             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1021
1022             /* Update potential sum for this i atom from the interaction with this j atom. */
1023             velec            = _mm_and_ps(velec,cutoff_mask);
1024             velec            = _mm_andnot_ps(dummy_mask,velec);
1025             velecsum         = _mm_add_ps(velecsum,velec);
1026
1027             fscal            = felec;
1028
1029             fscal            = _mm_and_ps(fscal,cutoff_mask);
1030
1031             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1032
1033             /* Calculate temporary vectorial force */
1034             tx               = _mm_mul_ps(fscal,dx21);
1035             ty               = _mm_mul_ps(fscal,dy21);
1036             tz               = _mm_mul_ps(fscal,dz21);
1037
1038             /* Update vectorial force */
1039             fix2             = _mm_add_ps(fix2,tx);
1040             fiy2             = _mm_add_ps(fiy2,ty);
1041             fiz2             = _mm_add_ps(fiz2,tz);
1042
1043             fjx1             = _mm_add_ps(fjx1,tx);
1044             fjy1             = _mm_add_ps(fjy1,ty);
1045             fjz1             = _mm_add_ps(fjz1,tz);
1046
1047             }
1048
1049             /**************************
1050              * CALCULATE INTERACTIONS *
1051              **************************/
1052
1053             if (gmx_mm_any_lt(rsq22,rcutoff2))
1054             {
1055
1056             /* REACTION-FIELD ELECTROSTATICS */
1057             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
1058             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1059
1060             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1061
1062             /* Update potential sum for this i atom from the interaction with this j atom. */
1063             velec            = _mm_and_ps(velec,cutoff_mask);
1064             velec            = _mm_andnot_ps(dummy_mask,velec);
1065             velecsum         = _mm_add_ps(velecsum,velec);
1066
1067             fscal            = felec;
1068
1069             fscal            = _mm_and_ps(fscal,cutoff_mask);
1070
1071             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1072
1073             /* Calculate temporary vectorial force */
1074             tx               = _mm_mul_ps(fscal,dx22);
1075             ty               = _mm_mul_ps(fscal,dy22);
1076             tz               = _mm_mul_ps(fscal,dz22);
1077
1078             /* Update vectorial force */
1079             fix2             = _mm_add_ps(fix2,tx);
1080             fiy2             = _mm_add_ps(fiy2,ty);
1081             fiz2             = _mm_add_ps(fiz2,tz);
1082
1083             fjx2             = _mm_add_ps(fjx2,tx);
1084             fjy2             = _mm_add_ps(fjy2,ty);
1085             fjz2             = _mm_add_ps(fjz2,tz);
1086
1087             }
1088
1089             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1090             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1091             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1092             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1093
1094             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1095                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1096
1097             /* Inner loop uses 324 flops */
1098         }
1099
1100         /* End of innermost loop */
1101
1102         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1103                                               f+i_coord_offset,fshift+i_shift_offset);
1104
1105         ggid                        = gid[iidx];
1106         /* Update potential energies */
1107         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1108
1109         /* Increment number of inner iterations */
1110         inneriter                  += j_index_end - j_index_start;
1111
1112         /* Outer loop uses 19 flops */
1113     }
1114
1115     /* Increment number of outer iterations */
1116     outeriter        += nri;
1117
1118     /* Update outer/inner flops */
1119
1120     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*324);
1121 }
1122 /*
1123  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sse4_1_single
1124  * Electrostatics interaction: ReactionField
1125  * VdW interaction:            None
1126  * Geometry:                   Water3-Water3
1127  * Calculate force/pot:        Force
1128  */
1129 void
1130 nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sse4_1_single
1131                     (t_nblist                    * gmx_restrict       nlist,
1132                      rvec                        * gmx_restrict          xx,
1133                      rvec                        * gmx_restrict          ff,
1134                      t_forcerec                  * gmx_restrict          fr,
1135                      t_mdatoms                   * gmx_restrict     mdatoms,
1136                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1137                      t_nrnb                      * gmx_restrict        nrnb)
1138 {
1139     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1140      * just 0 for non-waters.
1141      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1142      * jnr indices corresponding to data put in the four positions in the SIMD register.
1143      */
1144     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1145     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1146     int              jnrA,jnrB,jnrC,jnrD;
1147     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1148     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1149     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1150     real             rcutoff_scalar;
1151     real             *shiftvec,*fshift,*x,*f;
1152     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1153     real             scratch[4*DIM];
1154     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1155     int              vdwioffset0;
1156     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1157     int              vdwioffset1;
1158     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1159     int              vdwioffset2;
1160     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1161     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1162     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1163     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1164     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1165     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1166     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1167     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1168     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1169     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1170     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1171     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1172     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1173     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1174     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1175     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1176     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1177     real             *charge;
1178     __m128           dummy_mask,cutoff_mask;
1179     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1180     __m128           one     = _mm_set1_ps(1.0);
1181     __m128           two     = _mm_set1_ps(2.0);
1182     x                = xx[0];
1183     f                = ff[0];
1184
1185     nri              = nlist->nri;
1186     iinr             = nlist->iinr;
1187     jindex           = nlist->jindex;
1188     jjnr             = nlist->jjnr;
1189     shiftidx         = nlist->shift;
1190     gid              = nlist->gid;
1191     shiftvec         = fr->shift_vec[0];
1192     fshift           = fr->fshift[0];
1193     facel            = _mm_set1_ps(fr->epsfac);
1194     charge           = mdatoms->chargeA;
1195     krf              = _mm_set1_ps(fr->ic->k_rf);
1196     krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
1197     crf              = _mm_set1_ps(fr->ic->c_rf);
1198
1199     /* Setup water-specific parameters */
1200     inr              = nlist->iinr[0];
1201     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1202     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1203     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1204
1205     jq0              = _mm_set1_ps(charge[inr+0]);
1206     jq1              = _mm_set1_ps(charge[inr+1]);
1207     jq2              = _mm_set1_ps(charge[inr+2]);
1208     qq00             = _mm_mul_ps(iq0,jq0);
1209     qq01             = _mm_mul_ps(iq0,jq1);
1210     qq02             = _mm_mul_ps(iq0,jq2);
1211     qq10             = _mm_mul_ps(iq1,jq0);
1212     qq11             = _mm_mul_ps(iq1,jq1);
1213     qq12             = _mm_mul_ps(iq1,jq2);
1214     qq20             = _mm_mul_ps(iq2,jq0);
1215     qq21             = _mm_mul_ps(iq2,jq1);
1216     qq22             = _mm_mul_ps(iq2,jq2);
1217
1218     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1219     rcutoff_scalar   = fr->rcoulomb;
1220     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1221     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1222
1223     /* Avoid stupid compiler warnings */
1224     jnrA = jnrB = jnrC = jnrD = 0;
1225     j_coord_offsetA = 0;
1226     j_coord_offsetB = 0;
1227     j_coord_offsetC = 0;
1228     j_coord_offsetD = 0;
1229
1230     outeriter        = 0;
1231     inneriter        = 0;
1232
1233     for(iidx=0;iidx<4*DIM;iidx++)
1234     {
1235         scratch[iidx] = 0.0;
1236     }
1237
1238     /* Start outer loop over neighborlists */
1239     for(iidx=0; iidx<nri; iidx++)
1240     {
1241         /* Load shift vector for this list */
1242         i_shift_offset   = DIM*shiftidx[iidx];
1243
1244         /* Load limits for loop over neighbors */
1245         j_index_start    = jindex[iidx];
1246         j_index_end      = jindex[iidx+1];
1247
1248         /* Get outer coordinate index */
1249         inr              = iinr[iidx];
1250         i_coord_offset   = DIM*inr;
1251
1252         /* Load i particle coords and add shift vector */
1253         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1254                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1255
1256         fix0             = _mm_setzero_ps();
1257         fiy0             = _mm_setzero_ps();
1258         fiz0             = _mm_setzero_ps();
1259         fix1             = _mm_setzero_ps();
1260         fiy1             = _mm_setzero_ps();
1261         fiz1             = _mm_setzero_ps();
1262         fix2             = _mm_setzero_ps();
1263         fiy2             = _mm_setzero_ps();
1264         fiz2             = _mm_setzero_ps();
1265
1266         /* Start inner kernel loop */
1267         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1268         {
1269
1270             /* Get j neighbor index, and coordinate index */
1271             jnrA             = jjnr[jidx];
1272             jnrB             = jjnr[jidx+1];
1273             jnrC             = jjnr[jidx+2];
1274             jnrD             = jjnr[jidx+3];
1275             j_coord_offsetA  = DIM*jnrA;
1276             j_coord_offsetB  = DIM*jnrB;
1277             j_coord_offsetC  = DIM*jnrC;
1278             j_coord_offsetD  = DIM*jnrD;
1279
1280             /* load j atom coordinates */
1281             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1282                                               x+j_coord_offsetC,x+j_coord_offsetD,
1283                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1284
1285             /* Calculate displacement vector */
1286             dx00             = _mm_sub_ps(ix0,jx0);
1287             dy00             = _mm_sub_ps(iy0,jy0);
1288             dz00             = _mm_sub_ps(iz0,jz0);
1289             dx01             = _mm_sub_ps(ix0,jx1);
1290             dy01             = _mm_sub_ps(iy0,jy1);
1291             dz01             = _mm_sub_ps(iz0,jz1);
1292             dx02             = _mm_sub_ps(ix0,jx2);
1293             dy02             = _mm_sub_ps(iy0,jy2);
1294             dz02             = _mm_sub_ps(iz0,jz2);
1295             dx10             = _mm_sub_ps(ix1,jx0);
1296             dy10             = _mm_sub_ps(iy1,jy0);
1297             dz10             = _mm_sub_ps(iz1,jz0);
1298             dx11             = _mm_sub_ps(ix1,jx1);
1299             dy11             = _mm_sub_ps(iy1,jy1);
1300             dz11             = _mm_sub_ps(iz1,jz1);
1301             dx12             = _mm_sub_ps(ix1,jx2);
1302             dy12             = _mm_sub_ps(iy1,jy2);
1303             dz12             = _mm_sub_ps(iz1,jz2);
1304             dx20             = _mm_sub_ps(ix2,jx0);
1305             dy20             = _mm_sub_ps(iy2,jy0);
1306             dz20             = _mm_sub_ps(iz2,jz0);
1307             dx21             = _mm_sub_ps(ix2,jx1);
1308             dy21             = _mm_sub_ps(iy2,jy1);
1309             dz21             = _mm_sub_ps(iz2,jz1);
1310             dx22             = _mm_sub_ps(ix2,jx2);
1311             dy22             = _mm_sub_ps(iy2,jy2);
1312             dz22             = _mm_sub_ps(iz2,jz2);
1313
1314             /* Calculate squared distance and things based on it */
1315             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1316             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1317             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1318             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1319             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1320             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1321             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1322             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1323             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1324
1325             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1326             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1327             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1328             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1329             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1330             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1331             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1332             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1333             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1334
1335             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1336             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1337             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1338             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1339             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1340             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1341             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1342             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1343             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1344
1345             fjx0             = _mm_setzero_ps();
1346             fjy0             = _mm_setzero_ps();
1347             fjz0             = _mm_setzero_ps();
1348             fjx1             = _mm_setzero_ps();
1349             fjy1             = _mm_setzero_ps();
1350             fjz1             = _mm_setzero_ps();
1351             fjx2             = _mm_setzero_ps();
1352             fjy2             = _mm_setzero_ps();
1353             fjz2             = _mm_setzero_ps();
1354
1355             /**************************
1356              * CALCULATE INTERACTIONS *
1357              **************************/
1358
1359             if (gmx_mm_any_lt(rsq00,rcutoff2))
1360             {
1361
1362             /* REACTION-FIELD ELECTROSTATICS */
1363             felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
1364
1365             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1366
1367             fscal            = felec;
1368
1369             fscal            = _mm_and_ps(fscal,cutoff_mask);
1370
1371             /* Calculate temporary vectorial force */
1372             tx               = _mm_mul_ps(fscal,dx00);
1373             ty               = _mm_mul_ps(fscal,dy00);
1374             tz               = _mm_mul_ps(fscal,dz00);
1375
1376             /* Update vectorial force */
1377             fix0             = _mm_add_ps(fix0,tx);
1378             fiy0             = _mm_add_ps(fiy0,ty);
1379             fiz0             = _mm_add_ps(fiz0,tz);
1380
1381             fjx0             = _mm_add_ps(fjx0,tx);
1382             fjy0             = _mm_add_ps(fjy0,ty);
1383             fjz0             = _mm_add_ps(fjz0,tz);
1384
1385             }
1386
1387             /**************************
1388              * CALCULATE INTERACTIONS *
1389              **************************/
1390
1391             if (gmx_mm_any_lt(rsq01,rcutoff2))
1392             {
1393
1394             /* REACTION-FIELD ELECTROSTATICS */
1395             felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
1396
1397             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
1398
1399             fscal            = felec;
1400
1401             fscal            = _mm_and_ps(fscal,cutoff_mask);
1402
1403             /* Calculate temporary vectorial force */
1404             tx               = _mm_mul_ps(fscal,dx01);
1405             ty               = _mm_mul_ps(fscal,dy01);
1406             tz               = _mm_mul_ps(fscal,dz01);
1407
1408             /* Update vectorial force */
1409             fix0             = _mm_add_ps(fix0,tx);
1410             fiy0             = _mm_add_ps(fiy0,ty);
1411             fiz0             = _mm_add_ps(fiz0,tz);
1412
1413             fjx1             = _mm_add_ps(fjx1,tx);
1414             fjy1             = _mm_add_ps(fjy1,ty);
1415             fjz1             = _mm_add_ps(fjz1,tz);
1416
1417             }
1418
1419             /**************************
1420              * CALCULATE INTERACTIONS *
1421              **************************/
1422
1423             if (gmx_mm_any_lt(rsq02,rcutoff2))
1424             {
1425
1426             /* REACTION-FIELD ELECTROSTATICS */
1427             felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
1428
1429             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
1430
1431             fscal            = felec;
1432
1433             fscal            = _mm_and_ps(fscal,cutoff_mask);
1434
1435             /* Calculate temporary vectorial force */
1436             tx               = _mm_mul_ps(fscal,dx02);
1437             ty               = _mm_mul_ps(fscal,dy02);
1438             tz               = _mm_mul_ps(fscal,dz02);
1439
1440             /* Update vectorial force */
1441             fix0             = _mm_add_ps(fix0,tx);
1442             fiy0             = _mm_add_ps(fiy0,ty);
1443             fiz0             = _mm_add_ps(fiz0,tz);
1444
1445             fjx2             = _mm_add_ps(fjx2,tx);
1446             fjy2             = _mm_add_ps(fjy2,ty);
1447             fjz2             = _mm_add_ps(fjz2,tz);
1448
1449             }
1450
1451             /**************************
1452              * CALCULATE INTERACTIONS *
1453              **************************/
1454
1455             if (gmx_mm_any_lt(rsq10,rcutoff2))
1456             {
1457
1458             /* REACTION-FIELD ELECTROSTATICS */
1459             felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
1460
1461             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1462
1463             fscal            = felec;
1464
1465             fscal            = _mm_and_ps(fscal,cutoff_mask);
1466
1467             /* Calculate temporary vectorial force */
1468             tx               = _mm_mul_ps(fscal,dx10);
1469             ty               = _mm_mul_ps(fscal,dy10);
1470             tz               = _mm_mul_ps(fscal,dz10);
1471
1472             /* Update vectorial force */
1473             fix1             = _mm_add_ps(fix1,tx);
1474             fiy1             = _mm_add_ps(fiy1,ty);
1475             fiz1             = _mm_add_ps(fiz1,tz);
1476
1477             fjx0             = _mm_add_ps(fjx0,tx);
1478             fjy0             = _mm_add_ps(fjy0,ty);
1479             fjz0             = _mm_add_ps(fjz0,tz);
1480
1481             }
1482
1483             /**************************
1484              * CALCULATE INTERACTIONS *
1485              **************************/
1486
1487             if (gmx_mm_any_lt(rsq11,rcutoff2))
1488             {
1489
1490             /* REACTION-FIELD ELECTROSTATICS */
1491             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1492
1493             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1494
1495             fscal            = felec;
1496
1497             fscal            = _mm_and_ps(fscal,cutoff_mask);
1498
1499             /* Calculate temporary vectorial force */
1500             tx               = _mm_mul_ps(fscal,dx11);
1501             ty               = _mm_mul_ps(fscal,dy11);
1502             tz               = _mm_mul_ps(fscal,dz11);
1503
1504             /* Update vectorial force */
1505             fix1             = _mm_add_ps(fix1,tx);
1506             fiy1             = _mm_add_ps(fiy1,ty);
1507             fiz1             = _mm_add_ps(fiz1,tz);
1508
1509             fjx1             = _mm_add_ps(fjx1,tx);
1510             fjy1             = _mm_add_ps(fjy1,ty);
1511             fjz1             = _mm_add_ps(fjz1,tz);
1512
1513             }
1514
1515             /**************************
1516              * CALCULATE INTERACTIONS *
1517              **************************/
1518
1519             if (gmx_mm_any_lt(rsq12,rcutoff2))
1520             {
1521
1522             /* REACTION-FIELD ELECTROSTATICS */
1523             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1524
1525             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1526
1527             fscal            = felec;
1528
1529             fscal            = _mm_and_ps(fscal,cutoff_mask);
1530
1531             /* Calculate temporary vectorial force */
1532             tx               = _mm_mul_ps(fscal,dx12);
1533             ty               = _mm_mul_ps(fscal,dy12);
1534             tz               = _mm_mul_ps(fscal,dz12);
1535
1536             /* Update vectorial force */
1537             fix1             = _mm_add_ps(fix1,tx);
1538             fiy1             = _mm_add_ps(fiy1,ty);
1539             fiz1             = _mm_add_ps(fiz1,tz);
1540
1541             fjx2             = _mm_add_ps(fjx2,tx);
1542             fjy2             = _mm_add_ps(fjy2,ty);
1543             fjz2             = _mm_add_ps(fjz2,tz);
1544
1545             }
1546
1547             /**************************
1548              * CALCULATE INTERACTIONS *
1549              **************************/
1550
1551             if (gmx_mm_any_lt(rsq20,rcutoff2))
1552             {
1553
1554             /* REACTION-FIELD ELECTROSTATICS */
1555             felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1556
1557             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1558
1559             fscal            = felec;
1560
1561             fscal            = _mm_and_ps(fscal,cutoff_mask);
1562
1563             /* Calculate temporary vectorial force */
1564             tx               = _mm_mul_ps(fscal,dx20);
1565             ty               = _mm_mul_ps(fscal,dy20);
1566             tz               = _mm_mul_ps(fscal,dz20);
1567
1568             /* Update vectorial force */
1569             fix2             = _mm_add_ps(fix2,tx);
1570             fiy2             = _mm_add_ps(fiy2,ty);
1571             fiz2             = _mm_add_ps(fiz2,tz);
1572
1573             fjx0             = _mm_add_ps(fjx0,tx);
1574             fjy0             = _mm_add_ps(fjy0,ty);
1575             fjz0             = _mm_add_ps(fjz0,tz);
1576
1577             }
1578
1579             /**************************
1580              * CALCULATE INTERACTIONS *
1581              **************************/
1582
1583             if (gmx_mm_any_lt(rsq21,rcutoff2))
1584             {
1585
1586             /* REACTION-FIELD ELECTROSTATICS */
1587             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1588
1589             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1590
1591             fscal            = felec;
1592
1593             fscal            = _mm_and_ps(fscal,cutoff_mask);
1594
1595             /* Calculate temporary vectorial force */
1596             tx               = _mm_mul_ps(fscal,dx21);
1597             ty               = _mm_mul_ps(fscal,dy21);
1598             tz               = _mm_mul_ps(fscal,dz21);
1599
1600             /* Update vectorial force */
1601             fix2             = _mm_add_ps(fix2,tx);
1602             fiy2             = _mm_add_ps(fiy2,ty);
1603             fiz2             = _mm_add_ps(fiz2,tz);
1604
1605             fjx1             = _mm_add_ps(fjx1,tx);
1606             fjy1             = _mm_add_ps(fjy1,ty);
1607             fjz1             = _mm_add_ps(fjz1,tz);
1608
1609             }
1610
1611             /**************************
1612              * CALCULATE INTERACTIONS *
1613              **************************/
1614
1615             if (gmx_mm_any_lt(rsq22,rcutoff2))
1616             {
1617
1618             /* REACTION-FIELD ELECTROSTATICS */
1619             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1620
1621             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1622
1623             fscal            = felec;
1624
1625             fscal            = _mm_and_ps(fscal,cutoff_mask);
1626
1627             /* Calculate temporary vectorial force */
1628             tx               = _mm_mul_ps(fscal,dx22);
1629             ty               = _mm_mul_ps(fscal,dy22);
1630             tz               = _mm_mul_ps(fscal,dz22);
1631
1632             /* Update vectorial force */
1633             fix2             = _mm_add_ps(fix2,tx);
1634             fiy2             = _mm_add_ps(fiy2,ty);
1635             fiz2             = _mm_add_ps(fiz2,tz);
1636
1637             fjx2             = _mm_add_ps(fjx2,tx);
1638             fjy2             = _mm_add_ps(fjy2,ty);
1639             fjz2             = _mm_add_ps(fjz2,tz);
1640
1641             }
1642
1643             fjptrA             = f+j_coord_offsetA;
1644             fjptrB             = f+j_coord_offsetB;
1645             fjptrC             = f+j_coord_offsetC;
1646             fjptrD             = f+j_coord_offsetD;
1647
1648             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1649                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1650
1651             /* Inner loop uses 270 flops */
1652         }
1653
1654         if(jidx<j_index_end)
1655         {
1656
1657             /* Get j neighbor index, and coordinate index */
1658             jnrlistA         = jjnr[jidx];
1659             jnrlistB         = jjnr[jidx+1];
1660             jnrlistC         = jjnr[jidx+2];
1661             jnrlistD         = jjnr[jidx+3];
1662             /* Sign of each element will be negative for non-real atoms.
1663              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1664              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1665              */
1666             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1667             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1668             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1669             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1670             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1671             j_coord_offsetA  = DIM*jnrA;
1672             j_coord_offsetB  = DIM*jnrB;
1673             j_coord_offsetC  = DIM*jnrC;
1674             j_coord_offsetD  = DIM*jnrD;
1675
1676             /* load j atom coordinates */
1677             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1678                                               x+j_coord_offsetC,x+j_coord_offsetD,
1679                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1680
1681             /* Calculate displacement vector */
1682             dx00             = _mm_sub_ps(ix0,jx0);
1683             dy00             = _mm_sub_ps(iy0,jy0);
1684             dz00             = _mm_sub_ps(iz0,jz0);
1685             dx01             = _mm_sub_ps(ix0,jx1);
1686             dy01             = _mm_sub_ps(iy0,jy1);
1687             dz01             = _mm_sub_ps(iz0,jz1);
1688             dx02             = _mm_sub_ps(ix0,jx2);
1689             dy02             = _mm_sub_ps(iy0,jy2);
1690             dz02             = _mm_sub_ps(iz0,jz2);
1691             dx10             = _mm_sub_ps(ix1,jx0);
1692             dy10             = _mm_sub_ps(iy1,jy0);
1693             dz10             = _mm_sub_ps(iz1,jz0);
1694             dx11             = _mm_sub_ps(ix1,jx1);
1695             dy11             = _mm_sub_ps(iy1,jy1);
1696             dz11             = _mm_sub_ps(iz1,jz1);
1697             dx12             = _mm_sub_ps(ix1,jx2);
1698             dy12             = _mm_sub_ps(iy1,jy2);
1699             dz12             = _mm_sub_ps(iz1,jz2);
1700             dx20             = _mm_sub_ps(ix2,jx0);
1701             dy20             = _mm_sub_ps(iy2,jy0);
1702             dz20             = _mm_sub_ps(iz2,jz0);
1703             dx21             = _mm_sub_ps(ix2,jx1);
1704             dy21             = _mm_sub_ps(iy2,jy1);
1705             dz21             = _mm_sub_ps(iz2,jz1);
1706             dx22             = _mm_sub_ps(ix2,jx2);
1707             dy22             = _mm_sub_ps(iy2,jy2);
1708             dz22             = _mm_sub_ps(iz2,jz2);
1709
1710             /* Calculate squared distance and things based on it */
1711             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1712             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1713             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1714             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1715             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1716             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1717             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1718             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1719             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1720
1721             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1722             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1723             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1724             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1725             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1726             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1727             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1728             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1729             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1730
1731             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1732             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1733             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1734             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1735             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1736             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1737             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1738             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1739             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1740
1741             fjx0             = _mm_setzero_ps();
1742             fjy0             = _mm_setzero_ps();
1743             fjz0             = _mm_setzero_ps();
1744             fjx1             = _mm_setzero_ps();
1745             fjy1             = _mm_setzero_ps();
1746             fjz1             = _mm_setzero_ps();
1747             fjx2             = _mm_setzero_ps();
1748             fjy2             = _mm_setzero_ps();
1749             fjz2             = _mm_setzero_ps();
1750
1751             /**************************
1752              * CALCULATE INTERACTIONS *
1753              **************************/
1754
1755             if (gmx_mm_any_lt(rsq00,rcutoff2))
1756             {
1757
1758             /* REACTION-FIELD ELECTROSTATICS */
1759             felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
1760
1761             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1762
1763             fscal            = felec;
1764
1765             fscal            = _mm_and_ps(fscal,cutoff_mask);
1766
1767             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1768
1769             /* Calculate temporary vectorial force */
1770             tx               = _mm_mul_ps(fscal,dx00);
1771             ty               = _mm_mul_ps(fscal,dy00);
1772             tz               = _mm_mul_ps(fscal,dz00);
1773
1774             /* Update vectorial force */
1775             fix0             = _mm_add_ps(fix0,tx);
1776             fiy0             = _mm_add_ps(fiy0,ty);
1777             fiz0             = _mm_add_ps(fiz0,tz);
1778
1779             fjx0             = _mm_add_ps(fjx0,tx);
1780             fjy0             = _mm_add_ps(fjy0,ty);
1781             fjz0             = _mm_add_ps(fjz0,tz);
1782
1783             }
1784
1785             /**************************
1786              * CALCULATE INTERACTIONS *
1787              **************************/
1788
1789             if (gmx_mm_any_lt(rsq01,rcutoff2))
1790             {
1791
1792             /* REACTION-FIELD ELECTROSTATICS */
1793             felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
1794
1795             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
1796
1797             fscal            = felec;
1798
1799             fscal            = _mm_and_ps(fscal,cutoff_mask);
1800
1801             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1802
1803             /* Calculate temporary vectorial force */
1804             tx               = _mm_mul_ps(fscal,dx01);
1805             ty               = _mm_mul_ps(fscal,dy01);
1806             tz               = _mm_mul_ps(fscal,dz01);
1807
1808             /* Update vectorial force */
1809             fix0             = _mm_add_ps(fix0,tx);
1810             fiy0             = _mm_add_ps(fiy0,ty);
1811             fiz0             = _mm_add_ps(fiz0,tz);
1812
1813             fjx1             = _mm_add_ps(fjx1,tx);
1814             fjy1             = _mm_add_ps(fjy1,ty);
1815             fjz1             = _mm_add_ps(fjz1,tz);
1816
1817             }
1818
1819             /**************************
1820              * CALCULATE INTERACTIONS *
1821              **************************/
1822
1823             if (gmx_mm_any_lt(rsq02,rcutoff2))
1824             {
1825
1826             /* REACTION-FIELD ELECTROSTATICS */
1827             felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
1828
1829             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
1830
1831             fscal            = felec;
1832
1833             fscal            = _mm_and_ps(fscal,cutoff_mask);
1834
1835             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1836
1837             /* Calculate temporary vectorial force */
1838             tx               = _mm_mul_ps(fscal,dx02);
1839             ty               = _mm_mul_ps(fscal,dy02);
1840             tz               = _mm_mul_ps(fscal,dz02);
1841
1842             /* Update vectorial force */
1843             fix0             = _mm_add_ps(fix0,tx);
1844             fiy0             = _mm_add_ps(fiy0,ty);
1845             fiz0             = _mm_add_ps(fiz0,tz);
1846
1847             fjx2             = _mm_add_ps(fjx2,tx);
1848             fjy2             = _mm_add_ps(fjy2,ty);
1849             fjz2             = _mm_add_ps(fjz2,tz);
1850
1851             }
1852
1853             /**************************
1854              * CALCULATE INTERACTIONS *
1855              **************************/
1856
1857             if (gmx_mm_any_lt(rsq10,rcutoff2))
1858             {
1859
1860             /* REACTION-FIELD ELECTROSTATICS */
1861             felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
1862
1863             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1864
1865             fscal            = felec;
1866
1867             fscal            = _mm_and_ps(fscal,cutoff_mask);
1868
1869             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1870
1871             /* Calculate temporary vectorial force */
1872             tx               = _mm_mul_ps(fscal,dx10);
1873             ty               = _mm_mul_ps(fscal,dy10);
1874             tz               = _mm_mul_ps(fscal,dz10);
1875
1876             /* Update vectorial force */
1877             fix1             = _mm_add_ps(fix1,tx);
1878             fiy1             = _mm_add_ps(fiy1,ty);
1879             fiz1             = _mm_add_ps(fiz1,tz);
1880
1881             fjx0             = _mm_add_ps(fjx0,tx);
1882             fjy0             = _mm_add_ps(fjy0,ty);
1883             fjz0             = _mm_add_ps(fjz0,tz);
1884
1885             }
1886
1887             /**************************
1888              * CALCULATE INTERACTIONS *
1889              **************************/
1890
1891             if (gmx_mm_any_lt(rsq11,rcutoff2))
1892             {
1893
1894             /* REACTION-FIELD ELECTROSTATICS */
1895             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1896
1897             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1898
1899             fscal            = felec;
1900
1901             fscal            = _mm_and_ps(fscal,cutoff_mask);
1902
1903             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1904
1905             /* Calculate temporary vectorial force */
1906             tx               = _mm_mul_ps(fscal,dx11);
1907             ty               = _mm_mul_ps(fscal,dy11);
1908             tz               = _mm_mul_ps(fscal,dz11);
1909
1910             /* Update vectorial force */
1911             fix1             = _mm_add_ps(fix1,tx);
1912             fiy1             = _mm_add_ps(fiy1,ty);
1913             fiz1             = _mm_add_ps(fiz1,tz);
1914
1915             fjx1             = _mm_add_ps(fjx1,tx);
1916             fjy1             = _mm_add_ps(fjy1,ty);
1917             fjz1             = _mm_add_ps(fjz1,tz);
1918
1919             }
1920
1921             /**************************
1922              * CALCULATE INTERACTIONS *
1923              **************************/
1924
1925             if (gmx_mm_any_lt(rsq12,rcutoff2))
1926             {
1927
1928             /* REACTION-FIELD ELECTROSTATICS */
1929             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1930
1931             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1932
1933             fscal            = felec;
1934
1935             fscal            = _mm_and_ps(fscal,cutoff_mask);
1936
1937             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1938
1939             /* Calculate temporary vectorial force */
1940             tx               = _mm_mul_ps(fscal,dx12);
1941             ty               = _mm_mul_ps(fscal,dy12);
1942             tz               = _mm_mul_ps(fscal,dz12);
1943
1944             /* Update vectorial force */
1945             fix1             = _mm_add_ps(fix1,tx);
1946             fiy1             = _mm_add_ps(fiy1,ty);
1947             fiz1             = _mm_add_ps(fiz1,tz);
1948
1949             fjx2             = _mm_add_ps(fjx2,tx);
1950             fjy2             = _mm_add_ps(fjy2,ty);
1951             fjz2             = _mm_add_ps(fjz2,tz);
1952
1953             }
1954
1955             /**************************
1956              * CALCULATE INTERACTIONS *
1957              **************************/
1958
1959             if (gmx_mm_any_lt(rsq20,rcutoff2))
1960             {
1961
1962             /* REACTION-FIELD ELECTROSTATICS */
1963             felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1964
1965             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1966
1967             fscal            = felec;
1968
1969             fscal            = _mm_and_ps(fscal,cutoff_mask);
1970
1971             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1972
1973             /* Calculate temporary vectorial force */
1974             tx               = _mm_mul_ps(fscal,dx20);
1975             ty               = _mm_mul_ps(fscal,dy20);
1976             tz               = _mm_mul_ps(fscal,dz20);
1977
1978             /* Update vectorial force */
1979             fix2             = _mm_add_ps(fix2,tx);
1980             fiy2             = _mm_add_ps(fiy2,ty);
1981             fiz2             = _mm_add_ps(fiz2,tz);
1982
1983             fjx0             = _mm_add_ps(fjx0,tx);
1984             fjy0             = _mm_add_ps(fjy0,ty);
1985             fjz0             = _mm_add_ps(fjz0,tz);
1986
1987             }
1988
1989             /**************************
1990              * CALCULATE INTERACTIONS *
1991              **************************/
1992
1993             if (gmx_mm_any_lt(rsq21,rcutoff2))
1994             {
1995
1996             /* REACTION-FIELD ELECTROSTATICS */
1997             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1998
1999             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2000
2001             fscal            = felec;
2002
2003             fscal            = _mm_and_ps(fscal,cutoff_mask);
2004
2005             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2006
2007             /* Calculate temporary vectorial force */
2008             tx               = _mm_mul_ps(fscal,dx21);
2009             ty               = _mm_mul_ps(fscal,dy21);
2010             tz               = _mm_mul_ps(fscal,dz21);
2011
2012             /* Update vectorial force */
2013             fix2             = _mm_add_ps(fix2,tx);
2014             fiy2             = _mm_add_ps(fiy2,ty);
2015             fiz2             = _mm_add_ps(fiz2,tz);
2016
2017             fjx1             = _mm_add_ps(fjx1,tx);
2018             fjy1             = _mm_add_ps(fjy1,ty);
2019             fjz1             = _mm_add_ps(fjz1,tz);
2020
2021             }
2022
2023             /**************************
2024              * CALCULATE INTERACTIONS *
2025              **************************/
2026
2027             if (gmx_mm_any_lt(rsq22,rcutoff2))
2028             {
2029
2030             /* REACTION-FIELD ELECTROSTATICS */
2031             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
2032
2033             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2034
2035             fscal            = felec;
2036
2037             fscal            = _mm_and_ps(fscal,cutoff_mask);
2038
2039             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2040
2041             /* Calculate temporary vectorial force */
2042             tx               = _mm_mul_ps(fscal,dx22);
2043             ty               = _mm_mul_ps(fscal,dy22);
2044             tz               = _mm_mul_ps(fscal,dz22);
2045
2046             /* Update vectorial force */
2047             fix2             = _mm_add_ps(fix2,tx);
2048             fiy2             = _mm_add_ps(fiy2,ty);
2049             fiz2             = _mm_add_ps(fiz2,tz);
2050
2051             fjx2             = _mm_add_ps(fjx2,tx);
2052             fjy2             = _mm_add_ps(fjy2,ty);
2053             fjz2             = _mm_add_ps(fjz2,tz);
2054
2055             }
2056
2057             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2058             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2059             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2060             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2061
2062             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2063                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2064
2065             /* Inner loop uses 270 flops */
2066         }
2067
2068         /* End of innermost loop */
2069
2070         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2071                                               f+i_coord_offset,fshift+i_shift_offset);
2072
2073         /* Increment number of inner iterations */
2074         inneriter                  += j_index_end - j_index_start;
2075
2076         /* Outer loop uses 18 flops */
2077     }
2078
2079     /* Increment number of outer iterations */
2080     outeriter        += nri;
2081
2082     /* Update outer/inner flops */
2083
2084     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*270);
2085 }