Merge release-4-6 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse4_1_single / nb_kernel_ElecRF_VdwLJ_GeomW3W3_sse4_1_single.c
1 /*
2  * Note: this file was generated by the Gromacs sse4_1_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_sse4_1_single.h"
34 #include "kernelutil_x86_sse4_1_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse4_1_single
38  * Electrostatics interaction: ReactionField
39  * VdW interaction:            LennardJones
40  * Geometry:                   Water3-Water3
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse4_1_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
64     real             rcutoff_scalar;
65     real             *shiftvec,*fshift,*x,*f;
66     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
67     real             scratch[4*DIM];
68     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
69     int              vdwioffset0;
70     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
71     int              vdwioffset1;
72     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
73     int              vdwioffset2;
74     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
75     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
76     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
77     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
78     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
79     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
80     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
81     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
82     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
83     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
84     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
85     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
86     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
87     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
88     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
89     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
90     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
91     real             *charge;
92     int              nvdwtype;
93     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
94     int              *vdwtype;
95     real             *vdwparam;
96     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
97     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
98     __m128           dummy_mask,cutoff_mask;
99     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
100     __m128           one     = _mm_set1_ps(1.0);
101     __m128           two     = _mm_set1_ps(2.0);
102     x                = xx[0];
103     f                = ff[0];
104
105     nri              = nlist->nri;
106     iinr             = nlist->iinr;
107     jindex           = nlist->jindex;
108     jjnr             = nlist->jjnr;
109     shiftidx         = nlist->shift;
110     gid              = nlist->gid;
111     shiftvec         = fr->shift_vec[0];
112     fshift           = fr->fshift[0];
113     facel            = _mm_set1_ps(fr->epsfac);
114     charge           = mdatoms->chargeA;
115     krf              = _mm_set1_ps(fr->ic->k_rf);
116     krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
117     crf              = _mm_set1_ps(fr->ic->c_rf);
118     nvdwtype         = fr->ntype;
119     vdwparam         = fr->nbfp;
120     vdwtype          = mdatoms->typeA;
121
122     /* Setup water-specific parameters */
123     inr              = nlist->iinr[0];
124     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
125     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
126     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
127     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
128
129     jq0              = _mm_set1_ps(charge[inr+0]);
130     jq1              = _mm_set1_ps(charge[inr+1]);
131     jq2              = _mm_set1_ps(charge[inr+2]);
132     vdwjidx0A        = 2*vdwtype[inr+0];
133     qq00             = _mm_mul_ps(iq0,jq0);
134     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
135     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
136     qq01             = _mm_mul_ps(iq0,jq1);
137     qq02             = _mm_mul_ps(iq0,jq2);
138     qq10             = _mm_mul_ps(iq1,jq0);
139     qq11             = _mm_mul_ps(iq1,jq1);
140     qq12             = _mm_mul_ps(iq1,jq2);
141     qq20             = _mm_mul_ps(iq2,jq0);
142     qq21             = _mm_mul_ps(iq2,jq1);
143     qq22             = _mm_mul_ps(iq2,jq2);
144
145     /* Avoid stupid compiler warnings */
146     jnrA = jnrB = jnrC = jnrD = 0;
147     j_coord_offsetA = 0;
148     j_coord_offsetB = 0;
149     j_coord_offsetC = 0;
150     j_coord_offsetD = 0;
151
152     outeriter        = 0;
153     inneriter        = 0;
154
155     for(iidx=0;iidx<4*DIM;iidx++)
156     {
157         scratch[iidx] = 0.0;
158     }
159
160     /* Start outer loop over neighborlists */
161     for(iidx=0; iidx<nri; iidx++)
162     {
163         /* Load shift vector for this list */
164         i_shift_offset   = DIM*shiftidx[iidx];
165
166         /* Load limits for loop over neighbors */
167         j_index_start    = jindex[iidx];
168         j_index_end      = jindex[iidx+1];
169
170         /* Get outer coordinate index */
171         inr              = iinr[iidx];
172         i_coord_offset   = DIM*inr;
173
174         /* Load i particle coords and add shift vector */
175         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
176                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
177
178         fix0             = _mm_setzero_ps();
179         fiy0             = _mm_setzero_ps();
180         fiz0             = _mm_setzero_ps();
181         fix1             = _mm_setzero_ps();
182         fiy1             = _mm_setzero_ps();
183         fiz1             = _mm_setzero_ps();
184         fix2             = _mm_setzero_ps();
185         fiy2             = _mm_setzero_ps();
186         fiz2             = _mm_setzero_ps();
187
188         /* Reset potential sums */
189         velecsum         = _mm_setzero_ps();
190         vvdwsum          = _mm_setzero_ps();
191
192         /* Start inner kernel loop */
193         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
194         {
195
196             /* Get j neighbor index, and coordinate index */
197             jnrA             = jjnr[jidx];
198             jnrB             = jjnr[jidx+1];
199             jnrC             = jjnr[jidx+2];
200             jnrD             = jjnr[jidx+3];
201             j_coord_offsetA  = DIM*jnrA;
202             j_coord_offsetB  = DIM*jnrB;
203             j_coord_offsetC  = DIM*jnrC;
204             j_coord_offsetD  = DIM*jnrD;
205
206             /* load j atom coordinates */
207             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
208                                               x+j_coord_offsetC,x+j_coord_offsetD,
209                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
210
211             /* Calculate displacement vector */
212             dx00             = _mm_sub_ps(ix0,jx0);
213             dy00             = _mm_sub_ps(iy0,jy0);
214             dz00             = _mm_sub_ps(iz0,jz0);
215             dx01             = _mm_sub_ps(ix0,jx1);
216             dy01             = _mm_sub_ps(iy0,jy1);
217             dz01             = _mm_sub_ps(iz0,jz1);
218             dx02             = _mm_sub_ps(ix0,jx2);
219             dy02             = _mm_sub_ps(iy0,jy2);
220             dz02             = _mm_sub_ps(iz0,jz2);
221             dx10             = _mm_sub_ps(ix1,jx0);
222             dy10             = _mm_sub_ps(iy1,jy0);
223             dz10             = _mm_sub_ps(iz1,jz0);
224             dx11             = _mm_sub_ps(ix1,jx1);
225             dy11             = _mm_sub_ps(iy1,jy1);
226             dz11             = _mm_sub_ps(iz1,jz1);
227             dx12             = _mm_sub_ps(ix1,jx2);
228             dy12             = _mm_sub_ps(iy1,jy2);
229             dz12             = _mm_sub_ps(iz1,jz2);
230             dx20             = _mm_sub_ps(ix2,jx0);
231             dy20             = _mm_sub_ps(iy2,jy0);
232             dz20             = _mm_sub_ps(iz2,jz0);
233             dx21             = _mm_sub_ps(ix2,jx1);
234             dy21             = _mm_sub_ps(iy2,jy1);
235             dz21             = _mm_sub_ps(iz2,jz1);
236             dx22             = _mm_sub_ps(ix2,jx2);
237             dy22             = _mm_sub_ps(iy2,jy2);
238             dz22             = _mm_sub_ps(iz2,jz2);
239
240             /* Calculate squared distance and things based on it */
241             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
242             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
243             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
244             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
245             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
246             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
247             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
248             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
249             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
250
251             rinv00           = gmx_mm_invsqrt_ps(rsq00);
252             rinv01           = gmx_mm_invsqrt_ps(rsq01);
253             rinv02           = gmx_mm_invsqrt_ps(rsq02);
254             rinv10           = gmx_mm_invsqrt_ps(rsq10);
255             rinv11           = gmx_mm_invsqrt_ps(rsq11);
256             rinv12           = gmx_mm_invsqrt_ps(rsq12);
257             rinv20           = gmx_mm_invsqrt_ps(rsq20);
258             rinv21           = gmx_mm_invsqrt_ps(rsq21);
259             rinv22           = gmx_mm_invsqrt_ps(rsq22);
260
261             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
262             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
263             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
264             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
265             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
266             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
267             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
268             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
269             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
270
271             fjx0             = _mm_setzero_ps();
272             fjy0             = _mm_setzero_ps();
273             fjz0             = _mm_setzero_ps();
274             fjx1             = _mm_setzero_ps();
275             fjy1             = _mm_setzero_ps();
276             fjz1             = _mm_setzero_ps();
277             fjx2             = _mm_setzero_ps();
278             fjy2             = _mm_setzero_ps();
279             fjz2             = _mm_setzero_ps();
280
281             /**************************
282              * CALCULATE INTERACTIONS *
283              **************************/
284
285             /* REACTION-FIELD ELECTROSTATICS */
286             velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
287             felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
288
289             /* LENNARD-JONES DISPERSION/REPULSION */
290
291             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
292             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
293             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
294             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
295             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
296
297             /* Update potential sum for this i atom from the interaction with this j atom. */
298             velecsum         = _mm_add_ps(velecsum,velec);
299             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
300
301             fscal            = _mm_add_ps(felec,fvdw);
302
303             /* Calculate temporary vectorial force */
304             tx               = _mm_mul_ps(fscal,dx00);
305             ty               = _mm_mul_ps(fscal,dy00);
306             tz               = _mm_mul_ps(fscal,dz00);
307
308             /* Update vectorial force */
309             fix0             = _mm_add_ps(fix0,tx);
310             fiy0             = _mm_add_ps(fiy0,ty);
311             fiz0             = _mm_add_ps(fiz0,tz);
312
313             fjx0             = _mm_add_ps(fjx0,tx);
314             fjy0             = _mm_add_ps(fjy0,ty);
315             fjz0             = _mm_add_ps(fjz0,tz);
316
317             /**************************
318              * CALCULATE INTERACTIONS *
319              **************************/
320
321             /* REACTION-FIELD ELECTROSTATICS */
322             velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
323             felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
324
325             /* Update potential sum for this i atom from the interaction with this j atom. */
326             velecsum         = _mm_add_ps(velecsum,velec);
327
328             fscal            = felec;
329
330             /* Calculate temporary vectorial force */
331             tx               = _mm_mul_ps(fscal,dx01);
332             ty               = _mm_mul_ps(fscal,dy01);
333             tz               = _mm_mul_ps(fscal,dz01);
334
335             /* Update vectorial force */
336             fix0             = _mm_add_ps(fix0,tx);
337             fiy0             = _mm_add_ps(fiy0,ty);
338             fiz0             = _mm_add_ps(fiz0,tz);
339
340             fjx1             = _mm_add_ps(fjx1,tx);
341             fjy1             = _mm_add_ps(fjy1,ty);
342             fjz1             = _mm_add_ps(fjz1,tz);
343
344             /**************************
345              * CALCULATE INTERACTIONS *
346              **************************/
347
348             /* REACTION-FIELD ELECTROSTATICS */
349             velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
350             felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
351
352             /* Update potential sum for this i atom from the interaction with this j atom. */
353             velecsum         = _mm_add_ps(velecsum,velec);
354
355             fscal            = felec;
356
357             /* Calculate temporary vectorial force */
358             tx               = _mm_mul_ps(fscal,dx02);
359             ty               = _mm_mul_ps(fscal,dy02);
360             tz               = _mm_mul_ps(fscal,dz02);
361
362             /* Update vectorial force */
363             fix0             = _mm_add_ps(fix0,tx);
364             fiy0             = _mm_add_ps(fiy0,ty);
365             fiz0             = _mm_add_ps(fiz0,tz);
366
367             fjx2             = _mm_add_ps(fjx2,tx);
368             fjy2             = _mm_add_ps(fjy2,ty);
369             fjz2             = _mm_add_ps(fjz2,tz);
370
371             /**************************
372              * CALCULATE INTERACTIONS *
373              **************************/
374
375             /* REACTION-FIELD ELECTROSTATICS */
376             velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
377             felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
378
379             /* Update potential sum for this i atom from the interaction with this j atom. */
380             velecsum         = _mm_add_ps(velecsum,velec);
381
382             fscal            = felec;
383
384             /* Calculate temporary vectorial force */
385             tx               = _mm_mul_ps(fscal,dx10);
386             ty               = _mm_mul_ps(fscal,dy10);
387             tz               = _mm_mul_ps(fscal,dz10);
388
389             /* Update vectorial force */
390             fix1             = _mm_add_ps(fix1,tx);
391             fiy1             = _mm_add_ps(fiy1,ty);
392             fiz1             = _mm_add_ps(fiz1,tz);
393
394             fjx0             = _mm_add_ps(fjx0,tx);
395             fjy0             = _mm_add_ps(fjy0,ty);
396             fjz0             = _mm_add_ps(fjz0,tz);
397
398             /**************************
399              * CALCULATE INTERACTIONS *
400              **************************/
401
402             /* REACTION-FIELD ELECTROSTATICS */
403             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
404             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
405
406             /* Update potential sum for this i atom from the interaction with this j atom. */
407             velecsum         = _mm_add_ps(velecsum,velec);
408
409             fscal            = felec;
410
411             /* Calculate temporary vectorial force */
412             tx               = _mm_mul_ps(fscal,dx11);
413             ty               = _mm_mul_ps(fscal,dy11);
414             tz               = _mm_mul_ps(fscal,dz11);
415
416             /* Update vectorial force */
417             fix1             = _mm_add_ps(fix1,tx);
418             fiy1             = _mm_add_ps(fiy1,ty);
419             fiz1             = _mm_add_ps(fiz1,tz);
420
421             fjx1             = _mm_add_ps(fjx1,tx);
422             fjy1             = _mm_add_ps(fjy1,ty);
423             fjz1             = _mm_add_ps(fjz1,tz);
424
425             /**************************
426              * CALCULATE INTERACTIONS *
427              **************************/
428
429             /* REACTION-FIELD ELECTROSTATICS */
430             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
431             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
432
433             /* Update potential sum for this i atom from the interaction with this j atom. */
434             velecsum         = _mm_add_ps(velecsum,velec);
435
436             fscal            = felec;
437
438             /* Calculate temporary vectorial force */
439             tx               = _mm_mul_ps(fscal,dx12);
440             ty               = _mm_mul_ps(fscal,dy12);
441             tz               = _mm_mul_ps(fscal,dz12);
442
443             /* Update vectorial force */
444             fix1             = _mm_add_ps(fix1,tx);
445             fiy1             = _mm_add_ps(fiy1,ty);
446             fiz1             = _mm_add_ps(fiz1,tz);
447
448             fjx2             = _mm_add_ps(fjx2,tx);
449             fjy2             = _mm_add_ps(fjy2,ty);
450             fjz2             = _mm_add_ps(fjz2,tz);
451
452             /**************************
453              * CALCULATE INTERACTIONS *
454              **************************/
455
456             /* REACTION-FIELD ELECTROSTATICS */
457             velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
458             felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
459
460             /* Update potential sum for this i atom from the interaction with this j atom. */
461             velecsum         = _mm_add_ps(velecsum,velec);
462
463             fscal            = felec;
464
465             /* Calculate temporary vectorial force */
466             tx               = _mm_mul_ps(fscal,dx20);
467             ty               = _mm_mul_ps(fscal,dy20);
468             tz               = _mm_mul_ps(fscal,dz20);
469
470             /* Update vectorial force */
471             fix2             = _mm_add_ps(fix2,tx);
472             fiy2             = _mm_add_ps(fiy2,ty);
473             fiz2             = _mm_add_ps(fiz2,tz);
474
475             fjx0             = _mm_add_ps(fjx0,tx);
476             fjy0             = _mm_add_ps(fjy0,ty);
477             fjz0             = _mm_add_ps(fjz0,tz);
478
479             /**************************
480              * CALCULATE INTERACTIONS *
481              **************************/
482
483             /* REACTION-FIELD ELECTROSTATICS */
484             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
485             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
486
487             /* Update potential sum for this i atom from the interaction with this j atom. */
488             velecsum         = _mm_add_ps(velecsum,velec);
489
490             fscal            = felec;
491
492             /* Calculate temporary vectorial force */
493             tx               = _mm_mul_ps(fscal,dx21);
494             ty               = _mm_mul_ps(fscal,dy21);
495             tz               = _mm_mul_ps(fscal,dz21);
496
497             /* Update vectorial force */
498             fix2             = _mm_add_ps(fix2,tx);
499             fiy2             = _mm_add_ps(fiy2,ty);
500             fiz2             = _mm_add_ps(fiz2,tz);
501
502             fjx1             = _mm_add_ps(fjx1,tx);
503             fjy1             = _mm_add_ps(fjy1,ty);
504             fjz1             = _mm_add_ps(fjz1,tz);
505
506             /**************************
507              * CALCULATE INTERACTIONS *
508              **************************/
509
510             /* REACTION-FIELD ELECTROSTATICS */
511             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
512             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
513
514             /* Update potential sum for this i atom from the interaction with this j atom. */
515             velecsum         = _mm_add_ps(velecsum,velec);
516
517             fscal            = felec;
518
519             /* Calculate temporary vectorial force */
520             tx               = _mm_mul_ps(fscal,dx22);
521             ty               = _mm_mul_ps(fscal,dy22);
522             tz               = _mm_mul_ps(fscal,dz22);
523
524             /* Update vectorial force */
525             fix2             = _mm_add_ps(fix2,tx);
526             fiy2             = _mm_add_ps(fiy2,ty);
527             fiz2             = _mm_add_ps(fiz2,tz);
528
529             fjx2             = _mm_add_ps(fjx2,tx);
530             fjy2             = _mm_add_ps(fjy2,ty);
531             fjz2             = _mm_add_ps(fjz2,tz);
532
533             fjptrA             = f+j_coord_offsetA;
534             fjptrB             = f+j_coord_offsetB;
535             fjptrC             = f+j_coord_offsetC;
536             fjptrD             = f+j_coord_offsetD;
537
538             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
539                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
540
541             /* Inner loop uses 300 flops */
542         }
543
544         if(jidx<j_index_end)
545         {
546
547             /* Get j neighbor index, and coordinate index */
548             jnrlistA         = jjnr[jidx];
549             jnrlistB         = jjnr[jidx+1];
550             jnrlistC         = jjnr[jidx+2];
551             jnrlistD         = jjnr[jidx+3];
552             /* Sign of each element will be negative for non-real atoms.
553              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
554              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
555              */
556             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
557             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
558             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
559             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
560             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
561             j_coord_offsetA  = DIM*jnrA;
562             j_coord_offsetB  = DIM*jnrB;
563             j_coord_offsetC  = DIM*jnrC;
564             j_coord_offsetD  = DIM*jnrD;
565
566             /* load j atom coordinates */
567             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
568                                               x+j_coord_offsetC,x+j_coord_offsetD,
569                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
570
571             /* Calculate displacement vector */
572             dx00             = _mm_sub_ps(ix0,jx0);
573             dy00             = _mm_sub_ps(iy0,jy0);
574             dz00             = _mm_sub_ps(iz0,jz0);
575             dx01             = _mm_sub_ps(ix0,jx1);
576             dy01             = _mm_sub_ps(iy0,jy1);
577             dz01             = _mm_sub_ps(iz0,jz1);
578             dx02             = _mm_sub_ps(ix0,jx2);
579             dy02             = _mm_sub_ps(iy0,jy2);
580             dz02             = _mm_sub_ps(iz0,jz2);
581             dx10             = _mm_sub_ps(ix1,jx0);
582             dy10             = _mm_sub_ps(iy1,jy0);
583             dz10             = _mm_sub_ps(iz1,jz0);
584             dx11             = _mm_sub_ps(ix1,jx1);
585             dy11             = _mm_sub_ps(iy1,jy1);
586             dz11             = _mm_sub_ps(iz1,jz1);
587             dx12             = _mm_sub_ps(ix1,jx2);
588             dy12             = _mm_sub_ps(iy1,jy2);
589             dz12             = _mm_sub_ps(iz1,jz2);
590             dx20             = _mm_sub_ps(ix2,jx0);
591             dy20             = _mm_sub_ps(iy2,jy0);
592             dz20             = _mm_sub_ps(iz2,jz0);
593             dx21             = _mm_sub_ps(ix2,jx1);
594             dy21             = _mm_sub_ps(iy2,jy1);
595             dz21             = _mm_sub_ps(iz2,jz1);
596             dx22             = _mm_sub_ps(ix2,jx2);
597             dy22             = _mm_sub_ps(iy2,jy2);
598             dz22             = _mm_sub_ps(iz2,jz2);
599
600             /* Calculate squared distance and things based on it */
601             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
602             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
603             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
604             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
605             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
606             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
607             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
608             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
609             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
610
611             rinv00           = gmx_mm_invsqrt_ps(rsq00);
612             rinv01           = gmx_mm_invsqrt_ps(rsq01);
613             rinv02           = gmx_mm_invsqrt_ps(rsq02);
614             rinv10           = gmx_mm_invsqrt_ps(rsq10);
615             rinv11           = gmx_mm_invsqrt_ps(rsq11);
616             rinv12           = gmx_mm_invsqrt_ps(rsq12);
617             rinv20           = gmx_mm_invsqrt_ps(rsq20);
618             rinv21           = gmx_mm_invsqrt_ps(rsq21);
619             rinv22           = gmx_mm_invsqrt_ps(rsq22);
620
621             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
622             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
623             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
624             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
625             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
626             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
627             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
628             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
629             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
630
631             fjx0             = _mm_setzero_ps();
632             fjy0             = _mm_setzero_ps();
633             fjz0             = _mm_setzero_ps();
634             fjx1             = _mm_setzero_ps();
635             fjy1             = _mm_setzero_ps();
636             fjz1             = _mm_setzero_ps();
637             fjx2             = _mm_setzero_ps();
638             fjy2             = _mm_setzero_ps();
639             fjz2             = _mm_setzero_ps();
640
641             /**************************
642              * CALCULATE INTERACTIONS *
643              **************************/
644
645             /* REACTION-FIELD ELECTROSTATICS */
646             velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
647             felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
648
649             /* LENNARD-JONES DISPERSION/REPULSION */
650
651             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
652             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
653             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
654             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
655             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
656
657             /* Update potential sum for this i atom from the interaction with this j atom. */
658             velec            = _mm_andnot_ps(dummy_mask,velec);
659             velecsum         = _mm_add_ps(velecsum,velec);
660             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
661             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
662
663             fscal            = _mm_add_ps(felec,fvdw);
664
665             fscal            = _mm_andnot_ps(dummy_mask,fscal);
666
667             /* Calculate temporary vectorial force */
668             tx               = _mm_mul_ps(fscal,dx00);
669             ty               = _mm_mul_ps(fscal,dy00);
670             tz               = _mm_mul_ps(fscal,dz00);
671
672             /* Update vectorial force */
673             fix0             = _mm_add_ps(fix0,tx);
674             fiy0             = _mm_add_ps(fiy0,ty);
675             fiz0             = _mm_add_ps(fiz0,tz);
676
677             fjx0             = _mm_add_ps(fjx0,tx);
678             fjy0             = _mm_add_ps(fjy0,ty);
679             fjz0             = _mm_add_ps(fjz0,tz);
680
681             /**************************
682              * CALCULATE INTERACTIONS *
683              **************************/
684
685             /* REACTION-FIELD ELECTROSTATICS */
686             velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
687             felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
688
689             /* Update potential sum for this i atom from the interaction with this j atom. */
690             velec            = _mm_andnot_ps(dummy_mask,velec);
691             velecsum         = _mm_add_ps(velecsum,velec);
692
693             fscal            = felec;
694
695             fscal            = _mm_andnot_ps(dummy_mask,fscal);
696
697             /* Calculate temporary vectorial force */
698             tx               = _mm_mul_ps(fscal,dx01);
699             ty               = _mm_mul_ps(fscal,dy01);
700             tz               = _mm_mul_ps(fscal,dz01);
701
702             /* Update vectorial force */
703             fix0             = _mm_add_ps(fix0,tx);
704             fiy0             = _mm_add_ps(fiy0,ty);
705             fiz0             = _mm_add_ps(fiz0,tz);
706
707             fjx1             = _mm_add_ps(fjx1,tx);
708             fjy1             = _mm_add_ps(fjy1,ty);
709             fjz1             = _mm_add_ps(fjz1,tz);
710
711             /**************************
712              * CALCULATE INTERACTIONS *
713              **************************/
714
715             /* REACTION-FIELD ELECTROSTATICS */
716             velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
717             felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
718
719             /* Update potential sum for this i atom from the interaction with this j atom. */
720             velec            = _mm_andnot_ps(dummy_mask,velec);
721             velecsum         = _mm_add_ps(velecsum,velec);
722
723             fscal            = felec;
724
725             fscal            = _mm_andnot_ps(dummy_mask,fscal);
726
727             /* Calculate temporary vectorial force */
728             tx               = _mm_mul_ps(fscal,dx02);
729             ty               = _mm_mul_ps(fscal,dy02);
730             tz               = _mm_mul_ps(fscal,dz02);
731
732             /* Update vectorial force */
733             fix0             = _mm_add_ps(fix0,tx);
734             fiy0             = _mm_add_ps(fiy0,ty);
735             fiz0             = _mm_add_ps(fiz0,tz);
736
737             fjx2             = _mm_add_ps(fjx2,tx);
738             fjy2             = _mm_add_ps(fjy2,ty);
739             fjz2             = _mm_add_ps(fjz2,tz);
740
741             /**************************
742              * CALCULATE INTERACTIONS *
743              **************************/
744
745             /* REACTION-FIELD ELECTROSTATICS */
746             velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
747             felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
748
749             /* Update potential sum for this i atom from the interaction with this j atom. */
750             velec            = _mm_andnot_ps(dummy_mask,velec);
751             velecsum         = _mm_add_ps(velecsum,velec);
752
753             fscal            = felec;
754
755             fscal            = _mm_andnot_ps(dummy_mask,fscal);
756
757             /* Calculate temporary vectorial force */
758             tx               = _mm_mul_ps(fscal,dx10);
759             ty               = _mm_mul_ps(fscal,dy10);
760             tz               = _mm_mul_ps(fscal,dz10);
761
762             /* Update vectorial force */
763             fix1             = _mm_add_ps(fix1,tx);
764             fiy1             = _mm_add_ps(fiy1,ty);
765             fiz1             = _mm_add_ps(fiz1,tz);
766
767             fjx0             = _mm_add_ps(fjx0,tx);
768             fjy0             = _mm_add_ps(fjy0,ty);
769             fjz0             = _mm_add_ps(fjz0,tz);
770
771             /**************************
772              * CALCULATE INTERACTIONS *
773              **************************/
774
775             /* REACTION-FIELD ELECTROSTATICS */
776             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
777             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
778
779             /* Update potential sum for this i atom from the interaction with this j atom. */
780             velec            = _mm_andnot_ps(dummy_mask,velec);
781             velecsum         = _mm_add_ps(velecsum,velec);
782
783             fscal            = felec;
784
785             fscal            = _mm_andnot_ps(dummy_mask,fscal);
786
787             /* Calculate temporary vectorial force */
788             tx               = _mm_mul_ps(fscal,dx11);
789             ty               = _mm_mul_ps(fscal,dy11);
790             tz               = _mm_mul_ps(fscal,dz11);
791
792             /* Update vectorial force */
793             fix1             = _mm_add_ps(fix1,tx);
794             fiy1             = _mm_add_ps(fiy1,ty);
795             fiz1             = _mm_add_ps(fiz1,tz);
796
797             fjx1             = _mm_add_ps(fjx1,tx);
798             fjy1             = _mm_add_ps(fjy1,ty);
799             fjz1             = _mm_add_ps(fjz1,tz);
800
801             /**************************
802              * CALCULATE INTERACTIONS *
803              **************************/
804
805             /* REACTION-FIELD ELECTROSTATICS */
806             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
807             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
808
809             /* Update potential sum for this i atom from the interaction with this j atom. */
810             velec            = _mm_andnot_ps(dummy_mask,velec);
811             velecsum         = _mm_add_ps(velecsum,velec);
812
813             fscal            = felec;
814
815             fscal            = _mm_andnot_ps(dummy_mask,fscal);
816
817             /* Calculate temporary vectorial force */
818             tx               = _mm_mul_ps(fscal,dx12);
819             ty               = _mm_mul_ps(fscal,dy12);
820             tz               = _mm_mul_ps(fscal,dz12);
821
822             /* Update vectorial force */
823             fix1             = _mm_add_ps(fix1,tx);
824             fiy1             = _mm_add_ps(fiy1,ty);
825             fiz1             = _mm_add_ps(fiz1,tz);
826
827             fjx2             = _mm_add_ps(fjx2,tx);
828             fjy2             = _mm_add_ps(fjy2,ty);
829             fjz2             = _mm_add_ps(fjz2,tz);
830
831             /**************************
832              * CALCULATE INTERACTIONS *
833              **************************/
834
835             /* REACTION-FIELD ELECTROSTATICS */
836             velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
837             felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
838
839             /* Update potential sum for this i atom from the interaction with this j atom. */
840             velec            = _mm_andnot_ps(dummy_mask,velec);
841             velecsum         = _mm_add_ps(velecsum,velec);
842
843             fscal            = felec;
844
845             fscal            = _mm_andnot_ps(dummy_mask,fscal);
846
847             /* Calculate temporary vectorial force */
848             tx               = _mm_mul_ps(fscal,dx20);
849             ty               = _mm_mul_ps(fscal,dy20);
850             tz               = _mm_mul_ps(fscal,dz20);
851
852             /* Update vectorial force */
853             fix2             = _mm_add_ps(fix2,tx);
854             fiy2             = _mm_add_ps(fiy2,ty);
855             fiz2             = _mm_add_ps(fiz2,tz);
856
857             fjx0             = _mm_add_ps(fjx0,tx);
858             fjy0             = _mm_add_ps(fjy0,ty);
859             fjz0             = _mm_add_ps(fjz0,tz);
860
861             /**************************
862              * CALCULATE INTERACTIONS *
863              **************************/
864
865             /* REACTION-FIELD ELECTROSTATICS */
866             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
867             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
868
869             /* Update potential sum for this i atom from the interaction with this j atom. */
870             velec            = _mm_andnot_ps(dummy_mask,velec);
871             velecsum         = _mm_add_ps(velecsum,velec);
872
873             fscal            = felec;
874
875             fscal            = _mm_andnot_ps(dummy_mask,fscal);
876
877             /* Calculate temporary vectorial force */
878             tx               = _mm_mul_ps(fscal,dx21);
879             ty               = _mm_mul_ps(fscal,dy21);
880             tz               = _mm_mul_ps(fscal,dz21);
881
882             /* Update vectorial force */
883             fix2             = _mm_add_ps(fix2,tx);
884             fiy2             = _mm_add_ps(fiy2,ty);
885             fiz2             = _mm_add_ps(fiz2,tz);
886
887             fjx1             = _mm_add_ps(fjx1,tx);
888             fjy1             = _mm_add_ps(fjy1,ty);
889             fjz1             = _mm_add_ps(fjz1,tz);
890
891             /**************************
892              * CALCULATE INTERACTIONS *
893              **************************/
894
895             /* REACTION-FIELD ELECTROSTATICS */
896             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
897             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
898
899             /* Update potential sum for this i atom from the interaction with this j atom. */
900             velec            = _mm_andnot_ps(dummy_mask,velec);
901             velecsum         = _mm_add_ps(velecsum,velec);
902
903             fscal            = felec;
904
905             fscal            = _mm_andnot_ps(dummy_mask,fscal);
906
907             /* Calculate temporary vectorial force */
908             tx               = _mm_mul_ps(fscal,dx22);
909             ty               = _mm_mul_ps(fscal,dy22);
910             tz               = _mm_mul_ps(fscal,dz22);
911
912             /* Update vectorial force */
913             fix2             = _mm_add_ps(fix2,tx);
914             fiy2             = _mm_add_ps(fiy2,ty);
915             fiz2             = _mm_add_ps(fiz2,tz);
916
917             fjx2             = _mm_add_ps(fjx2,tx);
918             fjy2             = _mm_add_ps(fjy2,ty);
919             fjz2             = _mm_add_ps(fjz2,tz);
920
921             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
922             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
923             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
924             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
925
926             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
927                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
928
929             /* Inner loop uses 300 flops */
930         }
931
932         /* End of innermost loop */
933
934         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
935                                               f+i_coord_offset,fshift+i_shift_offset);
936
937         ggid                        = gid[iidx];
938         /* Update potential energies */
939         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
940         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
941
942         /* Increment number of inner iterations */
943         inneriter                  += j_index_end - j_index_start;
944
945         /* Outer loop uses 20 flops */
946     }
947
948     /* Increment number of outer iterations */
949     outeriter        += nri;
950
951     /* Update outer/inner flops */
952
953     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*300);
954 }
955 /*
956  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse4_1_single
957  * Electrostatics interaction: ReactionField
958  * VdW interaction:            LennardJones
959  * Geometry:                   Water3-Water3
960  * Calculate force/pot:        Force
961  */
962 void
963 nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse4_1_single
964                     (t_nblist * gmx_restrict                nlist,
965                      rvec * gmx_restrict                    xx,
966                      rvec * gmx_restrict                    ff,
967                      t_forcerec * gmx_restrict              fr,
968                      t_mdatoms * gmx_restrict               mdatoms,
969                      nb_kernel_data_t * gmx_restrict        kernel_data,
970                      t_nrnb * gmx_restrict                  nrnb)
971 {
972     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
973      * just 0 for non-waters.
974      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
975      * jnr indices corresponding to data put in the four positions in the SIMD register.
976      */
977     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
978     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
979     int              jnrA,jnrB,jnrC,jnrD;
980     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
981     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
982     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
983     real             rcutoff_scalar;
984     real             *shiftvec,*fshift,*x,*f;
985     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
986     real             scratch[4*DIM];
987     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
988     int              vdwioffset0;
989     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
990     int              vdwioffset1;
991     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
992     int              vdwioffset2;
993     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
994     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
995     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
996     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
997     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
998     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
999     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1000     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1001     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1002     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1003     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1004     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1005     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1006     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1007     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1008     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1009     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1010     real             *charge;
1011     int              nvdwtype;
1012     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1013     int              *vdwtype;
1014     real             *vdwparam;
1015     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1016     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1017     __m128           dummy_mask,cutoff_mask;
1018     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1019     __m128           one     = _mm_set1_ps(1.0);
1020     __m128           two     = _mm_set1_ps(2.0);
1021     x                = xx[0];
1022     f                = ff[0];
1023
1024     nri              = nlist->nri;
1025     iinr             = nlist->iinr;
1026     jindex           = nlist->jindex;
1027     jjnr             = nlist->jjnr;
1028     shiftidx         = nlist->shift;
1029     gid              = nlist->gid;
1030     shiftvec         = fr->shift_vec[0];
1031     fshift           = fr->fshift[0];
1032     facel            = _mm_set1_ps(fr->epsfac);
1033     charge           = mdatoms->chargeA;
1034     krf              = _mm_set1_ps(fr->ic->k_rf);
1035     krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
1036     crf              = _mm_set1_ps(fr->ic->c_rf);
1037     nvdwtype         = fr->ntype;
1038     vdwparam         = fr->nbfp;
1039     vdwtype          = mdatoms->typeA;
1040
1041     /* Setup water-specific parameters */
1042     inr              = nlist->iinr[0];
1043     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1044     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1045     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1046     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1047
1048     jq0              = _mm_set1_ps(charge[inr+0]);
1049     jq1              = _mm_set1_ps(charge[inr+1]);
1050     jq2              = _mm_set1_ps(charge[inr+2]);
1051     vdwjidx0A        = 2*vdwtype[inr+0];
1052     qq00             = _mm_mul_ps(iq0,jq0);
1053     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1054     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1055     qq01             = _mm_mul_ps(iq0,jq1);
1056     qq02             = _mm_mul_ps(iq0,jq2);
1057     qq10             = _mm_mul_ps(iq1,jq0);
1058     qq11             = _mm_mul_ps(iq1,jq1);
1059     qq12             = _mm_mul_ps(iq1,jq2);
1060     qq20             = _mm_mul_ps(iq2,jq0);
1061     qq21             = _mm_mul_ps(iq2,jq1);
1062     qq22             = _mm_mul_ps(iq2,jq2);
1063
1064     /* Avoid stupid compiler warnings */
1065     jnrA = jnrB = jnrC = jnrD = 0;
1066     j_coord_offsetA = 0;
1067     j_coord_offsetB = 0;
1068     j_coord_offsetC = 0;
1069     j_coord_offsetD = 0;
1070
1071     outeriter        = 0;
1072     inneriter        = 0;
1073
1074     for(iidx=0;iidx<4*DIM;iidx++)
1075     {
1076         scratch[iidx] = 0.0;
1077     }
1078
1079     /* Start outer loop over neighborlists */
1080     for(iidx=0; iidx<nri; iidx++)
1081     {
1082         /* Load shift vector for this list */
1083         i_shift_offset   = DIM*shiftidx[iidx];
1084
1085         /* Load limits for loop over neighbors */
1086         j_index_start    = jindex[iidx];
1087         j_index_end      = jindex[iidx+1];
1088
1089         /* Get outer coordinate index */
1090         inr              = iinr[iidx];
1091         i_coord_offset   = DIM*inr;
1092
1093         /* Load i particle coords and add shift vector */
1094         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1095                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1096
1097         fix0             = _mm_setzero_ps();
1098         fiy0             = _mm_setzero_ps();
1099         fiz0             = _mm_setzero_ps();
1100         fix1             = _mm_setzero_ps();
1101         fiy1             = _mm_setzero_ps();
1102         fiz1             = _mm_setzero_ps();
1103         fix2             = _mm_setzero_ps();
1104         fiy2             = _mm_setzero_ps();
1105         fiz2             = _mm_setzero_ps();
1106
1107         /* Start inner kernel loop */
1108         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1109         {
1110
1111             /* Get j neighbor index, and coordinate index */
1112             jnrA             = jjnr[jidx];
1113             jnrB             = jjnr[jidx+1];
1114             jnrC             = jjnr[jidx+2];
1115             jnrD             = jjnr[jidx+3];
1116             j_coord_offsetA  = DIM*jnrA;
1117             j_coord_offsetB  = DIM*jnrB;
1118             j_coord_offsetC  = DIM*jnrC;
1119             j_coord_offsetD  = DIM*jnrD;
1120
1121             /* load j atom coordinates */
1122             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1123                                               x+j_coord_offsetC,x+j_coord_offsetD,
1124                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1125
1126             /* Calculate displacement vector */
1127             dx00             = _mm_sub_ps(ix0,jx0);
1128             dy00             = _mm_sub_ps(iy0,jy0);
1129             dz00             = _mm_sub_ps(iz0,jz0);
1130             dx01             = _mm_sub_ps(ix0,jx1);
1131             dy01             = _mm_sub_ps(iy0,jy1);
1132             dz01             = _mm_sub_ps(iz0,jz1);
1133             dx02             = _mm_sub_ps(ix0,jx2);
1134             dy02             = _mm_sub_ps(iy0,jy2);
1135             dz02             = _mm_sub_ps(iz0,jz2);
1136             dx10             = _mm_sub_ps(ix1,jx0);
1137             dy10             = _mm_sub_ps(iy1,jy0);
1138             dz10             = _mm_sub_ps(iz1,jz0);
1139             dx11             = _mm_sub_ps(ix1,jx1);
1140             dy11             = _mm_sub_ps(iy1,jy1);
1141             dz11             = _mm_sub_ps(iz1,jz1);
1142             dx12             = _mm_sub_ps(ix1,jx2);
1143             dy12             = _mm_sub_ps(iy1,jy2);
1144             dz12             = _mm_sub_ps(iz1,jz2);
1145             dx20             = _mm_sub_ps(ix2,jx0);
1146             dy20             = _mm_sub_ps(iy2,jy0);
1147             dz20             = _mm_sub_ps(iz2,jz0);
1148             dx21             = _mm_sub_ps(ix2,jx1);
1149             dy21             = _mm_sub_ps(iy2,jy1);
1150             dz21             = _mm_sub_ps(iz2,jz1);
1151             dx22             = _mm_sub_ps(ix2,jx2);
1152             dy22             = _mm_sub_ps(iy2,jy2);
1153             dz22             = _mm_sub_ps(iz2,jz2);
1154
1155             /* Calculate squared distance and things based on it */
1156             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1157             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1158             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1159             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1160             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1161             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1162             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1163             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1164             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1165
1166             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1167             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1168             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1169             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1170             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1171             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1172             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1173             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1174             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1175
1176             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1177             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1178             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1179             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1180             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1181             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1182             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1183             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1184             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1185
1186             fjx0             = _mm_setzero_ps();
1187             fjy0             = _mm_setzero_ps();
1188             fjz0             = _mm_setzero_ps();
1189             fjx1             = _mm_setzero_ps();
1190             fjy1             = _mm_setzero_ps();
1191             fjz1             = _mm_setzero_ps();
1192             fjx2             = _mm_setzero_ps();
1193             fjy2             = _mm_setzero_ps();
1194             fjz2             = _mm_setzero_ps();
1195
1196             /**************************
1197              * CALCULATE INTERACTIONS *
1198              **************************/
1199
1200             /* REACTION-FIELD ELECTROSTATICS */
1201             felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
1202
1203             /* LENNARD-JONES DISPERSION/REPULSION */
1204
1205             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1206             fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1207
1208             fscal            = _mm_add_ps(felec,fvdw);
1209
1210             /* Calculate temporary vectorial force */
1211             tx               = _mm_mul_ps(fscal,dx00);
1212             ty               = _mm_mul_ps(fscal,dy00);
1213             tz               = _mm_mul_ps(fscal,dz00);
1214
1215             /* Update vectorial force */
1216             fix0             = _mm_add_ps(fix0,tx);
1217             fiy0             = _mm_add_ps(fiy0,ty);
1218             fiz0             = _mm_add_ps(fiz0,tz);
1219
1220             fjx0             = _mm_add_ps(fjx0,tx);
1221             fjy0             = _mm_add_ps(fjy0,ty);
1222             fjz0             = _mm_add_ps(fjz0,tz);
1223
1224             /**************************
1225              * CALCULATE INTERACTIONS *
1226              **************************/
1227
1228             /* REACTION-FIELD ELECTROSTATICS */
1229             felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
1230
1231             fscal            = felec;
1232
1233             /* Calculate temporary vectorial force */
1234             tx               = _mm_mul_ps(fscal,dx01);
1235             ty               = _mm_mul_ps(fscal,dy01);
1236             tz               = _mm_mul_ps(fscal,dz01);
1237
1238             /* Update vectorial force */
1239             fix0             = _mm_add_ps(fix0,tx);
1240             fiy0             = _mm_add_ps(fiy0,ty);
1241             fiz0             = _mm_add_ps(fiz0,tz);
1242
1243             fjx1             = _mm_add_ps(fjx1,tx);
1244             fjy1             = _mm_add_ps(fjy1,ty);
1245             fjz1             = _mm_add_ps(fjz1,tz);
1246
1247             /**************************
1248              * CALCULATE INTERACTIONS *
1249              **************************/
1250
1251             /* REACTION-FIELD ELECTROSTATICS */
1252             felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
1253
1254             fscal            = felec;
1255
1256             /* Calculate temporary vectorial force */
1257             tx               = _mm_mul_ps(fscal,dx02);
1258             ty               = _mm_mul_ps(fscal,dy02);
1259             tz               = _mm_mul_ps(fscal,dz02);
1260
1261             /* Update vectorial force */
1262             fix0             = _mm_add_ps(fix0,tx);
1263             fiy0             = _mm_add_ps(fiy0,ty);
1264             fiz0             = _mm_add_ps(fiz0,tz);
1265
1266             fjx2             = _mm_add_ps(fjx2,tx);
1267             fjy2             = _mm_add_ps(fjy2,ty);
1268             fjz2             = _mm_add_ps(fjz2,tz);
1269
1270             /**************************
1271              * CALCULATE INTERACTIONS *
1272              **************************/
1273
1274             /* REACTION-FIELD ELECTROSTATICS */
1275             felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
1276
1277             fscal            = felec;
1278
1279             /* Calculate temporary vectorial force */
1280             tx               = _mm_mul_ps(fscal,dx10);
1281             ty               = _mm_mul_ps(fscal,dy10);
1282             tz               = _mm_mul_ps(fscal,dz10);
1283
1284             /* Update vectorial force */
1285             fix1             = _mm_add_ps(fix1,tx);
1286             fiy1             = _mm_add_ps(fiy1,ty);
1287             fiz1             = _mm_add_ps(fiz1,tz);
1288
1289             fjx0             = _mm_add_ps(fjx0,tx);
1290             fjy0             = _mm_add_ps(fjy0,ty);
1291             fjz0             = _mm_add_ps(fjz0,tz);
1292
1293             /**************************
1294              * CALCULATE INTERACTIONS *
1295              **************************/
1296
1297             /* REACTION-FIELD ELECTROSTATICS */
1298             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1299
1300             fscal            = felec;
1301
1302             /* Calculate temporary vectorial force */
1303             tx               = _mm_mul_ps(fscal,dx11);
1304             ty               = _mm_mul_ps(fscal,dy11);
1305             tz               = _mm_mul_ps(fscal,dz11);
1306
1307             /* Update vectorial force */
1308             fix1             = _mm_add_ps(fix1,tx);
1309             fiy1             = _mm_add_ps(fiy1,ty);
1310             fiz1             = _mm_add_ps(fiz1,tz);
1311
1312             fjx1             = _mm_add_ps(fjx1,tx);
1313             fjy1             = _mm_add_ps(fjy1,ty);
1314             fjz1             = _mm_add_ps(fjz1,tz);
1315
1316             /**************************
1317              * CALCULATE INTERACTIONS *
1318              **************************/
1319
1320             /* REACTION-FIELD ELECTROSTATICS */
1321             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1322
1323             fscal            = felec;
1324
1325             /* Calculate temporary vectorial force */
1326             tx               = _mm_mul_ps(fscal,dx12);
1327             ty               = _mm_mul_ps(fscal,dy12);
1328             tz               = _mm_mul_ps(fscal,dz12);
1329
1330             /* Update vectorial force */
1331             fix1             = _mm_add_ps(fix1,tx);
1332             fiy1             = _mm_add_ps(fiy1,ty);
1333             fiz1             = _mm_add_ps(fiz1,tz);
1334
1335             fjx2             = _mm_add_ps(fjx2,tx);
1336             fjy2             = _mm_add_ps(fjy2,ty);
1337             fjz2             = _mm_add_ps(fjz2,tz);
1338
1339             /**************************
1340              * CALCULATE INTERACTIONS *
1341              **************************/
1342
1343             /* REACTION-FIELD ELECTROSTATICS */
1344             felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1345
1346             fscal            = felec;
1347
1348             /* Calculate temporary vectorial force */
1349             tx               = _mm_mul_ps(fscal,dx20);
1350             ty               = _mm_mul_ps(fscal,dy20);
1351             tz               = _mm_mul_ps(fscal,dz20);
1352
1353             /* Update vectorial force */
1354             fix2             = _mm_add_ps(fix2,tx);
1355             fiy2             = _mm_add_ps(fiy2,ty);
1356             fiz2             = _mm_add_ps(fiz2,tz);
1357
1358             fjx0             = _mm_add_ps(fjx0,tx);
1359             fjy0             = _mm_add_ps(fjy0,ty);
1360             fjz0             = _mm_add_ps(fjz0,tz);
1361
1362             /**************************
1363              * CALCULATE INTERACTIONS *
1364              **************************/
1365
1366             /* REACTION-FIELD ELECTROSTATICS */
1367             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1368
1369             fscal            = felec;
1370
1371             /* Calculate temporary vectorial force */
1372             tx               = _mm_mul_ps(fscal,dx21);
1373             ty               = _mm_mul_ps(fscal,dy21);
1374             tz               = _mm_mul_ps(fscal,dz21);
1375
1376             /* Update vectorial force */
1377             fix2             = _mm_add_ps(fix2,tx);
1378             fiy2             = _mm_add_ps(fiy2,ty);
1379             fiz2             = _mm_add_ps(fiz2,tz);
1380
1381             fjx1             = _mm_add_ps(fjx1,tx);
1382             fjy1             = _mm_add_ps(fjy1,ty);
1383             fjz1             = _mm_add_ps(fjz1,tz);
1384
1385             /**************************
1386              * CALCULATE INTERACTIONS *
1387              **************************/
1388
1389             /* REACTION-FIELD ELECTROSTATICS */
1390             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1391
1392             fscal            = felec;
1393
1394             /* Calculate temporary vectorial force */
1395             tx               = _mm_mul_ps(fscal,dx22);
1396             ty               = _mm_mul_ps(fscal,dy22);
1397             tz               = _mm_mul_ps(fscal,dz22);
1398
1399             /* Update vectorial force */
1400             fix2             = _mm_add_ps(fix2,tx);
1401             fiy2             = _mm_add_ps(fiy2,ty);
1402             fiz2             = _mm_add_ps(fiz2,tz);
1403
1404             fjx2             = _mm_add_ps(fjx2,tx);
1405             fjy2             = _mm_add_ps(fjy2,ty);
1406             fjz2             = _mm_add_ps(fjz2,tz);
1407
1408             fjptrA             = f+j_coord_offsetA;
1409             fjptrB             = f+j_coord_offsetB;
1410             fjptrC             = f+j_coord_offsetC;
1411             fjptrD             = f+j_coord_offsetD;
1412
1413             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1414                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1415
1416             /* Inner loop uses 250 flops */
1417         }
1418
1419         if(jidx<j_index_end)
1420         {
1421
1422             /* Get j neighbor index, and coordinate index */
1423             jnrlistA         = jjnr[jidx];
1424             jnrlistB         = jjnr[jidx+1];
1425             jnrlistC         = jjnr[jidx+2];
1426             jnrlistD         = jjnr[jidx+3];
1427             /* Sign of each element will be negative for non-real atoms.
1428              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1429              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1430              */
1431             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1432             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1433             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1434             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1435             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1436             j_coord_offsetA  = DIM*jnrA;
1437             j_coord_offsetB  = DIM*jnrB;
1438             j_coord_offsetC  = DIM*jnrC;
1439             j_coord_offsetD  = DIM*jnrD;
1440
1441             /* load j atom coordinates */
1442             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1443                                               x+j_coord_offsetC,x+j_coord_offsetD,
1444                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1445
1446             /* Calculate displacement vector */
1447             dx00             = _mm_sub_ps(ix0,jx0);
1448             dy00             = _mm_sub_ps(iy0,jy0);
1449             dz00             = _mm_sub_ps(iz0,jz0);
1450             dx01             = _mm_sub_ps(ix0,jx1);
1451             dy01             = _mm_sub_ps(iy0,jy1);
1452             dz01             = _mm_sub_ps(iz0,jz1);
1453             dx02             = _mm_sub_ps(ix0,jx2);
1454             dy02             = _mm_sub_ps(iy0,jy2);
1455             dz02             = _mm_sub_ps(iz0,jz2);
1456             dx10             = _mm_sub_ps(ix1,jx0);
1457             dy10             = _mm_sub_ps(iy1,jy0);
1458             dz10             = _mm_sub_ps(iz1,jz0);
1459             dx11             = _mm_sub_ps(ix1,jx1);
1460             dy11             = _mm_sub_ps(iy1,jy1);
1461             dz11             = _mm_sub_ps(iz1,jz1);
1462             dx12             = _mm_sub_ps(ix1,jx2);
1463             dy12             = _mm_sub_ps(iy1,jy2);
1464             dz12             = _mm_sub_ps(iz1,jz2);
1465             dx20             = _mm_sub_ps(ix2,jx0);
1466             dy20             = _mm_sub_ps(iy2,jy0);
1467             dz20             = _mm_sub_ps(iz2,jz0);
1468             dx21             = _mm_sub_ps(ix2,jx1);
1469             dy21             = _mm_sub_ps(iy2,jy1);
1470             dz21             = _mm_sub_ps(iz2,jz1);
1471             dx22             = _mm_sub_ps(ix2,jx2);
1472             dy22             = _mm_sub_ps(iy2,jy2);
1473             dz22             = _mm_sub_ps(iz2,jz2);
1474
1475             /* Calculate squared distance and things based on it */
1476             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1477             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1478             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1479             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1480             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1481             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1482             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1483             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1484             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1485
1486             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1487             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1488             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1489             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1490             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1491             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1492             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1493             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1494             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1495
1496             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1497             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1498             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1499             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1500             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1501             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1502             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1503             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1504             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1505
1506             fjx0             = _mm_setzero_ps();
1507             fjy0             = _mm_setzero_ps();
1508             fjz0             = _mm_setzero_ps();
1509             fjx1             = _mm_setzero_ps();
1510             fjy1             = _mm_setzero_ps();
1511             fjz1             = _mm_setzero_ps();
1512             fjx2             = _mm_setzero_ps();
1513             fjy2             = _mm_setzero_ps();
1514             fjz2             = _mm_setzero_ps();
1515
1516             /**************************
1517              * CALCULATE INTERACTIONS *
1518              **************************/
1519
1520             /* REACTION-FIELD ELECTROSTATICS */
1521             felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
1522
1523             /* LENNARD-JONES DISPERSION/REPULSION */
1524
1525             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1526             fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1527
1528             fscal            = _mm_add_ps(felec,fvdw);
1529
1530             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1531
1532             /* Calculate temporary vectorial force */
1533             tx               = _mm_mul_ps(fscal,dx00);
1534             ty               = _mm_mul_ps(fscal,dy00);
1535             tz               = _mm_mul_ps(fscal,dz00);
1536
1537             /* Update vectorial force */
1538             fix0             = _mm_add_ps(fix0,tx);
1539             fiy0             = _mm_add_ps(fiy0,ty);
1540             fiz0             = _mm_add_ps(fiz0,tz);
1541
1542             fjx0             = _mm_add_ps(fjx0,tx);
1543             fjy0             = _mm_add_ps(fjy0,ty);
1544             fjz0             = _mm_add_ps(fjz0,tz);
1545
1546             /**************************
1547              * CALCULATE INTERACTIONS *
1548              **************************/
1549
1550             /* REACTION-FIELD ELECTROSTATICS */
1551             felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
1552
1553             fscal            = felec;
1554
1555             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1556
1557             /* Calculate temporary vectorial force */
1558             tx               = _mm_mul_ps(fscal,dx01);
1559             ty               = _mm_mul_ps(fscal,dy01);
1560             tz               = _mm_mul_ps(fscal,dz01);
1561
1562             /* Update vectorial force */
1563             fix0             = _mm_add_ps(fix0,tx);
1564             fiy0             = _mm_add_ps(fiy0,ty);
1565             fiz0             = _mm_add_ps(fiz0,tz);
1566
1567             fjx1             = _mm_add_ps(fjx1,tx);
1568             fjy1             = _mm_add_ps(fjy1,ty);
1569             fjz1             = _mm_add_ps(fjz1,tz);
1570
1571             /**************************
1572              * CALCULATE INTERACTIONS *
1573              **************************/
1574
1575             /* REACTION-FIELD ELECTROSTATICS */
1576             felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
1577
1578             fscal            = felec;
1579
1580             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1581
1582             /* Calculate temporary vectorial force */
1583             tx               = _mm_mul_ps(fscal,dx02);
1584             ty               = _mm_mul_ps(fscal,dy02);
1585             tz               = _mm_mul_ps(fscal,dz02);
1586
1587             /* Update vectorial force */
1588             fix0             = _mm_add_ps(fix0,tx);
1589             fiy0             = _mm_add_ps(fiy0,ty);
1590             fiz0             = _mm_add_ps(fiz0,tz);
1591
1592             fjx2             = _mm_add_ps(fjx2,tx);
1593             fjy2             = _mm_add_ps(fjy2,ty);
1594             fjz2             = _mm_add_ps(fjz2,tz);
1595
1596             /**************************
1597              * CALCULATE INTERACTIONS *
1598              **************************/
1599
1600             /* REACTION-FIELD ELECTROSTATICS */
1601             felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
1602
1603             fscal            = felec;
1604
1605             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1606
1607             /* Calculate temporary vectorial force */
1608             tx               = _mm_mul_ps(fscal,dx10);
1609             ty               = _mm_mul_ps(fscal,dy10);
1610             tz               = _mm_mul_ps(fscal,dz10);
1611
1612             /* Update vectorial force */
1613             fix1             = _mm_add_ps(fix1,tx);
1614             fiy1             = _mm_add_ps(fiy1,ty);
1615             fiz1             = _mm_add_ps(fiz1,tz);
1616
1617             fjx0             = _mm_add_ps(fjx0,tx);
1618             fjy0             = _mm_add_ps(fjy0,ty);
1619             fjz0             = _mm_add_ps(fjz0,tz);
1620
1621             /**************************
1622              * CALCULATE INTERACTIONS *
1623              **************************/
1624
1625             /* REACTION-FIELD ELECTROSTATICS */
1626             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1627
1628             fscal            = felec;
1629
1630             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1631
1632             /* Calculate temporary vectorial force */
1633             tx               = _mm_mul_ps(fscal,dx11);
1634             ty               = _mm_mul_ps(fscal,dy11);
1635             tz               = _mm_mul_ps(fscal,dz11);
1636
1637             /* Update vectorial force */
1638             fix1             = _mm_add_ps(fix1,tx);
1639             fiy1             = _mm_add_ps(fiy1,ty);
1640             fiz1             = _mm_add_ps(fiz1,tz);
1641
1642             fjx1             = _mm_add_ps(fjx1,tx);
1643             fjy1             = _mm_add_ps(fjy1,ty);
1644             fjz1             = _mm_add_ps(fjz1,tz);
1645
1646             /**************************
1647              * CALCULATE INTERACTIONS *
1648              **************************/
1649
1650             /* REACTION-FIELD ELECTROSTATICS */
1651             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1652
1653             fscal            = felec;
1654
1655             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1656
1657             /* Calculate temporary vectorial force */
1658             tx               = _mm_mul_ps(fscal,dx12);
1659             ty               = _mm_mul_ps(fscal,dy12);
1660             tz               = _mm_mul_ps(fscal,dz12);
1661
1662             /* Update vectorial force */
1663             fix1             = _mm_add_ps(fix1,tx);
1664             fiy1             = _mm_add_ps(fiy1,ty);
1665             fiz1             = _mm_add_ps(fiz1,tz);
1666
1667             fjx2             = _mm_add_ps(fjx2,tx);
1668             fjy2             = _mm_add_ps(fjy2,ty);
1669             fjz2             = _mm_add_ps(fjz2,tz);
1670
1671             /**************************
1672              * CALCULATE INTERACTIONS *
1673              **************************/
1674
1675             /* REACTION-FIELD ELECTROSTATICS */
1676             felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1677
1678             fscal            = felec;
1679
1680             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1681
1682             /* Calculate temporary vectorial force */
1683             tx               = _mm_mul_ps(fscal,dx20);
1684             ty               = _mm_mul_ps(fscal,dy20);
1685             tz               = _mm_mul_ps(fscal,dz20);
1686
1687             /* Update vectorial force */
1688             fix2             = _mm_add_ps(fix2,tx);
1689             fiy2             = _mm_add_ps(fiy2,ty);
1690             fiz2             = _mm_add_ps(fiz2,tz);
1691
1692             fjx0             = _mm_add_ps(fjx0,tx);
1693             fjy0             = _mm_add_ps(fjy0,ty);
1694             fjz0             = _mm_add_ps(fjz0,tz);
1695
1696             /**************************
1697              * CALCULATE INTERACTIONS *
1698              **************************/
1699
1700             /* REACTION-FIELD ELECTROSTATICS */
1701             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1702
1703             fscal            = felec;
1704
1705             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1706
1707             /* Calculate temporary vectorial force */
1708             tx               = _mm_mul_ps(fscal,dx21);
1709             ty               = _mm_mul_ps(fscal,dy21);
1710             tz               = _mm_mul_ps(fscal,dz21);
1711
1712             /* Update vectorial force */
1713             fix2             = _mm_add_ps(fix2,tx);
1714             fiy2             = _mm_add_ps(fiy2,ty);
1715             fiz2             = _mm_add_ps(fiz2,tz);
1716
1717             fjx1             = _mm_add_ps(fjx1,tx);
1718             fjy1             = _mm_add_ps(fjy1,ty);
1719             fjz1             = _mm_add_ps(fjz1,tz);
1720
1721             /**************************
1722              * CALCULATE INTERACTIONS *
1723              **************************/
1724
1725             /* REACTION-FIELD ELECTROSTATICS */
1726             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1727
1728             fscal            = felec;
1729
1730             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1731
1732             /* Calculate temporary vectorial force */
1733             tx               = _mm_mul_ps(fscal,dx22);
1734             ty               = _mm_mul_ps(fscal,dy22);
1735             tz               = _mm_mul_ps(fscal,dz22);
1736
1737             /* Update vectorial force */
1738             fix2             = _mm_add_ps(fix2,tx);
1739             fiy2             = _mm_add_ps(fiy2,ty);
1740             fiz2             = _mm_add_ps(fiz2,tz);
1741
1742             fjx2             = _mm_add_ps(fjx2,tx);
1743             fjy2             = _mm_add_ps(fjy2,ty);
1744             fjz2             = _mm_add_ps(fjz2,tz);
1745
1746             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1747             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1748             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1749             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1750
1751             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1752                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1753
1754             /* Inner loop uses 250 flops */
1755         }
1756
1757         /* End of innermost loop */
1758
1759         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1760                                               f+i_coord_offset,fshift+i_shift_offset);
1761
1762         /* Increment number of inner iterations */
1763         inneriter                  += j_index_end - j_index_start;
1764
1765         /* Outer loop uses 18 flops */
1766     }
1767
1768     /* Increment number of outer iterations */
1769     outeriter        += nri;
1770
1771     /* Update outer/inner flops */
1772
1773     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*250);
1774 }