Merge "removed group non-boneded call with verlet scheme" into release-4-6
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_sse4_1_single / nb_kernel_template_sse4_1_single.pre
1 /* #if 0 */
2 #error This file must be processed with the Gromacs pre-preprocessor
3 /* #endif */
4 /* #if INCLUDE_HEADER */
5 #ifdef HAVE_CONFIG_H
6 #include <config.h>
7 #endif
8
9 #include <math.h>
10
11 #include "../nb_kernel.h"
12 #include "types/simple.h"
13 #include "vec.h"
14 #include "nrnb.h"
15
16 #include "gmx_math_x86_sse4_1_single.h"
17 #include "kernelutil_x86_sse4_1_single.h"
18 /* #endif */
19
20 /* ## List of variables set by the generating script:                                    */
21 /* ##                                                                                    */
22 /* ## Setttings that apply to the entire kernel:                                         */
23 /* ## KERNEL_ELEC:           String, choice for electrostatic interactions               */
24 /* ## KERNEL_VDW:            String, choice for van der Waals interactions               */
25 /* ## KERNEL_NAME:           String, name of this kernel                                 */
26 /* ## KERNEL_VF:             String telling if we calculate potential, force, or both    */
27 /* ## GEOMETRY_I/GEOMETRY_J: String, name of each geometry, e.g. 'Water3' or '1Particle' */
28 /* ##                                                                                    */
29 /* ## Setttings that apply to particles in the outer (I) or inner (J) loops:             */
30 /* ## PARTICLES_I[]/         Arrays with lists of i/j particles to use in kernel. It is  */
31 /* ## PARTICLES_J[]:         just [0] for particle geometry, but can be longer for water */
32 /* ## PARTICLES_ELEC_I[]/    Arrays with lists of i/j particle that have electrostatics  */
33 /* ## PARTICLES_ELEC_J[]:    interactions that should be calculated in this kernel.      */
34 /* ## PARTICLES_VDW_I[]/     Arrays with the list of i/j particle that have VdW          */
35 /* ## PARTICLES_VDW_J[]:     interactions that should be calculated in this kernel.      */
36 /* ##                                                                                    */
37 /* ## Setttings for pairs of interactions (e.g. 2nd i particle against 1st j particle)   */
38 /* ## PAIRS_IJ[]:            Array with (i,j) tuples of pairs for which interactions     */
39 /* ##                        should be calculated in this kernel. Zero-charge particles  */
40 /* ##                        do not have interactions with particles without vdw, and    */
41 /* ##                        Vdw-only interactions are not evaluated in a no-vdw-kernel. */
42 /* ## INTERACTION_FLAGS[][]: 2D matrix, dimension e.g. 3*3 for water-water interactions. */
43 /* ##                        For each i-j pair, the element [I][J] is a list of strings  */
44 /* ##                        defining properties/flags of this interaction. Examples     */
45 /* ##                        include 'electrostatics'/'vdw' if that type of interaction  */
46 /* ##                        should be evaluated, 'rsq'/'rinv'/'rinvsq' if those values  */
47 /* ##                        are needed, and 'exactcutoff' or 'shift','switch' to        */
48 /* ##                        decide if the force/potential should be modified. This way  */
49 /* ##                        we only calculate values absolutely needed for each case.   */
50
51 /* ## Calculate the size and offset for (merged/interleaved) table data */
52
53 /*
54  * Gromacs nonbonded kernel:   {KERNEL_NAME}
55  * Electrostatics interaction: {KERNEL_ELEC}
56  * VdW interaction:            {KERNEL_VDW}
57  * Geometry:                   {GEOMETRY_I}-{GEOMETRY_J}
58  * Calculate force/pot:        {KERNEL_VF}
59  */
60 void
61 {KERNEL_NAME}
62                     (t_nblist * gmx_restrict                nlist,
63                      rvec * gmx_restrict                    xx,
64                      rvec * gmx_restrict                    ff,
65                      t_forcerec * gmx_restrict              fr,
66                      t_mdatoms * gmx_restrict               mdatoms,
67                      nb_kernel_data_t * gmx_restrict        kernel_data,
68                      t_nrnb * gmx_restrict                  nrnb)
69 {
70     /* ## Not all variables are used for all kernels, but any optimizing compiler fixes that, */
71     /* ## so there is no point in going to extremes to exclude variables that are not needed. */
72     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
73      * just 0 for non-waters.
74      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
75      * jnr indices corresponding to data put in the four positions in the SIMD register.
76      */
77     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
78     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
79     int              jnrA,jnrB,jnrC,jnrD;
80     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
81     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
82     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
83     real             rcutoff_scalar;
84     real             *shiftvec,*fshift,*x,*f;
85     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
86     real             scratch[4*DIM];
87     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
88     /* #for I in PARTICLES_I */
89     int              vdwioffset{I};
90     __m128           ix{I},iy{I},iz{I},fix{I},fiy{I},fiz{I},iq{I},isai{I};
91     /* #endfor */
92     /* #for J in PARTICLES_J */
93     int              vdwjidx{J}A,vdwjidx{J}B,vdwjidx{J}C,vdwjidx{J}D;
94     __m128           jx{J},jy{J},jz{J},fjx{J},fjy{J},fjz{J},jq{J},isaj{J};
95     /* #endfor */
96     /* #for I,J in PAIRS_IJ */
97     __m128           dx{I}{J},dy{I}{J},dz{I}{J},rsq{I}{J},rinv{I}{J},rinvsq{I}{J},r{I}{J},qq{I}{J},c6_{I}{J},c12_{I}{J};
98     /* #endfor */
99     /* #if KERNEL_ELEC != 'None' */
100     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
101     real             *charge;
102     /* #endif */
103     /* #if 'GeneralizedBorn' in KERNEL_ELEC */
104     __m128i          gbitab;
105     __m128           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
106     __m128           minushalf = _mm_set1_ps(-0.5);
107     real             *invsqrta,*dvda,*gbtab;
108     /* #endif */
109     /* #if KERNEL_VDW != 'None' */
110     int              nvdwtype;
111     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
112     int              *vdwtype;
113     real             *vdwparam;
114     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
115     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
116     /* #endif */
117     /* #if 'Table' in KERNEL_ELEC or 'GeneralizedBorn' in KERNEL_ELEC or 'Table' in KERNEL_VDW */
118     __m128i          vfitab;
119     __m128i          ifour       = _mm_set1_epi32(4);
120     __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
121     real             *vftab;
122     /* #endif */
123     /* #if 'Ewald' in KERNEL_ELEC */
124     __m128i          ewitab;
125     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
126     real             *ewtab;
127     /* #endif */
128     /* #if 'PotentialSwitch' in [KERNEL_MOD_ELEC,KERNEL_MOD_VDW] */
129     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
130     real             rswitch_scalar,d_scalar;
131     /* #endif */
132     __m128           dummy_mask,cutoff_mask;
133     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
134     __m128           one     = _mm_set1_ps(1.0);
135     __m128           two     = _mm_set1_ps(2.0);
136     x                = xx[0];
137     f                = ff[0];
138
139     nri              = nlist->nri;
140     iinr             = nlist->iinr;
141     jindex           = nlist->jindex;
142     jjnr             = nlist->jjnr;
143     shiftidx         = nlist->shift;
144     gid              = nlist->gid;
145     shiftvec         = fr->shift_vec[0];
146     fshift           = fr->fshift[0];
147     /* #if KERNEL_ELEC != 'None' */
148     facel            = _mm_set1_ps(fr->epsfac);
149     charge           = mdatoms->chargeA;
150     /*     #if 'ReactionField' in KERNEL_ELEC */
151     krf              = _mm_set1_ps(fr->ic->k_rf);
152     krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
153     crf              = _mm_set1_ps(fr->ic->c_rf);
154     /*     #endif */
155     /* #endif */
156     /* #if KERNEL_VDW != 'None' */
157     nvdwtype         = fr->ntype;
158     vdwparam         = fr->nbfp;
159     vdwtype          = mdatoms->typeA;
160     /* #endif */
161
162     /* #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW */
163     vftab            = kernel_data->table_elec_vdw->data;
164     vftabscale       = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
165     /* #elif 'Table' in KERNEL_ELEC */
166     vftab            = kernel_data->table_elec->data;
167     vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
168     /* #elif 'Table' in KERNEL_VDW */
169     vftab            = kernel_data->table_vdw->data;
170     vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
171     /* #endif */
172
173     /* #if 'Ewald' in KERNEL_ELEC */
174     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
175     /*     #if KERNEL_VF=='Force' and KERNEL_MOD_ELEC!='PotentialSwitch' */
176     ewtab            = fr->ic->tabq_coul_F;
177     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
178     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
179     /*     #else */
180     ewtab            = fr->ic->tabq_coul_FDV0;
181     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
182     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
183      /*     #endif */
184     /* #endif */
185
186     /* #if KERNEL_ELEC=='GeneralizedBorn' */
187     invsqrta         = fr->invsqrta;
188     dvda             = fr->dvda;
189     gbtabscale       = _mm_set1_ps(fr->gbtab.scale);
190     gbtab            = fr->gbtab.data;
191     gbinvepsdiff     = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
192     /* #endif */
193
194     /* #if 'Water' in GEOMETRY_I */
195     /* Setup water-specific parameters */
196     inr              = nlist->iinr[0];
197     /*     #for I in PARTICLES_ELEC_I */
198     iq{I}              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+{I}]));
199     /*     #endfor */
200     /*     #for I in PARTICLES_VDW_I */
201     vdwioffset{I}      = 2*nvdwtype*vdwtype[inr+{I}];
202     /*     #endfor */
203     /* #endif */
204
205     /* #if 'Water' in GEOMETRY_J */
206     /*     #for J in PARTICLES_ELEC_J */
207     jq{J}              = _mm_set1_ps(charge[inr+{J}]);
208     /*     #endfor */
209     /*     #for J in PARTICLES_VDW_J */
210     vdwjidx{J}A        = 2*vdwtype[inr+{J}];
211     /*     #endfor */
212     /*     #for I,J in PAIRS_IJ */
213     /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
214     qq{I}{J}             = _mm_mul_ps(iq{I},jq{J});
215     /*         #endif */
216     /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
217     c6_{I}{J}            = _mm_set1_ps(vdwparam[vdwioffset{I}+vdwjidx{J}A]);
218     c12_{I}{J}           = _mm_set1_ps(vdwparam[vdwioffset{I}+vdwjidx{J}A+1]);
219     /*         #endif */
220     /*     #endfor */
221     /* #endif */
222
223     /* #if KERNEL_MOD_ELEC!='None' or KERNEL_MOD_VDW!='None' */
224     /*     #if KERNEL_ELEC!='None' */
225     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
226     rcutoff_scalar   = fr->rcoulomb;
227     /*     #else */
228     rcutoff_scalar   = fr->rvdw;
229     /*     #endif */
230     rcutoff          = _mm_set1_ps(rcutoff_scalar);
231     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
232     /* #endif */
233
234     /* #if KERNEL_MOD_VDW=='PotentialShift' */
235     sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
236     rvdw             = _mm_set1_ps(fr->rvdw);
237     /* #endif */
238
239     /* #if 'PotentialSwitch' in [KERNEL_MOD_ELEC,KERNEL_MOD_VDW] */
240     /*     #if KERNEL_MOD_ELEC=='PotentialSwitch'  */
241     rswitch_scalar   = fr->rcoulomb_switch;
242     rswitch          = _mm_set1_ps(rswitch_scalar);
243     /*     #else */
244     rswitch_scalar   = fr->rvdw_switch;
245     rswitch          = _mm_set1_ps(rswitch_scalar);
246     /*     #endif */
247     /* Setup switch parameters */
248     d_scalar         = rcutoff_scalar-rswitch_scalar;
249     d                = _mm_set1_ps(d_scalar);
250     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
251     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
252     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
253     /*     #if 'Force' in KERNEL_VF */
254     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
255     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
256     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
257     /*     #endif */
258     /* #endif */
259
260     /* Avoid stupid compiler warnings */
261     jnrA = jnrB = jnrC = jnrD = 0;
262     j_coord_offsetA = 0;
263     j_coord_offsetB = 0;
264     j_coord_offsetC = 0;
265     j_coord_offsetD = 0;
266
267     /* ## Keep track of the floating point operations we issue for reporting! */
268     /* #define OUTERFLOPS 0 */
269     outeriter        = 0;
270     inneriter        = 0;
271
272     for(iidx=0;iidx<4*DIM;iidx++)
273     {
274         scratch[iidx] = 0.0;
275     }
276
277     /* Start outer loop over neighborlists */
278     for(iidx=0; iidx<nri; iidx++)
279     {
280         /* Load shift vector for this list */
281         i_shift_offset   = DIM*shiftidx[iidx];
282
283         /* Load limits for loop over neighbors */
284         j_index_start    = jindex[iidx];
285         j_index_end      = jindex[iidx+1];
286
287         /* Get outer coordinate index */
288         inr              = iinr[iidx];
289         i_coord_offset   = DIM*inr;
290
291         /* Load i particle coords and add shift vector */
292         /* #if GEOMETRY_I == 'Particle' */
293         gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
294         /* #elif GEOMETRY_I == 'Water3' */
295         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
296                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
297         /* #elif GEOMETRY_I == 'Water4' */
298         /*     #if 0 in PARTICLES_I                 */
299         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
300                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
301         /*     #else                                */
302         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
303                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
304         /*     #endif                               */
305         /* #endif                                   */
306
307         /* #if 'Force' in KERNEL_VF */
308         /*     #for I in PARTICLES_I */
309         fix{I}             = _mm_setzero_ps();
310         fiy{I}             = _mm_setzero_ps();
311         fiz{I}             = _mm_setzero_ps();
312         /*     #endfor */
313         /* #endif */
314
315         /* ## For water we already preloaded parameters at the start of the kernel */
316         /* #if not 'Water' in GEOMETRY_I */
317         /* Load parameters for i particles */
318         /*     #for I in PARTICLES_ELEC_I */
319         iq{I}              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+{I}));
320         /*         #define OUTERFLOPS OUTERFLOPS+1 */
321         /*         #if KERNEL_ELEC=='GeneralizedBorn' */
322         isai{I}            = _mm_load1_ps(invsqrta+inr+{I});
323         /*         #endif */
324         /*     #endfor */
325         /*     #for I in PARTICLES_VDW_I */
326         vdwioffset{I}      = 2*nvdwtype*vdwtype[inr+{I}];
327         /*     #endfor */
328         /* #endif */
329
330         /* #if 'Potential' in KERNEL_VF */
331         /* Reset potential sums */
332         /*     #if KERNEL_ELEC != 'None' */
333         velecsum         = _mm_setzero_ps();
334         /*     #endif */
335         /*     #if 'GeneralizedBorn' in KERNEL_ELEC */
336         vgbsum           = _mm_setzero_ps();
337         /*     #endif */
338         /*     #if KERNEL_VDW != 'None' */
339         vvdwsum          = _mm_setzero_ps();
340         /*     #endif */
341         /* #endif */
342         /*     #if 'GeneralizedBorn' in KERNEL_ELEC and 'Force' in KERNEL_VF */
343         dvdasum          = _mm_setzero_ps();
344         /*     #endif */
345
346         /* #for ROUND in ['Loop','Epilogue'] */
347
348         /* #if ROUND =='Loop' */
349         /* Start inner kernel loop */
350         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
351         {
352         /* ## First round is normal loop (next statement resets indentation) */
353         /*     #if 0 */
354         }
355         /*     #endif */
356         /* #else */
357         if(jidx<j_index_end)
358         {
359         /* ## Second round is epilogue */
360         /* #endif */
361         /* #define INNERFLOPS 0 */
362
363             /* Get j neighbor index, and coordinate index */
364             /* #if ROUND =='Loop' */
365             jnrA             = jjnr[jidx];
366             jnrB             = jjnr[jidx+1];
367             jnrC             = jjnr[jidx+2];
368             jnrD             = jjnr[jidx+3];
369             /* #else */
370             jnrlistA         = jjnr[jidx];
371             jnrlistB         = jjnr[jidx+1];
372             jnrlistC         = jjnr[jidx+2];
373             jnrlistD         = jjnr[jidx+3];
374             /* Sign of each element will be negative for non-real atoms.
375              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
376              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
377              */
378             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
379             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
380             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
381             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
382             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
383             /* #endif */
384             j_coord_offsetA  = DIM*jnrA;
385             j_coord_offsetB  = DIM*jnrB;
386             j_coord_offsetC  = DIM*jnrC;
387             j_coord_offsetD  = DIM*jnrD;
388
389             /* load j atom coordinates */
390             /* #if GEOMETRY_J == 'Particle'             */
391             gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
392                                               x+j_coord_offsetC,x+j_coord_offsetD,
393                                               &jx0,&jy0,&jz0);
394             /* #elif GEOMETRY_J == 'Water3'             */
395             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
396                                               x+j_coord_offsetC,x+j_coord_offsetD,
397                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
398             /* #elif GEOMETRY_J == 'Water4'             */
399             /*     #if 0 in PARTICLES_J                 */
400             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
401                                               x+j_coord_offsetC,x+j_coord_offsetD,
402                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
403                                               &jy2,&jz2,&jx3,&jy3,&jz3);
404             /*     #else                                */
405             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
406                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
407                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
408             /*     #endif                               */
409             /* #endif                                   */
410
411             /* Calculate displacement vector */
412             /* #for I,J in PAIRS_IJ */
413             dx{I}{J}             = _mm_sub_ps(ix{I},jx{J});
414             dy{I}{J}             = _mm_sub_ps(iy{I},jy{J});
415             dz{I}{J}             = _mm_sub_ps(iz{I},jz{J});
416             /*     #define INNERFLOPS INNERFLOPS+3 */
417             /* #endfor */
418
419             /* Calculate squared distance and things based on it */
420             /* #for I,J in PAIRS_IJ */
421             rsq{I}{J}            = gmx_mm_calc_rsq_ps(dx{I}{J},dy{I}{J},dz{I}{J});
422             /*     #define INNERFLOPS INNERFLOPS+5 */
423             /* #endfor */
424
425             /* #for I,J in PAIRS_IJ */
426             /*     #if 'rinv' in INTERACTION_FLAGS[I][J] */
427             rinv{I}{J}           = gmx_mm_invsqrt_ps(rsq{I}{J});
428             /*         #define INNERFLOPS INNERFLOPS+5 */
429             /*     #endif */
430             /* #endfor */
431
432             /* #for I,J in PAIRS_IJ */
433             /*     #if 'rinvsq' in INTERACTION_FLAGS[I][J] */
434             /*         # if 'rinv' not in INTERACTION_FLAGS[I][J] */
435             rinvsq{I}{J}         = gmx_mm_inv_ps(rsq{I}{J});
436             /*             #define INNERFLOPS INNERFLOPS+4 */
437             /*         #else */
438             rinvsq{I}{J}         = _mm_mul_ps(rinv{I}{J},rinv{I}{J});
439             /*             #define INNERFLOPS INNERFLOPS+1 */
440             /*         #endif */
441             /*     #endif */
442             /* #endfor */
443
444             /* #if not 'Water' in GEOMETRY_J */
445             /* Load parameters for j particles */
446             /*     #for J in PARTICLES_ELEC_J */
447             jq{J}              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+{J},charge+jnrB+{J},
448                                                               charge+jnrC+{J},charge+jnrD+{J});
449             /*         #if KERNEL_ELEC=='GeneralizedBorn' */
450             isaj{J}            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+{J},invsqrta+jnrB+{J},
451                                                               invsqrta+jnrC+{J},invsqrta+jnrD+{J});
452             /*         #endif */
453             /*     #endfor */
454             /*     #for J in PARTICLES_VDW_J */
455             vdwjidx{J}A        = 2*vdwtype[jnrA+{J}];
456             vdwjidx{J}B        = 2*vdwtype[jnrB+{J}];
457             vdwjidx{J}C        = 2*vdwtype[jnrC+{J}];
458             vdwjidx{J}D        = 2*vdwtype[jnrD+{J}];
459             /*     #endfor */
460             /* #endif */
461
462             /* #if 'Force' in KERNEL_VF and not 'Particle' in GEOMETRY_J */
463             /*     #for J in PARTICLES_J */
464             fjx{J}             = _mm_setzero_ps();
465             fjy{J}             = _mm_setzero_ps();
466             fjz{J}             = _mm_setzero_ps();
467             /*     #endfor */
468             /* #endif */
469
470             /* #for I,J in PAIRS_IJ */
471
472             /**************************
473              * CALCULATE INTERACTIONS *
474              **************************/
475
476             /*     ## Note special check for TIP4P-TIP4P. Since we are cutting of all hydrogen interactions we also cut the LJ-only O-O interaction */
477             /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
478             /*         ## We always calculate rinv/rinvsq above to enable pipelineing in compilers (performance tested on x86) */
479             if (gmx_mm_any_lt(rsq{I}{J},rcutoff2))
480             {
481                 /*     #if 0    ## this and the next two lines is a hack to maintain auto-indentation in template file */
482             }
483             /*         #endif */
484             /*         #define INNERFLOPS INNERFLOPS+1 */
485             /*     #endif */
486
487             /*     #if 'r' in INTERACTION_FLAGS[I][J] */
488             r{I}{J}              = _mm_mul_ps(rsq{I}{J},rinv{I}{J});
489             /*         #if ROUND == 'Epilogue' */
490             r{I}{J}              = _mm_andnot_ps(dummy_mask,r{I}{J});
491             /*             #define INNERFLOPS INNERFLOPS+1 */
492             /*         #endif */
493             /*         #define INNERFLOPS INNERFLOPS+1 */
494             /*     #endif */
495
496             /*     ## For water geometries we already loaded parameters at the start of the kernel */
497             /*     #if not 'Water' in GEOMETRY_J */
498             /* Compute parameters for interactions between i and j atoms */
499             /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
500             qq{I}{J}             = _mm_mul_ps(iq{I},jq{J});
501             /*             #define INNERFLOPS INNERFLOPS+1 */
502             /*         #endif */
503             /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
504             gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset{I}+vdwjidx{J}A,
505                                          vdwparam+vdwioffset{I}+vdwjidx{J}B,
506                                          vdwparam+vdwioffset{I}+vdwjidx{J}C,
507                                          vdwparam+vdwioffset{I}+vdwjidx{J}D,
508                                          &c6_{I}{J},&c12_{I}{J});
509             /*         #endif */
510             /*     #endif */
511
512             /*     #if 'table' in INTERACTION_FLAGS[I][J] */
513             /* Calculate table index by multiplying r with table scale and truncate to integer */
514             rt               = _mm_mul_ps(r{I}{J},vftabscale);
515             vfitab           = _mm_cvttps_epi32(rt);
516             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
517             /*         #define INNERFLOPS INNERFLOPS+4                          */
518             /*         #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW     */
519             /*             ## 3 tables, 4 bytes per point: multiply index by 12 */
520             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
521             /*         #elif 'Table' in KERNEL_ELEC                             */
522             /*             ## 1 table, 4 bytes per point: multiply index by 4   */
523             vfitab           = _mm_slli_epi32(vfitab,2);
524             /*         #elif 'Table' in KERNEL_VDW                              */
525             /*             ## 2 tables, 4 bytes per point: multiply index by 8  */
526             vfitab           = _mm_slli_epi32(vfitab,3);
527             /*         #endif                                                   */
528             /*     #endif */
529
530             /*     ## ELECTROSTATIC INTERACTIONS */
531             /*     #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
532
533             /*         #if KERNEL_ELEC=='Coulomb' */
534
535             /* COULOMB ELECTROSTATICS */
536             velec            = _mm_mul_ps(qq{I}{J},rinv{I}{J});
537             /*             #define INNERFLOPS INNERFLOPS+1 */
538             /*             #if 'Force' in KERNEL_VF */
539             felec            = _mm_mul_ps(velec,rinvsq{I}{J});
540             /*                 #define INNERFLOPS INNERFLOPS+2 */
541             /*             #endif */
542
543             /*         #elif KERNEL_ELEC=='ReactionField' */
544
545             /* REACTION-FIELD ELECTROSTATICS */
546             /*             #if 'Potential' in KERNEL_VF */
547             velec            = _mm_mul_ps(qq{I}{J},_mm_sub_ps(_mm_add_ps(rinv{I}{J},_mm_mul_ps(krf,rsq{I}{J})),crf));
548             /*                 #define INNERFLOPS INNERFLOPS+4 */
549             /*             #endif */
550             /*             #if 'Force' in KERNEL_VF */
551             felec            = _mm_mul_ps(qq{I}{J},_mm_sub_ps(_mm_mul_ps(rinv{I}{J},rinvsq{I}{J}),krf2));
552             /*                 #define INNERFLOPS INNERFLOPS+3 */
553             /*             #endif */
554
555             /*         #elif KERNEL_ELEC=='GeneralizedBorn' */
556
557             /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
558             isaprod          = _mm_mul_ps(isai{I},isaj{J});
559             gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq{I}{J},_mm_mul_ps(isaprod,gbinvepsdiff)));
560             gbscale          = _mm_mul_ps(isaprod,gbtabscale);
561             /*             #define INNERFLOPS INNERFLOPS+5 */
562
563             /* Calculate generalized born table index - this is a separate table from the normal one,
564              * but we use the same procedure by multiplying r with scale and truncating to integer.
565              */
566             rt               = _mm_mul_ps(r{I}{J},gbscale);
567             gbitab           = _mm_cvttps_epi32(rt);
568             gbeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
569             gbitab           = _mm_slli_epi32(gbitab,2);
570             Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
571             F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
572             G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
573             H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
574             _MM_TRANSPOSE4_PS(Y,F,G,H);
575             Heps             = _mm_mul_ps(gbeps,H);
576             Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
577             VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
578             vgb              = _mm_mul_ps(gbqqfactor,VV);
579             /*             #define INNERFLOPS INNERFLOPS+10 */
580
581             /*             #if 'Force' in KERNEL_VF */
582             FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
583             fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
584             dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r{I}{J})));
585             dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
586             /*                 #if ROUND == 'Loop' */
587             fjptrA           = dvda+jnrA;
588             fjptrB           = dvda+jnrB;
589             fjptrC           = dvda+jnrC;
590             fjptrD           = dvda+jnrD;
591             /*                 #else */
592             /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
593             fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
594             fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
595             fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
596             fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
597             /*                 #endif */
598             gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj{J},isaj{J})));
599             /*                 #define INNERFLOPS INNERFLOPS+13 */
600             /*             #endif */
601             velec            = _mm_mul_ps(qq{I}{J},rinv{I}{J});
602             /*                 #define INNERFLOPS INNERFLOPS+1 */
603             /*             #if 'Force' in KERNEL_VF */
604             felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv{I}{J}),fgb),rinv{I}{J});
605             /*                 #define INNERFLOPS INNERFLOPS+3 */
606             /*             #endif */
607
608             /*         #elif KERNEL_ELEC=='Ewald' */
609             /* EWALD ELECTROSTATICS */
610
611             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
612             ewrt             = _mm_mul_ps(r{I}{J},ewtabscale);
613             ewitab           = _mm_cvttps_epi32(ewrt);
614             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
615             /*             #define INNERFLOPS INNERFLOPS+4 */
616             /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_ELEC=='PotentialSwitch' */
617             ewitab           = _mm_slli_epi32(ewitab,2);
618             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
619             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
620             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
621             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
622             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
623             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
624             /*                 #define INNERFLOPS INNERFLOPS+2 */
625             /*                 #if KERNEL_MOD_ELEC=='PotentialShift' */
626             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
627             velec            = _mm_mul_ps(qq{I}{J},_mm_sub_ps(_mm_sub_ps(rinv{I}{J},sh_ewald),velec));
628             /*                     #define INNERFLOPS INNERFLOPS+7 */
629             /*                 #else */
630             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
631             velec            = _mm_mul_ps(qq{I}{J},_mm_sub_ps(rinv{I}{J},velec));
632             /*                     #define INNERFLOPS INNERFLOPS+6 */
633             /*                 #endif */
634             /*                 #if 'Force' in KERNEL_VF */
635             felec            = _mm_mul_ps(_mm_mul_ps(qq{I}{J},rinv{I}{J}),_mm_sub_ps(rinvsq{I}{J},felec));
636             /*                      #define INNERFLOPS INNERFLOPS+3 */
637             /*                 #endif */
638             /*             #elif KERNEL_VF=='Force' */
639             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
640                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
641                                          &ewtabF,&ewtabFn);
642             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
643             felec            = _mm_mul_ps(_mm_mul_ps(qq{I}{J},rinv{I}{J}),_mm_sub_ps(rinvsq{I}{J},felec));
644             /*                 #define INNERFLOPS INNERFLOPS+7 */
645             /*             #endif */
646
647             /*         #elif KERNEL_ELEC=='CubicSplineTable' */
648
649             /* CUBIC SPLINE TABLE ELECTROSTATICS */
650             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
651             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
652             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
653             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
654             _MM_TRANSPOSE4_PS(Y,F,G,H);
655             Heps             = _mm_mul_ps(vfeps,H);
656             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
657             /*             #define INNERFLOPS INNERFLOPS+4 */
658             /*             #if 'Potential' in KERNEL_VF */
659             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
660             velec            = _mm_mul_ps(qq{I}{J},VV);
661             /*                 #define INNERFLOPS INNERFLOPS+3 */
662             /*             #endif */
663             /*             #if 'Force' in KERNEL_VF */
664             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
665             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq{I}{J},FF),_mm_mul_ps(vftabscale,rinv{I}{J})));
666             /*                 #define INNERFLOPS INNERFLOPS+7 */
667             /*             #endif */
668             /*         #endif */
669             /*         ## End of check for electrostatics interaction forms */
670             /*     #endif */
671             /*     ## END OF ELECTROSTATIC INTERACTION CHECK FOR PAIR I-J */
672
673             /*     #if 'vdw' in INTERACTION_FLAGS[I][J] */
674
675             /*         #if KERNEL_VDW=='LennardJones' */
676
677             /* LENNARD-JONES DISPERSION/REPULSION */
678
679             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq{I}{J},rinvsq{I}{J}),rinvsq{I}{J});
680             /*             #define INNERFLOPS INNERFLOPS+2 */
681             /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_VDW=='PotentialSwitch' */
682             vvdw6            = _mm_mul_ps(c6_{I}{J},rinvsix);
683             vvdw12           = _mm_mul_ps(c12_{I}{J},_mm_mul_ps(rinvsix,rinvsix));
684             /*                 #define INNERFLOPS INNERFLOPS+3 */
685             /*                 #if KERNEL_MOD_VDW=='PotentialShift' */
686             vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_{I}{J},_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
687                                           _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_{I}{J},sh_vdw_invrcut6)),one_sixth));
688             /*                     #define INNERFLOPS INNERFLOPS+8 */
689             /*                 #else */
690             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
691             /*                     #define INNERFLOPS INNERFLOPS+3 */
692             /*                 #endif */
693             /*                 ## Check for force inside potential check, i.e. this means we already did the potential part */
694             /*                 #if 'Force' in KERNEL_VF */
695             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq{I}{J});
696             /*                     #define INNERFLOPS INNERFLOPS+2 */
697             /*                 #endif */
698             /*             #elif KERNEL_VF=='Force' */
699             /*                 ## Force-only LennardJones makes it possible to save 1 flop (they do add up...) */
700             fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_{I}{J},rinvsix),c6_{I}{J}),_mm_mul_ps(rinvsix,rinvsq{I}{J}));
701             /*                 #define INNERFLOPS INNERFLOPS+4 */
702             /*             #endif */
703
704             /*         #elif KERNEL_VDW=='CubicSplineTable' */
705
706             /* CUBIC SPLINE TABLE DISPERSION */
707             /*             #if 'Table' in KERNEL_ELEC */
708             vfitab           = _mm_add_epi32(vfitab,ifour);
709             /*             #endif                     */
710             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
711             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
712             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
713             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
714             _MM_TRANSPOSE4_PS(Y,F,G,H);
715             Heps             = _mm_mul_ps(vfeps,H);
716             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
717             /*             #define INNERFLOPS INNERFLOPS+4 */
718             /*             #if 'Potential' in KERNEL_VF */
719             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
720             vvdw6            = _mm_mul_ps(c6_{I}{J},VV);
721             /*                 #define INNERFLOPS INNERFLOPS+3 */
722             /*             #endif */
723             /*             #if 'Force' in KERNEL_VF */
724             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
725             fvdw6            = _mm_mul_ps(c6_{I}{J},FF);
726             /*                 #define INNERFLOPS INNERFLOPS+4 */
727             /*             #endif */
728
729             /* CUBIC SPLINE TABLE REPULSION */
730             vfitab           = _mm_add_epi32(vfitab,ifour);
731             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
732             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
733             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
734             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
735             _MM_TRANSPOSE4_PS(Y,F,G,H);
736             Heps             = _mm_mul_ps(vfeps,H);
737             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
738             /*             #define INNERFLOPS INNERFLOPS+4 */
739             /*             #if 'Potential' in KERNEL_VF */
740             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
741             vvdw12           = _mm_mul_ps(c12_{I}{J},VV);
742             /*                 #define INNERFLOPS INNERFLOPS+3 */
743             /*             #endif */
744             /*             #if 'Force' in KERNEL_VF */
745             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
746             fvdw12           = _mm_mul_ps(c12_{I}{J},FF);
747             /*                 #define INNERFLOPS INNERFLOPS+5 */
748             /*             #endif */
749             /*             #if 'Potential' in KERNEL_VF */
750             vvdw             = _mm_add_ps(vvdw12,vvdw6);
751             /*                 #define INNERFLOPS INNERFLOPS+1 */
752             /*             #endif */
753             /*             #if 'Force' in KERNEL_VF */
754             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv{I}{J})));
755             /*                 #define INNERFLOPS INNERFLOPS+4 */
756             /*             #endif */
757             /*         #endif */
758             /*         ## End of check for vdw interaction forms */
759             /*     #endif */
760             /*     ## END OF VDW INTERACTION CHECK FOR PAIR I-J */
761
762             /*     #if 'switch' in INTERACTION_FLAGS[I][J] */
763             d                = _mm_sub_ps(r{I}{J},rswitch);
764             d                = _mm_max_ps(d,_mm_setzero_ps());
765             d2               = _mm_mul_ps(d,d);
766             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
767             /*         #define INNERFLOPS INNERFLOPS+10 */
768
769             /*         #if 'Force' in KERNEL_VF */
770             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
771             /*             #define INNERFLOPS INNERFLOPS+5 */
772             /*         #endif */
773
774             /* Evaluate switch function */
775             /*         #if 'Force' in KERNEL_VF */
776             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
777             /*             #if 'electrostatics' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_ELEC=='PotentialSwitch' */
778             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv{I}{J},_mm_mul_ps(velec,dsw)) );
779             /*                 #define INNERFLOPS INNERFLOPS+4 */
780             /*             #endif */
781             /*             #if 'vdw' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_VDW=='PotentialSwitch' */
782             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv{I}{J},_mm_mul_ps(vvdw,dsw)) );
783             /*                 #define INNERFLOPS INNERFLOPS+4 */
784             /*             #endif */
785             /*         #endif */
786             /*         #if 'Potential' in KERNEL_VF */
787             /*             #if 'electrostatics' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_ELEC=='PotentialSwitch' */
788             velec            = _mm_mul_ps(velec,sw);
789             /*                 #define INNERFLOPS INNERFLOPS+1 */
790             /*             #endif */
791             /*             #if 'vdw' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_VDW=='PotentialSwitch' */
792             vvdw             = _mm_mul_ps(vvdw,sw);
793             /*                 #define INNERFLOPS INNERFLOPS+1 */
794             /*             #endif */
795             /*         #endif */
796             /*     #endif */
797             /*     ## Note special check for TIP4P-TIP4P. Since we are cutting of all hydrogen interactions we also cut the LJ-only O-O interaction */
798             /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
799             cutoff_mask      = _mm_cmplt_ps(rsq{I}{J},rcutoff2);
800             /*         #define INNERFLOPS INNERFLOPS+1 */
801             /*     #endif */
802
803             /*     #if 'Potential' in KERNEL_VF */
804             /* Update potential sum for this i atom from the interaction with this j atom. */
805             /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
806             /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
807             velec            = _mm_and_ps(velec,cutoff_mask);
808             /*                 #define INNERFLOPS INNERFLOPS+1 */
809             /*             #endif                                       */
810             /*             #if ROUND == 'Epilogue' */
811             velec            = _mm_andnot_ps(dummy_mask,velec);
812             /*             #endif */
813             velecsum         = _mm_add_ps(velecsum,velec);
814             /*             #define INNERFLOPS INNERFLOPS+1 */
815             /*             #if KERNEL_ELEC=='GeneralizedBorn' */
816             /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
817             vgb              = _mm_and_ps(vgb,cutoff_mask);
818             /*                 #define INNERFLOPS INNERFLOPS+1 */
819             /*             #endif                                       */
820             /*             #if ROUND == 'Epilogue' */
821             vgb              = _mm_andnot_ps(dummy_mask,vgb);
822             /*             #endif */
823             vgbsum           = _mm_add_ps(vgbsum,vgb);
824             /*                 #define INNERFLOPS INNERFLOPS+1 */
825             /*             #endif */
826             /*         #endif */
827             /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
828             /*     ## Note special check for TIP4P-TIP4P. Since we are cutting of all hydrogen interactions we also cut the LJ-only O-O interaction */
829             /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
830             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
831             /*                 #define INNERFLOPS INNERFLOPS+1 */
832             /*             #endif                                       */
833             /*             #if ROUND == 'Epilogue' */
834             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
835             /*             #endif */
836             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
837             /*             #define INNERFLOPS INNERFLOPS+1 */
838             /*         #endif */
839             /*     #endif */
840
841             /*     #if 'Force' in KERNEL_VF */
842
843             /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] and 'vdw' in INTERACTION_FLAGS[I][J] */
844             fscal            = _mm_add_ps(felec,fvdw);
845             /*             #define INNERFLOPS INNERFLOPS+1 */
846             /*         #elif 'electrostatics' in INTERACTION_FLAGS[I][J] */
847             fscal            = felec;
848             /*         #elif 'vdw' in INTERACTION_FLAGS[I][J] */
849             fscal            = fvdw;
850             /*        #endif */
851
852             /*     ## Note special check for TIP4P-TIP4P. Since we are cutting of all hydrogen interactions we also cut the LJ-only O-O interaction */
853             /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
854             fscal            = _mm_and_ps(fscal,cutoff_mask);
855             /*                 #define INNERFLOPS INNERFLOPS+1 */
856             /*             #endif                                       */
857
858             /*             #if ROUND == 'Epilogue' */
859             fscal            = _mm_andnot_ps(dummy_mask,fscal);
860             /*             #endif */
861
862             /* Calculate temporary vectorial force */
863             tx               = _mm_mul_ps(fscal,dx{I}{J});
864             ty               = _mm_mul_ps(fscal,dy{I}{J});
865             tz               = _mm_mul_ps(fscal,dz{I}{J});
866
867             /* Update vectorial force */
868             fix{I}             = _mm_add_ps(fix{I},tx);
869             fiy{I}             = _mm_add_ps(fiy{I},ty);
870             fiz{I}             = _mm_add_ps(fiz{I},tz);
871             /*             #define INNERFLOPS INNERFLOPS+6 */
872
873             /* #if GEOMETRY_J == 'Particle'             */
874             /*     #if ROUND == 'Loop' */
875             fjptrA             = f+j_coord_offsetA;
876             fjptrB             = f+j_coord_offsetB;
877             fjptrC             = f+j_coord_offsetC;
878             fjptrD             = f+j_coord_offsetD;
879             /*     #else */
880             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
881             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
882             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
883             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
884             /*     #endif */
885             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
886             /*     #define INNERFLOPS INNERFLOPS+3      */
887             /* #else                                    */
888             fjx{J}             = _mm_add_ps(fjx{J},tx);
889             fjy{J}             = _mm_add_ps(fjy{J},ty);
890             fjz{J}             = _mm_add_ps(fjz{J},tz);
891             /*     #define INNERFLOPS INNERFLOPS+3      */
892             /* #endif                                   */
893
894             /*     #endif */
895
896             /*     ## Note special check for TIP4P-TIP4P. Since we are cutting of all hydrogen interactions we also cut the LJ-only O-O interaction */
897             /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
898             /*         #if 0    ## This and next two lines is a hack to maintain indentation in template file */
899             {
900                 /*     #endif */
901             }
902             /*     #endif */
903             /*    ## End of check for the interaction being outside the cutoff */
904
905             /* #endfor */
906             /* ## End of loop over i-j interaction pairs */
907
908             /* #if GEOMETRY_J != 'Particle' */
909             /*     #if ROUND == 'Loop' */
910             fjptrA             = f+j_coord_offsetA;
911             fjptrB             = f+j_coord_offsetB;
912             fjptrC             = f+j_coord_offsetC;
913             fjptrD             = f+j_coord_offsetD;
914             /*     #else */
915             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
916             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
917             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
918             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
919             /*     #endif */
920             /* #endif */
921
922             /* #if GEOMETRY_J == 'Water3'               */
923             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
924                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
925             /*     #define INNERFLOPS INNERFLOPS+9      */
926             /* #elif GEOMETRY_J == 'Water4'             */
927             /*     #if 0 in PARTICLES_J                 */
928             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
929                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
930                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
931             /*     #define INNERFLOPS INNERFLOPS+12     */
932             /*     #else                                */
933             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
934                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
935             /*     #define INNERFLOPS INNERFLOPS+9      */
936             /*     #endif                               */
937             /* #endif                                   */
938
939             /* Inner loop uses {INNERFLOPS} flops */
940         }
941
942         /* #endfor */
943
944         /* End of innermost loop */
945
946         /* #if 'Force' in KERNEL_VF */
947         /*     #if GEOMETRY_I == 'Particle'            */
948         gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
949                                               f+i_coord_offset,fshift+i_shift_offset);
950         /*         #define OUTERFLOPS OUTERFLOPS+6     */
951         /*     #elif GEOMETRY_I == 'Water3'            */
952         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
953                                               f+i_coord_offset,fshift+i_shift_offset);
954         /*         #define OUTERFLOPS OUTERFLOPS+18    */
955         /*     #elif GEOMETRY_I == 'Water4'            */
956         /*         #if 0 in PARTICLES_I                */
957         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
958                                               f+i_coord_offset,fshift+i_shift_offset);
959         /*             #define OUTERFLOPS OUTERFLOPS+24    */
960         /*         #else                               */
961         gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
962                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
963         /*             #define OUTERFLOPS OUTERFLOPS+18    */
964         /*         #endif                              */
965         /*     #endif                                  */
966         /* #endif                                      */
967
968         /* #if 'Potential' in KERNEL_VF */
969         ggid                        = gid[iidx];
970         /* Update potential energies */
971         /*     #if KERNEL_ELEC != 'None' */
972         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
973         /*         #define OUTERFLOPS OUTERFLOPS+1 */
974         /*     #endif */
975         /*     #if 'GeneralizedBorn' in KERNEL_ELEC */
976         gmx_mm_update_1pot_ps(vgbsum,kernel_data->energygrp_polarization+ggid);
977         /*         #define OUTERFLOPS OUTERFLOPS+1 */
978         /*     #endif */
979         /*     #if KERNEL_VDW != 'None' */
980         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
981         /*         #define OUTERFLOPS OUTERFLOPS+1 */
982         /*     #endif */
983         /* #endif */
984         /*     #if 'GeneralizedBorn' in KERNEL_ELEC and 'Force' in KERNEL_VF */
985         dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai{I},isai{I}));
986         gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
987         /*     #endif */
988
989         /* Increment number of inner iterations */
990         inneriter                  += j_index_end - j_index_start;
991
992         /* Outer loop uses {OUTERFLOPS} flops */
993     }
994
995     /* Increment number of outer iterations */
996     outeriter        += nri;
997
998     /* Update outer/inner flops */
999     /* ## NB: This is not important, it just affects the flopcount. However, since our preprocessor is */
1000     /* ## primitive and replaces aggressively even in strings inside these directives, we need to      */
1001     /* ## assemble the main part of the name (containing KERNEL/ELEC/VDW) directly in the source.      */
1002     /* #if GEOMETRY_I == 'Water3'            */
1003     /*     #define ISUFFIX '_W3'             */
1004     /* #elif GEOMETRY_I == 'Water4'          */
1005     /*     #define ISUFFIX '_W4'             */
1006     /* #else                                 */
1007     /*     #define ISUFFIX ''                */
1008     /* #endif                                */
1009     /* #if GEOMETRY_J == 'Water3'            */
1010     /*     #define JSUFFIX 'W3'              */
1011     /* #elif GEOMETRY_J == 'Water4'          */
1012     /*     #define JSUFFIX 'W4'              */
1013     /* #else                                 */
1014     /*     #define JSUFFIX ''                */
1015     /* #endif                                */
1016     /* #if 'PotentialAndForce' in KERNEL_VF  */
1017     /*     #define VFSUFFIX  '_VF'           */
1018     /* #elif 'Potential' in KERNEL_VF        */
1019     /*     #define VFSUFFIX '_V'             */
1020     /* #else                                 */
1021     /*     #define VFSUFFIX '_F'             */
1022     /* #endif                                */
1023
1024     /* #if KERNEL_ELEC != 'None' and KERNEL_VDW != 'None' */
1025     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
1026     /* #elif KERNEL_ELEC != 'None' */
1027     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
1028     /* #else */
1029     inc_nrnb(nrnb,eNR_NBKERNEL_VDW{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
1030     /* #endif  */
1031 }