Merge release-4-6 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_double / nb_kernel_ElecCoul_VdwNone_GeomW3P1_avx_128_fma_double.c
1 /*
2  * Note: this file was generated by the Gromacs avx_128_fma_double kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_avx_128_fma_double.h"
34 #include "kernelutil_x86_avx_128_fma_double.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_avx_128_fma_double
38  * Electrostatics interaction: Coulomb
39  * VdW interaction:            None
40  * Geometry:                   Water3-Particle
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_avx_128_fma_double
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54      * just 0 for non-waters.
55      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB;
61     int              j_coord_offsetA,j_coord_offsetB;
62     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
63     real             rcutoff_scalar;
64     real             *shiftvec,*fshift,*x,*f;
65     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
66     int              vdwioffset0;
67     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
68     int              vdwioffset1;
69     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
70     int              vdwioffset2;
71     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72     int              vdwjidx0A,vdwjidx0B;
73     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
75     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
76     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
77     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
78     real             *charge;
79     __m128d          dummy_mask,cutoff_mask;
80     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
81     __m128d          one     = _mm_set1_pd(1.0);
82     __m128d          two     = _mm_set1_pd(2.0);
83     x                = xx[0];
84     f                = ff[0];
85
86     nri              = nlist->nri;
87     iinr             = nlist->iinr;
88     jindex           = nlist->jindex;
89     jjnr             = nlist->jjnr;
90     shiftidx         = nlist->shift;
91     gid              = nlist->gid;
92     shiftvec         = fr->shift_vec[0];
93     fshift           = fr->fshift[0];
94     facel            = _mm_set1_pd(fr->epsfac);
95     charge           = mdatoms->chargeA;
96
97     /* Setup water-specific parameters */
98     inr              = nlist->iinr[0];
99     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
100     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
101     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
102
103     /* Avoid stupid compiler warnings */
104     jnrA = jnrB = 0;
105     j_coord_offsetA = 0;
106     j_coord_offsetB = 0;
107
108     outeriter        = 0;
109     inneriter        = 0;
110
111     /* Start outer loop over neighborlists */
112     for(iidx=0; iidx<nri; iidx++)
113     {
114         /* Load shift vector for this list */
115         i_shift_offset   = DIM*shiftidx[iidx];
116
117         /* Load limits for loop over neighbors */
118         j_index_start    = jindex[iidx];
119         j_index_end      = jindex[iidx+1];
120
121         /* Get outer coordinate index */
122         inr              = iinr[iidx];
123         i_coord_offset   = DIM*inr;
124
125         /* Load i particle coords and add shift vector */
126         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
127                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
128
129         fix0             = _mm_setzero_pd();
130         fiy0             = _mm_setzero_pd();
131         fiz0             = _mm_setzero_pd();
132         fix1             = _mm_setzero_pd();
133         fiy1             = _mm_setzero_pd();
134         fiz1             = _mm_setzero_pd();
135         fix2             = _mm_setzero_pd();
136         fiy2             = _mm_setzero_pd();
137         fiz2             = _mm_setzero_pd();
138
139         /* Reset potential sums */
140         velecsum         = _mm_setzero_pd();
141
142         /* Start inner kernel loop */
143         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
144         {
145
146             /* Get j neighbor index, and coordinate index */
147             jnrA             = jjnr[jidx];
148             jnrB             = jjnr[jidx+1];
149             j_coord_offsetA  = DIM*jnrA;
150             j_coord_offsetB  = DIM*jnrB;
151
152             /* load j atom coordinates */
153             gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
154                                               &jx0,&jy0,&jz0);
155
156             /* Calculate displacement vector */
157             dx00             = _mm_sub_pd(ix0,jx0);
158             dy00             = _mm_sub_pd(iy0,jy0);
159             dz00             = _mm_sub_pd(iz0,jz0);
160             dx10             = _mm_sub_pd(ix1,jx0);
161             dy10             = _mm_sub_pd(iy1,jy0);
162             dz10             = _mm_sub_pd(iz1,jz0);
163             dx20             = _mm_sub_pd(ix2,jx0);
164             dy20             = _mm_sub_pd(iy2,jy0);
165             dz20             = _mm_sub_pd(iz2,jz0);
166
167             /* Calculate squared distance and things based on it */
168             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
169             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
170             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
171
172             rinv00           = gmx_mm_invsqrt_pd(rsq00);
173             rinv10           = gmx_mm_invsqrt_pd(rsq10);
174             rinv20           = gmx_mm_invsqrt_pd(rsq20);
175
176             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
177             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
178             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
179
180             /* Load parameters for j particles */
181             jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
182
183             fjx0             = _mm_setzero_pd();
184             fjy0             = _mm_setzero_pd();
185             fjz0             = _mm_setzero_pd();
186
187             /**************************
188              * CALCULATE INTERACTIONS *
189              **************************/
190
191             /* Compute parameters for interactions between i and j atoms */
192             qq00             = _mm_mul_pd(iq0,jq0);
193
194             /* COULOMB ELECTROSTATICS */
195             velec            = _mm_mul_pd(qq00,rinv00);
196             felec            = _mm_mul_pd(velec,rinvsq00);
197
198             /* Update potential sum for this i atom from the interaction with this j atom. */
199             velecsum         = _mm_add_pd(velecsum,velec);
200
201             fscal            = felec;
202
203             /* Update vectorial force */
204             fix0             = _mm_macc_pd(dx00,fscal,fix0);
205             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
206             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
207             
208             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
209             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
210             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
211
212             /**************************
213              * CALCULATE INTERACTIONS *
214              **************************/
215
216             /* Compute parameters for interactions between i and j atoms */
217             qq10             = _mm_mul_pd(iq1,jq0);
218
219             /* COULOMB ELECTROSTATICS */
220             velec            = _mm_mul_pd(qq10,rinv10);
221             felec            = _mm_mul_pd(velec,rinvsq10);
222
223             /* Update potential sum for this i atom from the interaction with this j atom. */
224             velecsum         = _mm_add_pd(velecsum,velec);
225
226             fscal            = felec;
227
228             /* Update vectorial force */
229             fix1             = _mm_macc_pd(dx10,fscal,fix1);
230             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
231             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
232             
233             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
234             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
235             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
236
237             /**************************
238              * CALCULATE INTERACTIONS *
239              **************************/
240
241             /* Compute parameters for interactions between i and j atoms */
242             qq20             = _mm_mul_pd(iq2,jq0);
243
244             /* COULOMB ELECTROSTATICS */
245             velec            = _mm_mul_pd(qq20,rinv20);
246             felec            = _mm_mul_pd(velec,rinvsq20);
247
248             /* Update potential sum for this i atom from the interaction with this j atom. */
249             velecsum         = _mm_add_pd(velecsum,velec);
250
251             fscal            = felec;
252
253             /* Update vectorial force */
254             fix2             = _mm_macc_pd(dx20,fscal,fix2);
255             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
256             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
257             
258             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
259             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
260             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
261
262             gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
263
264             /* Inner loop uses 96 flops */
265         }
266
267         if(jidx<j_index_end)
268         {
269
270             jnrA             = jjnr[jidx];
271             j_coord_offsetA  = DIM*jnrA;
272
273             /* load j atom coordinates */
274             gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
275                                               &jx0,&jy0,&jz0);
276
277             /* Calculate displacement vector */
278             dx00             = _mm_sub_pd(ix0,jx0);
279             dy00             = _mm_sub_pd(iy0,jy0);
280             dz00             = _mm_sub_pd(iz0,jz0);
281             dx10             = _mm_sub_pd(ix1,jx0);
282             dy10             = _mm_sub_pd(iy1,jy0);
283             dz10             = _mm_sub_pd(iz1,jz0);
284             dx20             = _mm_sub_pd(ix2,jx0);
285             dy20             = _mm_sub_pd(iy2,jy0);
286             dz20             = _mm_sub_pd(iz2,jz0);
287
288             /* Calculate squared distance and things based on it */
289             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
290             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
291             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
292
293             rinv00           = gmx_mm_invsqrt_pd(rsq00);
294             rinv10           = gmx_mm_invsqrt_pd(rsq10);
295             rinv20           = gmx_mm_invsqrt_pd(rsq20);
296
297             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
298             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
299             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
300
301             /* Load parameters for j particles */
302             jq0              = _mm_load_sd(charge+jnrA+0);
303
304             fjx0             = _mm_setzero_pd();
305             fjy0             = _mm_setzero_pd();
306             fjz0             = _mm_setzero_pd();
307
308             /**************************
309              * CALCULATE INTERACTIONS *
310              **************************/
311
312             /* Compute parameters for interactions between i and j atoms */
313             qq00             = _mm_mul_pd(iq0,jq0);
314
315             /* COULOMB ELECTROSTATICS */
316             velec            = _mm_mul_pd(qq00,rinv00);
317             felec            = _mm_mul_pd(velec,rinvsq00);
318
319             /* Update potential sum for this i atom from the interaction with this j atom. */
320             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
321             velecsum         = _mm_add_pd(velecsum,velec);
322
323             fscal            = felec;
324
325             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
326
327             /* Update vectorial force */
328             fix0             = _mm_macc_pd(dx00,fscal,fix0);
329             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
330             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
331             
332             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
333             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
334             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
335
336             /**************************
337              * CALCULATE INTERACTIONS *
338              **************************/
339
340             /* Compute parameters for interactions between i and j atoms */
341             qq10             = _mm_mul_pd(iq1,jq0);
342
343             /* COULOMB ELECTROSTATICS */
344             velec            = _mm_mul_pd(qq10,rinv10);
345             felec            = _mm_mul_pd(velec,rinvsq10);
346
347             /* Update potential sum for this i atom from the interaction with this j atom. */
348             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
349             velecsum         = _mm_add_pd(velecsum,velec);
350
351             fscal            = felec;
352
353             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
354
355             /* Update vectorial force */
356             fix1             = _mm_macc_pd(dx10,fscal,fix1);
357             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
358             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
359             
360             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
361             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
362             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
363
364             /**************************
365              * CALCULATE INTERACTIONS *
366              **************************/
367
368             /* Compute parameters for interactions between i and j atoms */
369             qq20             = _mm_mul_pd(iq2,jq0);
370
371             /* COULOMB ELECTROSTATICS */
372             velec            = _mm_mul_pd(qq20,rinv20);
373             felec            = _mm_mul_pd(velec,rinvsq20);
374
375             /* Update potential sum for this i atom from the interaction with this j atom. */
376             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
377             velecsum         = _mm_add_pd(velecsum,velec);
378
379             fscal            = felec;
380
381             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
382
383             /* Update vectorial force */
384             fix2             = _mm_macc_pd(dx20,fscal,fix2);
385             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
386             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
387             
388             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
389             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
390             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
391
392             gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
393
394             /* Inner loop uses 96 flops */
395         }
396
397         /* End of innermost loop */
398
399         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
400                                               f+i_coord_offset,fshift+i_shift_offset);
401
402         ggid                        = gid[iidx];
403         /* Update potential energies */
404         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
405
406         /* Increment number of inner iterations */
407         inneriter                  += j_index_end - j_index_start;
408
409         /* Outer loop uses 19 flops */
410     }
411
412     /* Increment number of outer iterations */
413     outeriter        += nri;
414
415     /* Update outer/inner flops */
416
417     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*96);
418 }
419 /*
420  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_avx_128_fma_double
421  * Electrostatics interaction: Coulomb
422  * VdW interaction:            None
423  * Geometry:                   Water3-Particle
424  * Calculate force/pot:        Force
425  */
426 void
427 nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_avx_128_fma_double
428                     (t_nblist * gmx_restrict                nlist,
429                      rvec * gmx_restrict                    xx,
430                      rvec * gmx_restrict                    ff,
431                      t_forcerec * gmx_restrict              fr,
432                      t_mdatoms * gmx_restrict               mdatoms,
433                      nb_kernel_data_t * gmx_restrict        kernel_data,
434                      t_nrnb * gmx_restrict                  nrnb)
435 {
436     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
437      * just 0 for non-waters.
438      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
439      * jnr indices corresponding to data put in the four positions in the SIMD register.
440      */
441     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
442     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
443     int              jnrA,jnrB;
444     int              j_coord_offsetA,j_coord_offsetB;
445     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
446     real             rcutoff_scalar;
447     real             *shiftvec,*fshift,*x,*f;
448     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
449     int              vdwioffset0;
450     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
451     int              vdwioffset1;
452     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
453     int              vdwioffset2;
454     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
455     int              vdwjidx0A,vdwjidx0B;
456     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
457     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
458     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
459     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
460     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
461     real             *charge;
462     __m128d          dummy_mask,cutoff_mask;
463     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
464     __m128d          one     = _mm_set1_pd(1.0);
465     __m128d          two     = _mm_set1_pd(2.0);
466     x                = xx[0];
467     f                = ff[0];
468
469     nri              = nlist->nri;
470     iinr             = nlist->iinr;
471     jindex           = nlist->jindex;
472     jjnr             = nlist->jjnr;
473     shiftidx         = nlist->shift;
474     gid              = nlist->gid;
475     shiftvec         = fr->shift_vec[0];
476     fshift           = fr->fshift[0];
477     facel            = _mm_set1_pd(fr->epsfac);
478     charge           = mdatoms->chargeA;
479
480     /* Setup water-specific parameters */
481     inr              = nlist->iinr[0];
482     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
483     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
484     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
485
486     /* Avoid stupid compiler warnings */
487     jnrA = jnrB = 0;
488     j_coord_offsetA = 0;
489     j_coord_offsetB = 0;
490
491     outeriter        = 0;
492     inneriter        = 0;
493
494     /* Start outer loop over neighborlists */
495     for(iidx=0; iidx<nri; iidx++)
496     {
497         /* Load shift vector for this list */
498         i_shift_offset   = DIM*shiftidx[iidx];
499
500         /* Load limits for loop over neighbors */
501         j_index_start    = jindex[iidx];
502         j_index_end      = jindex[iidx+1];
503
504         /* Get outer coordinate index */
505         inr              = iinr[iidx];
506         i_coord_offset   = DIM*inr;
507
508         /* Load i particle coords and add shift vector */
509         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
510                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
511
512         fix0             = _mm_setzero_pd();
513         fiy0             = _mm_setzero_pd();
514         fiz0             = _mm_setzero_pd();
515         fix1             = _mm_setzero_pd();
516         fiy1             = _mm_setzero_pd();
517         fiz1             = _mm_setzero_pd();
518         fix2             = _mm_setzero_pd();
519         fiy2             = _mm_setzero_pd();
520         fiz2             = _mm_setzero_pd();
521
522         /* Start inner kernel loop */
523         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
524         {
525
526             /* Get j neighbor index, and coordinate index */
527             jnrA             = jjnr[jidx];
528             jnrB             = jjnr[jidx+1];
529             j_coord_offsetA  = DIM*jnrA;
530             j_coord_offsetB  = DIM*jnrB;
531
532             /* load j atom coordinates */
533             gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
534                                               &jx0,&jy0,&jz0);
535
536             /* Calculate displacement vector */
537             dx00             = _mm_sub_pd(ix0,jx0);
538             dy00             = _mm_sub_pd(iy0,jy0);
539             dz00             = _mm_sub_pd(iz0,jz0);
540             dx10             = _mm_sub_pd(ix1,jx0);
541             dy10             = _mm_sub_pd(iy1,jy0);
542             dz10             = _mm_sub_pd(iz1,jz0);
543             dx20             = _mm_sub_pd(ix2,jx0);
544             dy20             = _mm_sub_pd(iy2,jy0);
545             dz20             = _mm_sub_pd(iz2,jz0);
546
547             /* Calculate squared distance and things based on it */
548             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
549             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
550             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
551
552             rinv00           = gmx_mm_invsqrt_pd(rsq00);
553             rinv10           = gmx_mm_invsqrt_pd(rsq10);
554             rinv20           = gmx_mm_invsqrt_pd(rsq20);
555
556             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
557             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
558             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
559
560             /* Load parameters for j particles */
561             jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
562
563             fjx0             = _mm_setzero_pd();
564             fjy0             = _mm_setzero_pd();
565             fjz0             = _mm_setzero_pd();
566
567             /**************************
568              * CALCULATE INTERACTIONS *
569              **************************/
570
571             /* Compute parameters for interactions between i and j atoms */
572             qq00             = _mm_mul_pd(iq0,jq0);
573
574             /* COULOMB ELECTROSTATICS */
575             velec            = _mm_mul_pd(qq00,rinv00);
576             felec            = _mm_mul_pd(velec,rinvsq00);
577
578             fscal            = felec;
579
580             /* Update vectorial force */
581             fix0             = _mm_macc_pd(dx00,fscal,fix0);
582             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
583             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
584             
585             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
586             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
587             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
588
589             /**************************
590              * CALCULATE INTERACTIONS *
591              **************************/
592
593             /* Compute parameters for interactions between i and j atoms */
594             qq10             = _mm_mul_pd(iq1,jq0);
595
596             /* COULOMB ELECTROSTATICS */
597             velec            = _mm_mul_pd(qq10,rinv10);
598             felec            = _mm_mul_pd(velec,rinvsq10);
599
600             fscal            = felec;
601
602             /* Update vectorial force */
603             fix1             = _mm_macc_pd(dx10,fscal,fix1);
604             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
605             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
606             
607             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
608             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
609             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
610
611             /**************************
612              * CALCULATE INTERACTIONS *
613              **************************/
614
615             /* Compute parameters for interactions between i and j atoms */
616             qq20             = _mm_mul_pd(iq2,jq0);
617
618             /* COULOMB ELECTROSTATICS */
619             velec            = _mm_mul_pd(qq20,rinv20);
620             felec            = _mm_mul_pd(velec,rinvsq20);
621
622             fscal            = felec;
623
624             /* Update vectorial force */
625             fix2             = _mm_macc_pd(dx20,fscal,fix2);
626             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
627             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
628             
629             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
630             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
631             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
632
633             gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
634
635             /* Inner loop uses 93 flops */
636         }
637
638         if(jidx<j_index_end)
639         {
640
641             jnrA             = jjnr[jidx];
642             j_coord_offsetA  = DIM*jnrA;
643
644             /* load j atom coordinates */
645             gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
646                                               &jx0,&jy0,&jz0);
647
648             /* Calculate displacement vector */
649             dx00             = _mm_sub_pd(ix0,jx0);
650             dy00             = _mm_sub_pd(iy0,jy0);
651             dz00             = _mm_sub_pd(iz0,jz0);
652             dx10             = _mm_sub_pd(ix1,jx0);
653             dy10             = _mm_sub_pd(iy1,jy0);
654             dz10             = _mm_sub_pd(iz1,jz0);
655             dx20             = _mm_sub_pd(ix2,jx0);
656             dy20             = _mm_sub_pd(iy2,jy0);
657             dz20             = _mm_sub_pd(iz2,jz0);
658
659             /* Calculate squared distance and things based on it */
660             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
661             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
662             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
663
664             rinv00           = gmx_mm_invsqrt_pd(rsq00);
665             rinv10           = gmx_mm_invsqrt_pd(rsq10);
666             rinv20           = gmx_mm_invsqrt_pd(rsq20);
667
668             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
669             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
670             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
671
672             /* Load parameters for j particles */
673             jq0              = _mm_load_sd(charge+jnrA+0);
674
675             fjx0             = _mm_setzero_pd();
676             fjy0             = _mm_setzero_pd();
677             fjz0             = _mm_setzero_pd();
678
679             /**************************
680              * CALCULATE INTERACTIONS *
681              **************************/
682
683             /* Compute parameters for interactions between i and j atoms */
684             qq00             = _mm_mul_pd(iq0,jq0);
685
686             /* COULOMB ELECTROSTATICS */
687             velec            = _mm_mul_pd(qq00,rinv00);
688             felec            = _mm_mul_pd(velec,rinvsq00);
689
690             fscal            = felec;
691
692             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
693
694             /* Update vectorial force */
695             fix0             = _mm_macc_pd(dx00,fscal,fix0);
696             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
697             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
698             
699             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
700             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
701             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
702
703             /**************************
704              * CALCULATE INTERACTIONS *
705              **************************/
706
707             /* Compute parameters for interactions between i and j atoms */
708             qq10             = _mm_mul_pd(iq1,jq0);
709
710             /* COULOMB ELECTROSTATICS */
711             velec            = _mm_mul_pd(qq10,rinv10);
712             felec            = _mm_mul_pd(velec,rinvsq10);
713
714             fscal            = felec;
715
716             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
717
718             /* Update vectorial force */
719             fix1             = _mm_macc_pd(dx10,fscal,fix1);
720             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
721             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
722             
723             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
724             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
725             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
726
727             /**************************
728              * CALCULATE INTERACTIONS *
729              **************************/
730
731             /* Compute parameters for interactions between i and j atoms */
732             qq20             = _mm_mul_pd(iq2,jq0);
733
734             /* COULOMB ELECTROSTATICS */
735             velec            = _mm_mul_pd(qq20,rinv20);
736             felec            = _mm_mul_pd(velec,rinvsq20);
737
738             fscal            = felec;
739
740             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
741
742             /* Update vectorial force */
743             fix2             = _mm_macc_pd(dx20,fscal,fix2);
744             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
745             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
746             
747             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
748             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
749             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
750
751             gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
752
753             /* Inner loop uses 93 flops */
754         }
755
756         /* End of innermost loop */
757
758         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
759                                               f+i_coord_offset,fshift+i_shift_offset);
760
761         /* Increment number of inner iterations */
762         inneriter                  += j_index_end - j_index_start;
763
764         /* Outer loop uses 18 flops */
765     }
766
767     /* Increment number of outer iterations */
768     outeriter        += nri;
769
770     /* Update outer/inner flops */
771
772     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*93);
773 }