Merge gromacs-4-6 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_bluegene / nb_kernel_w4w4_bluegene.h
1 #include "types/simple.h"
2
3 #undef FULL_INTERACTION
4 #undef FULL_INTERACTION_
5 #undef COUL_INTERACTION
6 #undef COUL_INTERACTION_
7 #undef VDW_INTERACTION
8 #undef VDW_INTERACTION_
9
10 #ifdef NO_FORCE
11
12  #define FULL_INTERACTION  nfcalc_interaction
13  #define FULL_INTERACTION_ nfcalc_interaction_
14  #define COUL_INTERACTION  nfcalc_coul_interaction
15  #define COUL_INTERACTION_ nfcalc_coul_interaction_
16  #define VDW_INTERACTION   nfcalc_vdw_interaction
17  #define VDW_INTERACTION_  nfcalc_vdw_interaction_
18
19 #else
20
21  #define FULL_INTERACTION  calc_interaction
22  #define FULL_INTERACTION_ calc_interaction_
23  #define COUL_INTERACTION  calc_coul_interaction
24  #define COUL_INTERACTION_ calc_coul_interaction_
25  #define VDW_INTERACTION   calc_vdw_interaction
26  #define VDW_INTERACTION_  calc_vdw_interaction_
27
28 #endif
29
30
31 void NB_KERNEL (
32
33                     int *   p_nri,
34                     int *   iinr,
35                     int *   jindex,
36                     int *   jjnr,
37                     int *   shift,
38                     real *  shiftvec,
39                     real *  fshift,
40                     int *   gid,
41                     real *  pos,
42                     real *  faction,
43                     real *  charge,
44                     real *  p_facel,
45                     real *  p_krf,
46                     real *  p_crf,
47                     real *  Vc,
48                     int *   type,
49                     int *   p_ntype,
50                     real *  vdwparam,
51                     real *  Vvdw,
52                     real *  p_tabscale,
53                     real *  VFtab,
54                     real *  invsqrta,
55                     real *  dvda,
56                     real *  p_gbtabscale,
57                     real *  GBtab,
58                     int *   p_nthreads,
59                     int *   count,
60                     void *  mtx,
61                     int *   outeriter,
62                     int *   inneriter,
63                     real *  work)
64 {
65     double _Complex qqMM_MM,qqMM_MH,qqMH_MH,qqMH_HH,qqHH_HH;
66     double _Complex zero  = __cmplx(0.0,0.0);
67     double _Complex one   = __cmplx(1.0,1.0);
68     double _Complex rone  = __cmplx(1.0,0.0);
69     double _Complex lr    = __cmplx(1.0,-1.0);
70     double _Complex rl    = __cmplx(-1.0,1.0);
71     double _Complex two   = __cmplx(2.0,2.0);
72     double _Complex three = __cmplx(3.0,3.0);
73
74     double conv1[2],conv2[2],conv3[2],conv4[2];
75
76     real  _qM,_qH,_qqMM,_qqMH,_qqHH;
77     real  _facel,_tabscale,_gbtabscale,_krf,_crf,_c6,_c12,_cexp1,_cexp2;
78
79     int  nri,nti,ntype,nthreads,n,ii,tj,nj0,nj1;
80
81 #pragma disjoint(*shiftvec,*fshift,*pos,*faction,*charge,*p_facel,*p_krf,*p_crf,*Vc, \
82                  *vdwparam,*Vvdw,*p_tabscale,*VFtab,*invsqrta,*dvda,*p_gbtabscale,*GBtab,*work)
83
84     nri              = *p_nri;         
85     ntype            = *p_ntype;       
86     nthreads         = *p_nthreads;    
87     _facel           = *p_facel;
88 #if (COULOMB == COULOMB_TAB || VDW == VDW_TAB)
89     _tabscale        = *p_tabscale;
90 #else
91     _tabscale        = 0.0;
92 #endif
93 #if COULOMB == REACTION_FIELD
94     _krf             = *p_krf;
95     _crf             = *p_crf;
96 #else
97     _krf             = 0.0;
98     _crf             = 0.0;
99 #endif
100 #if COULOMB == GENERALIZED_BORN
101     _gbtabscale      = *p_gbtabscale;
102 #else
103     _gbtabscale      = 0.0;
104 #endif
105
106     ii               = iinr[0];        
107
108     _qH              = charge[ii+1];   
109     _qM              = charge[ii+3];
110
111     _qqMM            = _facel*_qM*_qM;    
112     _qqMH            = _facel*_qM*_qH;
113     _qqHH            = _facel*_qH*_qH;    
114     qqMM_MM          = __cmplx(_qqMM,_qqMM);
115     qqMH_MH          = __cmplx(_qqMH,_qqMH);
116     qqMM_MH          = __cmplx(_qqMM,_qqMH);
117     qqMH_HH          = __cmplx(_qqMH,_qqHH);
118     qqHH_HH          = __cmplx(_qqHH,_qqHH);
119
120 #if VDW == LENNARD_JONES || VDW == VDW_TAB
121     tj               = 2*(ntype+1)*type[ii];
122     _c6              = vdwparam[tj];   
123     _c12             = vdwparam[tj+1]; 
124 #elif VDW == BUCKINGHAM
125     tj               = 3*(ntype+1)*type[ii];
126     _c6              = vdwparam[tj];
127     _cexp1           = vdwparam[tj+1];
128     _cexp2           = vdwparam[tj+2];
129 #endif
130
131     nj1              = 0;              
132     
133     for(n=0; (n<nri); n++)
134     {
135         double _Complex ix1,ix2,ix3,ix4,iy1,iy2,iy3,iy4,iz1,iz2,iz3,iz4,ix14,iy14,iz14,ix23,iy23,iz23;
136         
137         // initialize potential energies and forces for this paricle
138
139         double _Complex vctot     = zero;
140         double _Complex Vvdwtot   = zero;
141         double _Complex fix1      = zero;
142         double _Complex fix2      = zero;
143         double _Complex fix3      = zero;
144         double _Complex fix4      = zero;
145         double _Complex fiy1      = zero;
146         double _Complex fiy2      = zero;
147         double _Complex fiy3      = zero;
148         double _Complex fiy4      = zero;
149         double _Complex fiz1      = zero;
150         double _Complex fiz2      = zero;
151         double _Complex fiz3      = zero;
152         double _Complex fiz4      = zero;
153
154         // shift is the position of the n-th water group
155
156         int is3  = 3*shift[n];
157         
158         // shiftvec is the center of a water group
159
160         real _shX  = shiftvec[is3];  
161         real _shY  = shiftvec[is3+1];
162         real _shZ  = shiftvec[is3+2];
163
164         int ii  = iinr[n];        
165         int ii3 = 3*ii;
166         int k,ggid;
167
168         // add the shift vector to all water atoms
169
170         real _ix1 = _shX + pos[ii3+0];
171         real _iy1 = _shY + pos[ii3+1];
172         real _iz1 = _shZ + pos[ii3+2];
173         real _ix2 = _shX + pos[ii3+3];
174         real _iy2 = _shY + pos[ii3+4];
175         real _iz2 = _shZ + pos[ii3+5];
176         real _ix3 = _shX + pos[ii3+6];
177         real _iy3 = _shY + pos[ii3+7];
178         real _iz3 = _shZ + pos[ii3+8];
179         real _ix4 = _shX + pos[ii3+9];
180         real _iy4 = _shY + pos[ii3+10];
181         real _iz4 = _shZ + pos[ii3+11];
182
183         // clone all positions in complex variables
184         
185         ix1 = __cmplx(_ix1,_ix1);
186         iy1 = __cmplx(_iy1,_iy1);
187         iz1 = __cmplx(_iz1,_iz1);
188               
189         ix2 = __cmplx(_ix2,_ix2);
190         iy2 = __cmplx(_iy2,_iy2);
191         iz2 = __cmplx(_iz2,_iz2);
192
193         ix3 = __cmplx(_ix3,_ix3);
194         iy3 = __cmplx(_iy3,_iy3);
195         iz3 = __cmplx(_iz3,_iz3);
196
197         ix4 = __cmplx(_ix4,_ix4);
198         iy4 = __cmplx(_iy4,_iy4);
199         iz4 = __cmplx(_iz4,_iz4);
200
201         nj0 = jindex[n];      
202         nj1 = jindex[n+1];    
203
204         for(k=nj0; (k<nj1-1); k+=2)
205         {
206             double _Complex jx1,jx2,jx3,jx4,fjx1,fjx2,fjx3,fjx4,tx;
207             double _Complex dx11,dx12,dx13,dx22,dx23,dx24,dx32,dx33,dx34,dx42,dx43,dx44;
208             double _Complex jy1,jy2,jy3,jy4,fjy1,fjy2,fjy3,fjy4,ty;
209             double _Complex dy11,dy12,dy13,dy22,dy23,dy24,dy32,dy33,dy34,dy42,dy43,dy44;
210             double _Complex jz1,jz2,jz3,jz4,fjz1,fjz2,fjz3,fjz4,tz;
211             double _Complex dz11,dz12,dz13,dz22,dz23,dz24,dz32,dz33,dz34,dz42,dz43,dz44;
212             double _Complex rsq11,rsq12,rsq13,rsq22,rsq23,rsq24,rsq32,rsq33,rsq34,rsq42,rsq43,rsq44;
213             double _Complex rinv11,rinv12,rinv13,rinv22,rinv23,rinv24,rinv32,rinv33,rinv34,rinv42,rinv43,rinv44;
214             double _Complex rinvsq,krsq,gbscale,fscal;
215             double _Complex rinvsix,Vvdw6,Vvdw12,Vvdwexp;
216             double _Complex rt,r,rti,n0,eps,Y,F,G,H,GHeps,VV,FF,fijC,fijD;
217             
218             int nnn1,nnn2;
219             int jnr1,jnr2,j31,j32;
220
221             jnr1   = jjnr[k];
222             jnr2   = jjnr[k+1];
223
224             j31    = 3*jnr1;        
225             j32    = 3*jnr2;
226
227             // Coulomb and possibly VdW between i-water and Oxygens of j1- and j2-water
228
229             load6_and_merge(pos,j31  ,j32  ,jx1,jy1,jz1);
230             load6_and_merge(pos,j31+9,j32+9,jx4,jy4,jz4);
231             
232             /*
233             a1 = __lfpd(&pos[j31]);
234             a2 = __lfxd(&pos[j32]);
235             b1 = __lfpd(&pos[j31+2]);
236             b2 = __lfxd(&pos[j32+2]);
237             c1 = __lfpd(&pos[j31+4]);
238             c2 = __lfxd(&pos[j32+4]);
239             
240             jx1 = __fpsel(a1,a2,rone);
241             jx2 = __fpsel(a1,a2,cone);
242             jy1 = __fpsel(b1,b2,rone);
243             jy2 = __fpsel(b1,b2,cone);
244             jz1 = __fpsel(c1,c2,rone);
245             jz2 = __fpsel(c1,c2,cone);
246             */
247
248             dx11   = __fpsub(ix1,jx1);
249             dy11   = __fpsub(iy1,jy1);
250             dz11   = __fpsub(iz1,jz1);
251
252             dx24   = __fpsub(ix2,jx4);
253             dy24   = __fpsub(iy2,jy4);
254             dz24   = __fpsub(iz2,jz4);
255
256             dx34   = __fpsub(ix3,jx4);
257             dy34   = __fpsub(iy3,jy4);
258             dz34   = __fpsub(iz3,jz4);
259
260             dx44   = __fpsub(ix4,jx4);
261             dy44   = __fpsub(iy4,jy4);
262             dz44   = __fpsub(iz4,jz4);
263
264             rsq11  = vdist2(dx11,dy11,dz11);
265             rsq24  = vdist2(dx24,dy24,dz24);
266             rsq34  = vdist2(dx34,dy34,dz34);
267             rsq44  = vdist2(dx44,dy44,dz44);
268
269             rinv11 = __fprsqrte(rsq11);
270             rinv24 = __fprsqrte(rsq24);
271             rinv34 = __fprsqrte(rsq34);
272             rinv44 = __fprsqrte(rsq44);
273
274             rinv11 = sqrt_newton(rinv11,rsq11);
275             rinv24 = sqrt_newton(rinv24,rsq24);
276             rinv34 = sqrt_newton(rinv34,rsq34);
277             rinv44 = sqrt_newton(rinv44,rsq44);
278
279 #ifdef GMX_DOUBLE
280
281             rinv11 = sqrt_newton(rinv11,rsq11);
282             rinv24 = sqrt_newton(rinv24,rsq24);
283             rinv34 = sqrt_newton(rinv34,rsq34);
284             rinv44 = sqrt_newton(rinv44,rsq44);
285
286 #endif
287
288             // only the oxygens will have a VDW-interaction
289
290 #ifndef NO_FORCE
291             load6_and_merge(faction,j31,j32,fjx1,fjy1,fjz1);
292 #endif
293
294             VDW_INTERACTION(rinv11,rsq11,conv1,jnr1,jnr2);
295
296 #ifndef NO_FORCE
297             fix1   = __fpnmsub(fix1,dx11,fscal);
298             fiy1   = __fpnmsub(fiy1,dy11,fscal);      
299             fiz1   = __fpnmsub(fiz1,dz11,fscal);
300
301             fjx1   = __fpmadd(fjx1,dx11,fscal);
302             fjy1   = __fpmadd(fjy1,dy11,fscal);
303             fjz1   = __fpmadd(fjz1,dz11,fscal);
304 #endif
305
306             COUL_INTERACTION(qqMM_MM,rinv44,rsq44,conv4,jnr1,jnr2);
307
308 #ifndef NO_FORCE
309             load6_and_merge (faction,j31+9,j32+9,fjx4,fjy4,fjz4);
310             split_and_store6(faction,j31  ,j32  ,fjx1,fjy1,fjz1);
311
312             fix4   = __fpnmsub(fix4,dx44,fscal);
313             fiy4   = __fpnmsub(fiy4,dy44,fscal);      
314             fiz4   = __fpnmsub(fiz4,dz44,fscal);
315
316             fjx4   = __fpmadd(fjx4,dx44,fscal);
317             fjy4   = __fpmadd(fjy4,dy44,fscal);
318             fjz4   = __fpmadd(fjz4,dz44,fscal);
319 #endif
320
321             COUL_INTERACTION(qqMH_MH,rinv24,rsq24,conv2,jnr1,jnr2);
322
323 #ifndef NO_FORCE
324             fix2   = __fpnmsub(fix2,dx24,fscal);
325             fiy2   = __fpnmsub(fiy2,dy24,fscal);      
326             fiz2   = __fpnmsub(fiz2,dz24,fscal);
327
328             fjx4   = __fpmadd(fjx4,dx24,fscal);
329             fjy4   = __fpmadd(fjy4,dy24,fscal);
330             fjz4   = __fpmadd(fjz4,dz24,fscal);
331 #endif
332
333             COUL_INTERACTION(qqMH_MH,rinv34,rsq34,conv3,jnr1,jnr2);
334
335 #ifndef NO_FORCE
336             fix3   = __fpnmsub(fix3,dx34,fscal);
337             fiy3   = __fpnmsub(fiy3,dy34,fscal);      
338             fiz3   = __fpnmsub(fiz3,dz34,fscal);
339
340             fjx4   = __fpmadd(fjx4,dx34,fscal);
341             fjy4   = __fpmadd(fjy4,dy34,fscal);
342             fjz4   = __fpmadd(fjz4,dz34,fscal);
343
344             split_and_store6(faction,j31+9,j32+9,fjx4,fjy4,fjz4);
345 #endif
346             
347             // Coulomb between i-water and first hydrogen in j1- and j2-water
348
349             load6_and_merge(pos,j31+3,j32+3,jx2,jy2,jz2);
350
351             dx22   = __fpsub(ix2,jx2);
352             dy22   = __fpsub(iy2,jy2);
353             dz22   = __fpsub(iz2,jz2);
354             dx32   = __fpsub(ix3,jx2);
355             dy32   = __fpsub(iy3,jy2);
356             dz32   = __fpsub(iz3,jz2);
357             dx42   = __fpsub(ix4,jx2);
358             dy42   = __fpsub(iy4,jy2);
359             dz42   = __fpsub(iz4,jz2);
360
361             rsq22  = vdist2(dx22,dy22,dz22);
362             rsq32  = vdist2(dx32,dy32,dz32);
363             rsq42  = vdist2(dx42,dy42,dz42);
364
365             rinv22 = __fprsqrte(rsq22);
366             rinv32 = __fprsqrte(rsq32);
367             rinv42 = __fprsqrte(rsq42);
368
369             rinv22 = sqrt_newton(rinv22,rsq22);
370             rinv32 = sqrt_newton(rinv32,rsq32);
371             rinv42 = sqrt_newton(rinv42,rsq42);
372
373 #ifdef GMX_DOUBLE
374
375             rinv22 = sqrt_newton(rinv22,rsq22);
376             rinv32 = sqrt_newton(rinv32,rsq32);
377             rinv42 = sqrt_newton(rinv42,rsq42);
378
379 #endif
380
381 #ifndef NO_FORCE
382             load6_and_merge(faction,j31+3,j32+3,fjx2,fjy2,fjz2);
383 #endif
384
385             COUL_INTERACTION(qqHH_HH,rinv22,rsq22,conv2,jnr1,jnr2);
386
387 #ifndef NO_FORCE
388             fix2   = __fpnmsub(fix2,dx22,fscal);
389             fiy2   = __fpnmsub(fiy2,dy22,fscal);      
390             fiz2   = __fpnmsub(fiz2,dz22,fscal);
391
392             fjx2   = __fpmadd(fjx2,dx22,fscal);
393             fjy2   = __fpmadd(fjy2,dy22,fscal);
394             fjz2   = __fpmadd(fjz2,dz22,fscal);
395 #endif
396
397             COUL_INTERACTION(qqHH_HH,rinv32,rsq32,conv3,jnr1,jnr2);
398
399 #ifndef NO_FORCE
400             fix3   = __fpnmsub(fix3,dx32,fscal);
401             fiy3   = __fpnmsub(fiy3,dy32,fscal);      
402             fiz3   = __fpnmsub(fiz3,dz32,fscal);
403
404             fjx2   = __fpmadd(fjx2,dx32,fscal);
405             fjy2   = __fpmadd(fjy2,dy32,fscal);
406             fjz2   = __fpmadd(fjz2,dz32,fscal);
407 #endif
408
409             COUL_INTERACTION(qqMH_MH,rinv42,rsq42,conv4,jnr1,jnr2);
410
411 #ifndef NO_FORCE
412             fix4   = __fpnmsub(fix4,dx42,fscal);
413             fiy4   = __fpnmsub(fiy4,dy42,fscal);      
414             fiz4   = __fpnmsub(fiz4,dz42,fscal);
415
416             fjx2   = __fpmadd(fjx2,dx42,fscal);
417             fjy2   = __fpmadd(fjy2,dy42,fscal);
418             fjz2   = __fpmadd(fjz2,dz42,fscal);
419
420             split_and_store6(faction,j31+3,j32+3,fjx2,fjy2,fjz2);
421 #endif
422
423             // Coulomb between i-water and second hydrogen in j1- and j2-water
424
425             load6_and_merge(pos,j31+6,j32+6,jx3,jy3,jz3);
426
427             dx43   = __fpsub(ix4,jx3);
428             dy43   = __fpsub(iy4,jy3);
429             dz43   = __fpsub(iz4,jz3);
430
431             dx23   = __fpsub(ix2,jx3);
432             dy23   = __fpsub(iy2,jy3);
433             dz23   = __fpsub(iz2,jz3);
434
435             dx33   = __fpsub(ix3,jx3);
436             dy33   = __fpsub(iy3,jy3);
437             dz33   = __fpsub(iz3,jz3);
438
439             rsq43  = vdist2(dx43,dy43,dz43);
440             rsq23  = vdist2(dx23,dy23,dz23);
441             rsq33  = vdist2(dx33,dy33,dz33);
442
443             rinv43 = __fprsqrte(rsq43);
444             rinv23 = __fprsqrte(rsq23);
445             rinv33 = __fprsqrte(rsq33);
446
447             rinv43 = sqrt_newton(rinv43,rsq43);
448             rinv23 = sqrt_newton(rinv23,rsq23);
449             rinv33 = sqrt_newton(rinv33,rsq33);
450
451 #ifdef GMX_DOUBLE
452
453             rinv43 = sqrt_newton(rinv43,rsq43);
454             rinv23 = sqrt_newton(rinv23,rsq23);
455             rinv33 = sqrt_newton(rinv33,rsq33);
456
457 #endif
458
459 #ifndef NO_FORCE
460             load6_and_merge(faction,j31+6,j32+6,fjx3,fjy3,fjz3);
461 #endif
462
463             COUL_INTERACTION(qqMH_MH,rinv43,rsq43,conv1,jnr1,jnr2);
464
465 #ifndef NO_FORCE
466             fix4   = __fpnmsub(fix4,dx43,fscal);
467             fiy4   = __fpnmsub(fiy4,dy43,fscal);      
468             fiz4   = __fpnmsub(fiz4,dz43,fscal);
469
470             fjx3   = __fpmadd(fjx3,dx43,fscal);
471             fjy3   = __fpmadd(fjy3,dy43,fscal);
472             fjz3   = __fpmadd(fjz3,dz43,fscal);
473 #endif
474
475             COUL_INTERACTION(qqHH_HH,rinv23,rsq23,conv2,jnr1,jnr2);
476
477 #ifndef NO_FORCE
478             split_and_store6(faction,j31+9,j32+9,fjx4,fjy4,fjz4);
479
480             fix2   = __fpnmsub(fix2,dx23,fscal);
481             fiy2   = __fpnmsub(fiy2,dy23,fscal);      
482             fiz2   = __fpnmsub(fiz2,dz23,fscal);
483
484             fjx3   = __fpmadd(fjx3,dx23,fscal);
485             fjy3   = __fpmadd(fjy3,dy23,fscal);
486             fjz3   = __fpmadd(fjz3,dz23,fscal);
487 #endif
488
489             COUL_INTERACTION(qqHH_HH,rinv33,rsq33,conv3,jnr1,jnr2);
490
491 #ifndef NO_FORCE
492             fix3   = __fpnmsub(fix3,dx33,fscal);
493             fiy3   = __fpnmsub(fiy3,dy33,fscal);      
494             fiz3   = __fpnmsub(fiz3,dz33,fscal);
495
496             fjx3   = __fpmadd(fjx3,dx33,fscal);
497             fjy3   = __fpmadd(fjy3,dy33,fscal);
498             fjz3   = __fpmadd(fjz3,dz33,fscal);
499
500             split_and_store6(faction,j31+6,j32+6,fjx3,fjy3,fjz3);
501 #endif
502         }
503
504         ix14 = __cmplx(_ix1,_ix4);
505         iy14 = __cmplx(_iy1,_iy4);
506         iz14 = __cmplx(_iz1,_iz4);
507         ix23 = __cmplx(_ix2,_ix3);
508         iy23 = __cmplx(_iy2,_iy3);
509         iz23 = __cmplx(_iz2,_iz3);
510         
511         for(; (k<nj1); k++)
512         {
513             double _Complex dx14,dx423,dx223,dx323,dx234,tx;
514             double _Complex dy14,dy423,dy223,dy323,dy234,ty;
515             double _Complex dz14,dz423,dz223,dz323,dz234,tz;
516             double _Complex jx23,jy23,jz23,jx14,jy14,jz14;
517             double _Complex fjx23,fjy23,fjz23,fjx14,fjy14,fjz14;
518             double _Complex rsq14,rsq423,rsq223,rsq323,rsq234;
519             double _Complex rinv14,rinv423,rinv223,rinv323,rinv234;
520             double _Complex rinvsq,krsq,fscal;
521             double _Complex rt,r,rti,eps,YF1,YF2,GH1,GH2,Y,F,G,H,GHeps,VV,FF,fijC,fijD,n0;
522             real _dx11,_dy11,_dz11,_rsq11,_rinv11;
523             real _rinvsq,_krsq,_fscal;
524             real _rt,_r,_eps,_Y,_F,_G,_H,_GHeps,_VV,_FF,_fijC,_fijD,_n0;
525             real _rinvsix,_Vvdw6,_Vvdw12,_Vvdwexp;
526
527             int nnn1,nnn2;
528             int jnr,j3;
529
530             jnr    = jjnr[k];
531             j3     = 3*jnr;
532
533             load6_and_merge(pos,j3,j3+9,jx14,jy14,jz14);
534
535             dx14    = __fpsub(ix14,jx14);
536             dy14    = __fpsub(iy14,jy14);
537             dz14    = __fpsub(iz14,jz14);
538
539             dx234   = __fxcpnmsub(ix23,one,__cimag(jx14));
540             dy234   = __fxcpnmsub(iy23,one,__cimag(jy14));
541             dz234   = __fxcpnmsub(iz23,one,__cimag(jz14));
542
543             rsq14   = vdist2(dx14,dy14,dz14);
544             rsq234  = vdist2(dx234,dy234,dz234);
545
546             rinv14  = __fprsqrte(rsq14);
547             rinv234 = __fprsqrte(rsq234);
548
549             rinv14  = sqrt_newton(rinv14,rsq14);
550             rinv234 = sqrt_newton(rinv234,rsq234);
551
552 #ifdef GMX_DOUBLE
553
554             rinv14  = sqrt_newton(rinv14,rsq14);
555             rinv234 = sqrt_newton(rinv234,rsq234);
556
557 #endif
558
559             VDW_INTERACTION_(__creal(rinv14),__creal(rsq14),jnr);
560             fscal = _fscal;
561             COUL_INTERACTION_(_qqMM,__cimag(rinv14),__cimag(rsq14),jnr);
562             fscal = __cmplx(__creal(fscal),_fscal);
563
564 #ifndef NO_FORCE
565
566             load6_and_merge(faction,j3,j3+9,fjx14,fjy14,fjz14);
567
568             tx = __fpmul(dx14,fscal);
569             ty = __fpmul(dy14,fscal);
570             tz = __fpmul(dz14,fscal);
571
572             fix1   -= __creal(tx);
573             fiy1   -= __creal(ty);
574             fiz1   -= __creal(tz);
575             fix4   -= __cimag(tx);
576             fiy4   -= __cimag(ty);
577             fiz4   -= __cimag(tz);
578
579             fjx14   = __fpadd(fjx14,tx);
580             fjy14   = __fpadd(fjy14,ty);
581             fjz14   = __fpadd(fjz14,tz);
582
583 #endif
584
585             COUL_INTERACTION(qqMH_MH,rinv234,rsq234,conv4,jnr,jnr);
586
587             load6_and_merge(pos,j3+3,j3+6,jx23,jy23,jz23);
588
589 #ifndef NO_FORCE
590             tx     = __fpmul(dx234,fscal);
591             ty     = __fpmul(dy234,fscal);
592             tz     = __fpmul(dz234,fscal);
593
594             fix2    = __fpnmsub(fix2,rone,tx);      
595             fiy2    = __fpnmsub(fiy2,rone,ty); 
596             fiz2    = __fpnmsub(fiz2,rone,tz);
597             fix3    = __fxnmsub(fix3,rone,tx);      
598             fiy3    = __fxnmsub(fiy3,rone,ty); 
599             fiz3    = __fxnmsub(fiz3,rone,tz);
600
601             tx      = __fxcxma(tx,tx,rone);
602             ty      = __fxcxma(ty,ty,rone);
603             tz      = __fxcxma(tz,tz,rone);
604
605             fjx14  = __fxmadd(fjx14,rone,tx);
606             fjy14  = __fxmadd(fjy14,rone,ty);
607             fjz14  = __fxmadd(fjz14,rone,tz);
608
609             split_and_store6(faction,j3,j3+9,fjx14,fjy14,fjz14);
610 #endif
611
612             dx423   = __fpsub(ix4,jx23);
613             dy423   = __fpsub(iy4,jy23);
614             dz423   = __fpsub(iz4,jz23);
615
616             dx223   = __fpsub(ix2,jx23);
617             dy223   = __fpsub(iy2,jy23);
618             dz223   = __fpsub(iz2,jz23);
619
620             dx323   = __fpsub(ix3,jx23);
621             dy323   = __fpsub(iy3,jy23);
622             dz323   = __fpsub(iz3,jz23);
623
624             rsq423  = vdist2(dx423,dy423,dz423);
625             rsq223  = vdist2(dx223,dy223,dz223);
626             rsq323  = vdist2(dx323,dy323,dz323);
627
628             rinv423 = __fprsqrte(rsq423);
629             rinv223 = __fprsqrte(rsq223);
630             rinv323 = __fprsqrte(rsq323);
631
632             rinv423 = sqrt_newton(rinv423,rsq423);
633             rinv223 = sqrt_newton(rinv223,rsq223);
634             rinv323 = sqrt_newton(rinv323,rsq323);
635
636 #ifdef GMX_DOUBLE
637
638             rinv423 = sqrt_newton(rinv423,rsq423);
639             rinv223 = sqrt_newton(rinv223,rsq223);
640             rinv323 = sqrt_newton(rinv323,rsq323);
641
642 #endif
643             COUL_INTERACTION(qqMH_MH,rinv423,rsq423,conv1,jnr,jnr);
644
645 #ifndef NO_FORCE
646             load6_and_merge(faction,j3+3,j3+6,fjx23,fjy23,fjz23);
647
648             fix4   = __fpnmsub(fix4,dx423,fscal);
649             fiy4   = __fpnmsub(fiy4,dy423,fscal);      
650             fiz4   = __fpnmsub(fiz4,dz423,fscal);
651
652             fjx23  = __fpmadd(fjx23,dx423,fscal);
653             fjy23  = __fpmadd(fjy23,dy423,fscal);
654             fjz23  = __fpmadd(fjz23,dz423,fscal);
655 #endif
656
657             COUL_INTERACTION(qqHH_HH,rinv223,rsq223,conv2,jnr,jnr);
658
659 #ifndef NO_FORCE
660             fix2   = __fpnmsub(fix2,dx223,fscal);
661             fiy2   = __fpnmsub(fiy2,dy223,fscal);      
662             fiz2   = __fpnmsub(fiz2,dz223,fscal);
663
664             fjx23  = __fpmadd(fjx23,dx223,fscal);
665             fjy23  = __fpmadd(fjy23,dy223,fscal);
666             fjz23  = __fpmadd(fjz23,dz223,fscal);
667 #endif
668
669             COUL_INTERACTION(qqHH_HH,rinv323,rsq323,conv3,jnr,jnr);
670
671 #ifndef NO_FORCE
672             fix3   = __fpnmsub(fix3,dx323,fscal);
673             fiy3   = __fpnmsub(fiy3,dy323,fscal);      
674             fiz3   = __fpnmsub(fiz3,dz323,fscal);
675
676             fjx23  = __fpmadd(fjx23,dx323,fscal);
677             fjy23  = __fpmadd(fjy23,dy323,fscal);
678             fjz23  = __fpmadd(fjz23,dz323,fscal);
679
680             split_and_store6(faction,j3+3,j3+6,fjx23,fjy23,fjz23);
681 #endif
682         }
683
684         ggid = gid[n];
685
686 #ifndef NO_FORCE
687         sum_and_add3(faction,ii3  ,fix1,fiy1,fiz1);
688         sum_and_add3(faction,ii3+3,fix2,fiy2,fiz2);
689         sum_and_add3(faction,ii3+6,fix3,fiy3,fiz3);
690         sum_and_add3(faction,ii3+9,fix4,fiy4,fiz4);
691
692         fshift[is3]      = fshift[is3]
693                          + (__creal(fix1) + __cimag(fix1)) 
694                          + (__creal(fix2) + __cimag(fix2))
695                          + (__creal(fix3) + __cimag(fix3))
696                          + (__creal(fix4) + __cimag(fix4));
697         fshift[is3+1]    = fshift[is3+1]
698                          + (__creal(fiy1) + __cimag(fiy1)) 
699                          + (__creal(fiy2) + __cimag(fiy2))
700                          + (__creal(fiy3) + __cimag(fiy3))
701                          + (__creal(fiy4) + __cimag(fiy4));
702         fshift[is3+2]    = fshift[is3+2]
703                          + (__creal(fiz1) + __cimag(fiz1)) 
704                          + (__creal(fiz2) + __cimag(fiz2))
705                          + (__creal(fiz3) + __cimag(fiz3))
706                          + (__creal(fiz4) + __cimag(fiz4));
707 #endif
708
709 #if COULOMB != COULOMB_NONE
710         Vc[ggid]         += __creal(vctot)   + __cimag(vctot);
711 #endif
712
713 #if VDW != VDW_NONE
714         Vvdw[ggid]       += __creal(Vvdwtot) + __cimag(Vvdwtot);
715 #endif
716     }
717     
718     *outeriter       = nri;            
719     *inneriter       = nj1;            
720 }