Merge gromacs-4-6 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_bluegene / nb_kernel_w4_bluegene.h
1 #include "types/simple.h"
2
3 #undef FULL_INTERACTION
4 #undef FULL_INTERACTION_
5 #undef COUL_INTERACTION
6 #undef COUL_INTERACTION_
7 #undef VDW_INTERACTION
8 #undef VDW_INTERACTION_
9
10 #ifdef NO_FORCE
11
12  #define FULL_INTERACTION  nfcalc_interaction
13  #define FULL_INTERACTION_ nfcalc_interaction_
14  #define COUL_INTERACTION  nfcalc_coul_interaction
15  #define COUL_INTERACTION_ nfcalc_coul_interaction_
16  #define VDW_INTERACTION   nfcalc_vdw_interaction
17  #define VDW_INTERACTION_  nfcalc_vdw_interaction_
18
19 #else
20
21  #define FULL_INTERACTION  calc_interaction
22  #define FULL_INTERACTION_ calc_interaction_
23  #define COUL_INTERACTION  calc_coul_interaction
24  #define COUL_INTERACTION_ calc_coul_interaction_
25  #define VDW_INTERACTION   calc_vdw_interaction
26  #define VDW_INTERACTION_  calc_vdw_interaction_
27
28 #endif
29
30
31 void NB_KERNEL (
32                     int *    p_nri,
33                     int *    iinr,
34                     int *    jindex,
35                     int *    jjnr,
36                     int *    shift,
37                     real *   shiftvec,
38                     real *   fshift,
39                     int *    gid,
40                     real *   pos,
41                     real *   faction,
42                     real *   charge,
43                     real *   p_facel,
44                     real *   p_krf,
45                     real *   p_crf,
46                     real *   Vc,
47                     int *    type,
48                     int *    p_ntype,
49                     real *   vdwparam,
50                     real *   Vvdw,
51                     real *   p_tabscale,
52                     real *   VFtab,
53                     real *   invsqrta,
54                     real *   dvda,
55                     real *   p_gbtabscale,
56                     real *   GBtab,
57                     int *    p_nthreads,
58                     int *    count,
59                     void *   mtx,
60                     int *    outeriter,
61                     int *    inneriter,
62                     real *   work)
63 {
64     double _Complex qM,qH;
65     double _Complex zero  = __cmplx(0.0,0.0);
66     double _Complex rone  = __cmplx(1.0,0.0);
67     double _Complex three = __cmplx(3.0,3.0);
68     double conv1[2],conv2[2],conv3[2],conv4[2];
69
70     real            _qM,_qH,_facel,_tabscale,_gbtabscale,_krf,_crf;
71
72     int             nri,ntype,nthreads,n,ii,nj0,nj1,nti;
73
74 #pragma disjoint(*shiftvec,*fshift,*pos,*faction,*charge,*p_facel,*p_krf,*p_crf,*Vc, \
75                  *vdwparam,*Vvdw,*p_tabscale,*VFtab,*invsqrta,*dvda,*p_gbtabscale,*GBtab,*work)
76
77     nri              = *p_nri;         
78     ntype            = *p_ntype;       
79     nthreads         = *p_nthreads;    
80     _facel           = *p_facel;       
81 #if (COULOMB == COULOMB_TAB || VDW == VDW_TAB)
82     _tabscale        = *p_tabscale;
83 #else
84     _tabscale        = 0.0;
85 #endif
86 #if COULOMB == REACTION_FIELD
87     _krf             = *p_krf;
88     _crf             = *p_crf;
89 #else
90     _krf             = 0.0;
91     _crf             = 0.0;
92 #endif
93 #if COULOMB == GENERALIZED_BORN
94     _gbtabscale      = *p_gbtabscale;
95 #else
96     _gbtabscale      = 0.0;
97 #endif
98     ii               = iinr[0];        
99
100     _qH              = _facel * charge[ii+1];   
101     _qM              = _facel * charge[ii+3];
102     qM               = __cmplx(_qM,_qM);
103     qH               = __cmplx(_qH,_qH);
104 #if VDW == BUCKINGHAM
105     nti              = 3*ntype*type[ii];
106 #else
107     nti              = 2*ntype*type[ii];
108 #endif
109     
110     nj1 = 0;
111
112     for(n=0; (n<nri); n++)
113     {
114         double _Complex ix1,ix2,ix3,ix4,iy1,iy2,iy3,iy4,iz1,iz2,iz3,iz4,ix23,iy23,iz23,ix14,iy14,iz14;
115         double _Complex fix,fiy,fiz;
116         
117         // initialize potential energies and forces for this paricle
118
119         double _Complex vctot     = zero;              
120         double _Complex Vvdwtot   = zero;              
121         double _Complex fix1      = zero;
122         double _Complex fix2      = zero;
123         double _Complex fix3      = zero;
124         double _Complex fix4      = zero;
125         double _Complex fiy1      = zero;
126         double _Complex fiy2      = zero;
127         double _Complex fiy3      = zero;
128         double _Complex fiy4      = zero;
129         double _Complex fiz1      = zero;
130         double _Complex fiz2      = zero;
131         double _Complex fiz3      = zero;
132         double _Complex fiz4      = zero;
133
134         // shift is the position of the n-th water group
135
136         int is3  = 3*shift[n];
137         
138         // shiftvec is the center of a water group
139
140         real  _shX  = shiftvec[is3+0];  
141         real  _shY  = shiftvec[is3+1];
142         real  _shZ  = shiftvec[is3+2];
143
144         int ii  = iinr[n];        
145         int ii3 = 3*ii;
146         int k,ggid;
147
148         // add the shift vector to all water atoms
149
150         real  _ix1 = _shX + pos[ii3+0];
151         real  _iy1 = _shY + pos[ii3+1];
152         real  _iz1 = _shZ + pos[ii3+2];
153         real  _ix2 = _shX + pos[ii3+3];
154         real  _iy2 = _shY + pos[ii3+4];
155         real  _iz2 = _shZ + pos[ii3+5];
156         real  _ix3 = _shX + pos[ii3+6];
157         real  _iy3 = _shY + pos[ii3+7];
158         real  _iz3 = _shZ + pos[ii3+8];
159         real  _ix4 = _shX + pos[ii3+9];
160         real  _iy4 = _shY + pos[ii3+10];
161         real  _iz4 = _shZ + pos[ii3+11];
162
163         // clone all positions in complex variables
164         
165         ix1 = __cmplx(_ix1,_ix1);
166         iy1 = __cmplx(_iy1,_iy1);
167         iz1 = __cmplx(_iz1,_iz1);
168               
169         ix2 = __cmplx(_ix2,_ix2);
170         iy2 = __cmplx(_iy2,_iy2);
171         iz2 = __cmplx(_iz2,_iz2);
172
173         ix3 = __cmplx(_ix3,_ix3);
174         iy3 = __cmplx(_iy3,_iy3);
175         iz3 = __cmplx(_iz3,_iz3);
176
177         ix4 = __cmplx(_ix4,_ix4);
178         iy4 = __cmplx(_iy4,_iy4);
179         iz4 = __cmplx(_iz4,_iz4);
180
181         nj0 = jindex[n];
182         nj1 = jindex[n+1];
183
184         // unrolled twice for SIMDization
185
186         for(k=nj0; (k<nj1-1); k+=2)
187         {
188             double _Complex dx11,dx21,dx31,dx41,jx,fjx;
189             double _Complex dy11,dy21,dy31,dy41,jy,fjy;
190             double _Complex dz11,dz21,dz31,dz41,jz,fjz;
191             double _Complex rsq11,rsq21,rsq31,rsq41;
192             double _Complex rinv11,rinv21,rinv31,rinv41;
193             double _Complex rinvsq,krsq,fscal;
194             double _Complex rt,rti,r,n0,eps,Y,F,G,H,GHeps,VV,FF,fijC,fijD,qj,qq;
195             double _Complex c6,c12,cexp1,cexp2,Vvdwexp,Vvdw6,Vvdw12,rinvsix;
196
197             int nnn1,nnn2;
198             int jnr1,jnr2,j1,j2,tj1,tj2;
199
200             jnr1  = jjnr[k];
201             jnr2  = jjnr[k+1];
202
203             j1    = 3*jnr1;        
204             j2    = 3*jnr2;
205
206             load6_and_merge(pos,j1,j2,jx,jy,jz);
207
208             dx11   = __fpsub(ix1,jx);
209             dy11   = __fpsub(iy1,jy);
210             dz11   = __fpsub(iz1,jz);
211             dx21   = __fpsub(ix2,jx);
212             dy21   = __fpsub(iy2,jy);
213             dz21   = __fpsub(iz2,jz);
214             dx31   = __fpsub(ix3,jx);
215             dy31   = __fpsub(iy3,jy);
216             dz31   = __fpsub(iz3,jz);
217             dx41   = __fpsub(ix4,jx);
218             dy41   = __fpsub(iy4,jy);
219             dz41   = __fpsub(iz4,jz);
220
221             qj = __cmplx(charge[jnr1],charge[jnr2]);
222
223             rsq11  = vdist2(dx11,dy11,dz11);
224             rsq21  = vdist2(dx21,dy21,dz21);
225             rsq31  = vdist2(dx31,dy31,dz31);
226             rsq41  = vdist2(dx41,dy41,dz41);
227
228             rinv11 = __fprsqrte(rsq11);
229             rinv21 = __fprsqrte(rsq21);
230             rinv31 = __fprsqrte(rsq31);
231             rinv41 = __fprsqrte(rsq41);
232
233             rinv11 = sqrt_newton(rinv11,rsq11);
234             rinv21 = sqrt_newton(rinv21,rsq21);
235             rinv31 = sqrt_newton(rinv31,rsq31);
236             rinv41 = sqrt_newton(rinv41,rsq41);
237
238 #ifdef GMX_DOUBLE
239
240             rinv11 = sqrt_newton(rinv11,rsq11);
241             rinv21 = sqrt_newton(rinv21,rsq21);
242             rinv31 = sqrt_newton(rinv31,rsq31);
243             rinv41 = sqrt_newton(rinv41,rsq41);
244
245 #endif
246
247 #ifndef NO_FORCE
248             load6_and_merge(faction,j1,j2,fjx,fjy,fjz);
249 #endif
250
251             VDW_INTERACTION(rinv11,rsq11,conv1,jnr1,jnr2);
252
253             qq = __fpmul(qH,qj);
254
255 #ifndef NO_FORCE
256             fix1  = __fpnmsub(fix1,dx11,fscal);
257             fiy1  = __fpnmsub(fiy1,dy11,fscal);      
258             fiz1  = __fpnmsub(fiz1,dz11,fscal);
259
260             fjx   = __fpmadd(fjx,dx11,fscal);
261             fjy   = __fpmadd(fjy,dy11,fscal);
262             fjz   = __fpmadd(fjz,dz11,fscal);
263 #endif
264
265             COUL_INTERACTION(qq,rinv21,rsq21,conv2,jnr1,jnr2);
266
267 #ifndef NO_FORCE
268             fix2  = __fpnmsub(fix2,dx21,fscal);
269             fiy2  = __fpnmsub(fiy2,dy21,fscal);      
270             fiz2  = __fpnmsub(fiz2,dz21,fscal);
271
272             fjx   = __fpmadd(fjx,dx21,fscal);
273             fjy   = __fpmadd(fjy,dy21,fscal);
274             fjz   = __fpmadd(fjz,dz21,fscal);
275 #endif
276
277             COUL_INTERACTION(qq,rinv31,rsq31,conv3,jnr1,jnr2);
278
279             qq = __fpmul(qM,qj);
280
281 #ifndef NO_FORCE
282             fix3  = __fpnmsub(fix3,dx31,fscal);
283             fiy3  = __fpnmsub(fiy3,dy31,fscal);      
284             fiz3  = __fpnmsub(fiz3,dz31,fscal);
285
286             fjx   = __fpmadd(fjx,dx31,fscal);
287             fjy   = __fpmadd(fjy,dy31,fscal);
288             fjz   = __fpmadd(fjz,dz31,fscal);
289 #endif
290
291             COUL_INTERACTION(qq,rinv41,rsq41,conv4,jnr1,jnr2);
292
293 #ifndef NO_FORCE
294             fix4  = __fpnmsub(fix4,dx41,fscal);
295             fiy4  = __fpnmsub(fiy4,dy41,fscal);      
296             fiz4  = __fpnmsub(fiz4,dz41,fscal);
297
298             fjx   = __fpmadd(fjx,dx41,fscal);
299             fjy   = __fpmadd(fjy,dy41,fscal);
300             fjz   = __fpmadd(fjz,dz41,fscal);
301
302             split_and_store6(faction,j1,j2,fjx,fjy,fjz);
303 #endif
304         }
305
306         ix14 = __cmplx(_ix1,_ix4);
307         iy14 = __cmplx(_iy1,_iy4);
308         iz14 = __cmplx(_iz1,_iz4);
309
310         ix23 = __cmplx(_ix2,_ix3);
311         iy23 = __cmplx(_iy2,_iy3);
312         iz23 = __cmplx(_iz2,_iz3);
313
314         // actually we should not simdize the remainder loop, because it's bit slower
315
316         for(;(k<nj1); k++)
317         {
318             double _Complex dx23,dy23,dz23,dx14,dy14,dz14,jx,jy,jz;
319             double _Complex fjx,fjy,fjz,tx,ty,tz;
320             double _Complex rsq23,rinv23,rsq14,rinv14;
321             double _Complex rinvsq,krsq,fscal;
322             double _Complex rt,rti,r,eps,Y,F,G,H,GHeps,VV,FF,fijC,fijD,n0,qq;
323             
324             real _rinvsq,_krsq,_fscal;
325             real _rt,_r,_eps,_Y,_F,_H,_G,_GHeps,_VV,_FF,_fijC,_fijD,_n0,_qq;
326             real _qj,_c6,_c12,_cexp1,_cexp2,_Vvdwexp,_Vvdw6,_Vvdw12,_rinvsix;
327
328             int nnn1,nnn2;
329             int jnr,j3,tj;
330
331             jnr    = jjnr[k];
332             j3     = 3*jnr;
333
334             load3_and_clone(pos,j3,jx,jy,jz);
335
336             dx14     = __fpsub(ix14,jx);
337             dy14     = __fpsub(iy14,jy);
338             dz14     = __fpsub(iz14,jz);
339             dx23     = __fpsub(ix23,jx);
340             dy23     = __fpsub(iy23,jy);
341             dz23     = __fpsub(iz23,jz);
342
343             rsq14    = vdist2(dx14,dy14,dz14);
344             rsq23    = vdist2(dx23,dy23,dz23);
345
346             rinv14   = __fprsqrte(rsq14 );
347             rinv23   = __fprsqrte(rsq23 );
348
349             rinv14   = sqrt_newton(rinv14,rsq14);
350             rinv23   = sqrt_newton(rinv23,rsq23);
351
352 #ifdef GMX_DOUBLE
353
354             rinv14   = sqrt_newton(rinv14,rsq14);
355             rinv23   = sqrt_newton(rinv23,rsq23);
356
357 #endif
358
359             _qq       = _qM * charge[jnr];
360
361 #ifndef NO_FORCE
362             load3_real(faction,j3,fjx,fjy,fjz);
363 #endif
364
365             VDW_INTERACTION_(__creal(rinv14),__creal(rsq14),jnr);
366             fscal = _fscal;
367
368             COUL_INTERACTION_(_qq,__cimag(rinv14),__cimag(rsq14),jnr);
369             fscal = __cmplx(__creal(fscal),_fscal);
370
371             qq        = __fxpmul(qH,charge[jnr]);
372
373 #ifndef NO_FORCE
374
375             tx    = __fpmul(fscal,dx14);
376             ty    = __fpmul(fscal,dy14);
377             tz    = __fpmul(fscal,dz14);
378
379             fix1  = __fpnmsub(fix1,rone,tx);
380             fiy1  = __fpnmsub(fiy1,rone,ty);
381             fiz1  = __fpnmsub(fiz1,rone,tz);
382             fix4  = __fxnmsub(fix4,rone,tx);
383             fiy4  = __fxnmsub(fiy4,rone,ty);
384             fiz4  = __fxnmsub(fiz4,rone,tz);
385
386             fjx   = __fpadd(fjx,tx);
387             fjy   = __fpadd(fjy,ty);
388             fjz   = __fpadd(fjz,tz);
389 #endif
390
391             COUL_INTERACTION(qq,rinv23,rsq23,conv1,jnr,jnr);
392
393 #ifndef NO_FORCE
394
395             tx    = __fpmul(fscal,dx23);
396             ty    = __fpmul(fscal,dy23);
397             tz    = __fpmul(fscal,dz23);
398
399             fix2  = __fpnmsub(fix2,rone,tx);
400             fiy2  = __fpnmsub(fiy2,rone,ty);
401             fiz2  = __fpnmsub(fiz2,rone,tz);
402             fix3  = __fxnmsub(fix3,rone,tx);
403             fiy3  = __fxnmsub(fiy3,rone,ty);
404             fiz3  = __fxnmsub(fiz3,rone,tz);
405
406             fjx   = __fpadd(fjx,tx);
407             fjy   = __fpadd(fjy,ty);
408             fjz   = __fpadd(fjz,tz);
409
410             sum_and_store3(faction,j3,fjx,fjy,fjz);
411 #endif
412         }
413
414         ggid = gid[n];
415
416 #ifndef NO_FORCE
417         
418         sum_and_add3(faction,ii3  ,fix1,fiy1,fiz1);
419         sum_and_add3(faction,ii3+3,fix2,fiy2,fiz2);
420         sum_and_add3(faction,ii3+6,fix3,fiy3,fiz3);
421         sum_and_add3(faction,ii3+9,fix4,fiy4,fiz4);
422
423         fix = __fpadd(__fpadd(fix1,fix4),__fpadd(fix2,fix3));
424         fiy = __fpadd(__fpadd(fiy1,fiy4),__fpadd(fiy2,fiy3));
425         fiz = __fpadd(__fpadd(fiz1,fiz4),__fpadd(fiz2,fiz3));
426
427         sum_and_add3(fshift,is3,fix,fiy,fiz);
428 #endif
429
430 #if COULOMB != COULOMB_NONE
431         Vc[ggid]   += __creal(vctot)   + __cimag(vctot);
432 #endif
433
434 #if VDW != VDW_NONE
435         Vvdw[ggid] += __creal(Vvdwtot)   + __cimag(Vvdwtot);
436 #endif
437     }
438          
439     *outeriter       = nri;            
440     *inneriter       = nj1;            
441 }