Merge gromacs-4-6 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_bluegene / nb_kernel_w3_bluegene.h
1 #include "types/simple.h"
2
3 #undef FULL_INTERACTION
4 #undef FULL_INTERACTION_
5 #undef COUL_INTERACTION
6 #undef COUL_INTERACTION_
7
8 #ifdef NO_FORCE
9
10  #define FULL_INTERACTION  nfcalc_interaction
11  #define FULL_INTERACTION_ nfcalc_interaction_
12  #define COUL_INTERACTION  nfcalc_coul_interaction
13  #define COUL_INTERACTION_ nfcalc_coul_interaction_
14
15 #else
16
17  #define FULL_INTERACTION  calc_interaction
18  #define FULL_INTERACTION_ calc_interaction_
19  #define COUL_INTERACTION  calc_coul_interaction
20  #define COUL_INTERACTION_ calc_coul_interaction_
21
22 #endif
23
24
25 void NB_KERNEL (
26                     int *    p_nri,
27                     int *    iinr,
28                     int *    jindex,
29                     int *    jjnr,
30                     int *    shift,
31                     real *   shiftvec,
32                     real *   fshift,
33                     int *    gid,
34                     real *   pos,
35                     real *   faction,
36                     real *   charge,
37                     real *   p_facel,
38                     real *   p_krf,
39                     real *   p_crf,
40                     real *   Vc,
41                     int *    type,
42                     int *    p_ntype,
43                     real *   vdwparam,
44                     real *   Vvdw,
45                     real *   p_tabscale,
46                     real *   VFtab,
47                     real *   invsqrta,
48                     real *   dvda,
49                     real *   p_gbtabscale,
50                     real *   GBtab,
51                     int *    p_nthreads,
52                     int *    count,
53                     void *   mtx,
54                     int *    outeriter,
55                     int *    inneriter,
56                     real *   work)
57 {
58     double _Complex qO,qH;
59     double _Complex zero  = __cmplx(0.0,0.0);
60     double _Complex rone  = __cmplx(1.0,0.0);
61     double _Complex three = __cmplx(3.0,3.0);
62     double conv1[2],conv2[2],conv3[2];
63
64     real            _qO,_qH,_facel,_tabscale,_gbtabscale,_krf,_crf;
65
66     int             nri,ntype,nthreads,n,ii,nj0,nj1,nti;
67
68 #pragma disjoint(*shiftvec,*fshift,*pos,*faction,*charge,*p_facel,*p_krf,*p_crf,*Vc, \
69                  *vdwparam,*Vvdw,*p_tabscale,*VFtab,*invsqrta,*dvda,*p_gbtabscale,*GBtab,*work)
70
71     nri              = *p_nri;         
72     ntype            = *p_ntype;       
73     nthreads         = *p_nthreads;    
74     _facel           = *p_facel;       
75 #if (COULOMB == COULOMB_TAB || VDW == VDW_TAB)
76     _tabscale        = *p_tabscale;
77 #else
78     _tabscale        = 0.0;
79 #endif
80 #if COULOMB == REACTION_FIELD
81     _krf             = *p_krf;
82     _crf             = *p_crf;
83 #else
84     _krf             = 0.0;
85     _crf             = 0.0;
86 #endif
87 #if COULOMB == GENERALIZED_BORN
88     _gbtabscale      = *p_gbtabscale;
89 #else
90     _gbtabscale      = 0.0;
91 #endif
92     ii               = iinr[0];        
93
94     _qO              = _facel * charge[ii];     
95     _qH              = _facel * charge[ii+1];   
96     qO               = __cmplx(_qO,_qO);
97     qH               = __cmplx(_qH,_qH);
98 #if VDW == BUCKINGHAM
99     nti              = 3*ntype*type[ii];
100 #else
101     nti              = 2*ntype*type[ii];
102 #endif
103     
104     nj1 = 0;
105
106     for(n=0; (n<nri); n++)
107     {
108         double _Complex ix1,ix2,ix3,iy1,iy2,iy3,iz1,iz2,iz3,ix23,iy23,iz23,fix,fiy,fiz;
109         
110         // initialize potential energies and forces for this paricle
111
112         double _Complex vctot     = zero;              
113         double _Complex Vvdwtot   = zero;              
114         double _Complex fix1      = zero;
115         double _Complex fix2      = zero;
116         double _Complex fix3      = zero;
117         double _Complex fiy1      = zero;
118         double _Complex fiy2      = zero;
119         double _Complex fiy3      = zero;
120         double _Complex fiz1      = zero;
121         double _Complex fiz2      = zero;
122         double _Complex fiz3      = zero;
123
124         // shift is the position of the n-th water group
125
126         int is3  = 3*shift[n];
127         
128         // shiftvec is the center of a water group
129
130         real  _shX  = shiftvec[is3+0];  
131         real  _shY  = shiftvec[is3+1];
132         real  _shZ  = shiftvec[is3+2];
133
134         int ii  = iinr[n];        
135         int ii3 = 3*ii;
136         int k,ggid;
137
138         // add the shift vector to all water atoms
139
140         real  _ix1 = _shX + pos[ii3+0];
141         real  _iy1 = _shY + pos[ii3+1];
142         real  _iz1 = _shZ + pos[ii3+2];
143         real  _ix2 = _shX + pos[ii3+3];
144         real  _iy2 = _shY + pos[ii3+4];
145         real  _iz2 = _shZ + pos[ii3+5];
146         real  _ix3 = _shX + pos[ii3+6];
147         real  _iy3 = _shY + pos[ii3+7];
148         real  _iz3 = _shZ + pos[ii3+8];
149
150         // clone all positions in complex variables
151         
152         ix1 = __cmplx(_ix1,_ix1);
153         iy1 = __cmplx(_iy1,_iy1);
154         iz1 = __cmplx(_iz1,_iz1);
155               
156         ix2 = __cmplx(_ix2,_ix2);
157         iy2 = __cmplx(_iy2,_iy2);
158         iz2 = __cmplx(_iz2,_iz2);
159
160         ix3 = __cmplx(_ix3,_ix3);
161         iy3 = __cmplx(_iy3,_iy3);
162         iz3 = __cmplx(_iz3,_iz3);
163
164         nj0 = jindex[n];
165         nj1 = jindex[n+1];
166
167         // unrolled twice for SIMDization
168
169         for(k=nj0; (k<nj1-1); k+=2)
170         {
171             double _Complex dx11,dx21,dx31,jx,fjx;
172             double _Complex dy11,dy21,dy31,jy,fjy;
173             double _Complex dz11,dz21,dz31,jz,fjz;
174             double _Complex rsq11,rsq21,rsq31;
175             double _Complex rinv11,rinv21,rinv31;
176             double _Complex rinvsq,krsq,fscal;
177             double _Complex rt,rti,r,n0,eps,Y,F,G,H,GHeps,VV,FF,fijC,fijD,qj,qq;
178             double _Complex c6,c12,cexp1,cexp2,Vvdwexp,Vvdw6,Vvdw12,rinvsix;
179
180             int nnn1,nnn2;
181             int jnr1,jnr2,j1,j2,tj1,tj2;
182
183             jnr1  = jjnr[k];
184             jnr2  = jjnr[k+1];
185
186             j1    = 3*jnr1;        
187             j2    = 3*jnr2;
188
189             load6_and_merge(pos,j1,j2,jx,jy,jz);
190
191             dx11   = __fpsub(ix1,jx);
192             dy11   = __fpsub(iy1,jy);
193             dz11   = __fpsub(iz1,jz);
194             dx21   = __fpsub(ix2,jx);
195             dy21   = __fpsub(iy2,jy);
196             dz21   = __fpsub(iz2,jz);
197             dx31   = __fpsub(ix3,jx);
198             dy31   = __fpsub(iy3,jy);
199             dz31   = __fpsub(iz3,jz);
200
201             qj = __cmplx(charge[jnr1],charge[jnr2]);
202
203             rsq11  = vdist2(dx11,dy11,dz11);
204             rsq21  = vdist2(dx21,dy21,dz21);
205             rsq31  = vdist2(dx31,dy31,dz31);
206
207             rinv11 = __fprsqrte(rsq11);
208             rinv21 = __fprsqrte(rsq21);
209             rinv31 = __fprsqrte(rsq31);
210
211             rinv11 = sqrt_newton(rinv11,rsq11);
212             rinv21 = sqrt_newton(rinv21,rsq21);
213             rinv31 = sqrt_newton(rinv31,rsq31);
214
215 #ifdef GMX_DOUBLE
216
217             rinv11 = sqrt_newton(rinv11,rsq11);
218             rinv21 = sqrt_newton(rinv21,rsq21);
219             rinv31 = sqrt_newton(rinv31,rsq31);
220
221 #endif
222             qq = __fpmul(qO,qj);
223
224 #ifndef NO_FORCE
225             load6_and_merge(faction,j1,j2,fjx,fjy,fjz);
226 #endif
227
228             FULL_INTERACTION(qq,rinv11,rsq11,conv1,jnr1,jnr2);
229
230             qq = __fpmul(qH,qj);
231
232 #ifndef NO_FORCE
233             fix1  = __fpnmsub(fix1,dx11,fscal);
234             fiy1  = __fpnmsub(fiy1,dy11,fscal);      
235             fiz1  = __fpnmsub(fiz1,dz11,fscal);
236
237             fjx   = __fpmadd(fjx,dx11,fscal);
238             fjy   = __fpmadd(fjy,dy11,fscal);
239             fjz   = __fpmadd(fjz,dz11,fscal);
240 #endif
241
242             COUL_INTERACTION(qq,rinv21,rsq21,conv2,jnr1,jnr2);
243
244 #ifndef NO_FORCE
245             fix2  = __fpnmsub(fix2,dx21,fscal);
246             fiy2  = __fpnmsub(fiy2,dy21,fscal);      
247             fiz2  = __fpnmsub(fiz2,dz21,fscal);
248
249             fjx   = __fpmadd(fjx,dx21,fscal);
250             fjy   = __fpmadd(fjy,dy21,fscal);
251             fjz   = __fpmadd(fjz,dz21,fscal);
252 #endif
253
254             COUL_INTERACTION(qq,rinv31,rsq31,conv3,jnr1,jnr2);
255
256 #ifndef NO_FORCE
257             fix3  = __fpnmsub(fix3,dx31,fscal);
258             fiy3  = __fpnmsub(fiy3,dy31,fscal);      
259             fiz3  = __fpnmsub(fiz3,dz31,fscal);
260
261             fjx   = __fpmadd(fjx,dx31,fscal);
262             fjy   = __fpmadd(fjy,dy31,fscal);
263             fjz   = __fpmadd(fjz,dz31,fscal);
264
265             split_and_store6(faction,j1,j2,fjx,fjy,fjz);
266 #endif
267         }
268
269         ix23 = __cmplx(_ix2,_ix3);
270         iy23 = __cmplx(_iy2,_iy3);
271         iz23 = __cmplx(_iz2,_iz3);
272
273
274         // actually we should not simdize the remainder loop, because it's slower
275
276         for(;(k<nj1); k++)
277         {
278             double _Complex dx23,dy23,dz23,tx,ty,tz,jx,jy,jz;
279             double _Complex fjx,fjy,fjz;
280             double _Complex rsq23,rinv23;
281             double _Complex rinvsq,krsq,fscal;
282             double _Complex rt,rti,r,eps,Y,F,G,H,GHeps,VV,FF,fijC,fijD,n0,qq;
283             
284             real _dx11,_dy11,_dz11,_rsq11,_rinv11;
285             real _rinvsq,_krsq,_fscal;
286             real _rt,_r,_eps,_Y,_F,_H,_G,_GHeps,_VV,_FF,_fijC,_fijD,_n0,_qq;
287             real _qj,_c6,_c12,_cexp1,_cexp2,_Vvdwexp,_Vvdw6,_Vvdw12,_rinvsix;
288
289             int nnn1,nnn2;
290             int jnr,j3,tj;
291
292             jnr    = jjnr[k];
293             j3     = 3*jnr;
294
295             load3_and_clone(pos,j3,jx,jy,jz);
296
297             _dx11    = __creal(ix1) - __creal(jx);
298             _dy11    = __creal(iy1) - __creal(jy);
299             _dz11    = __creal(iz1) - __creal(jz);
300             dx23     = __fpsub(ix23,jx);
301             dy23     = __fpsub(iy23,jy);
302             dz23     = __fpsub(iz23,jz);
303
304             _rsq11   = _dx11 * _dx11 + _dy11 * _dy11 + _dz11 * _dz11;
305             rsq23    = vdist2(dx23,dy23,dz23);
306
307             _rinv11  = __frsqrte(_rsq11);
308             rinv23   = __fprsqrte(rsq23 );
309
310             _rinv11  = sqrt_newton_scalar(_rinv11,_rsq11);
311             rinv23   = sqrt_newton(rinv23,rsq23);
312
313 #ifdef GMX_DOUBLE
314
315             _rinv11  = sqrt_newton_scalar(_rinv11,_rsq11);
316             rinv23   = sqrt_newton(rinv23,rsq23);
317
318 #endif
319
320             _qq       = _qO * charge[jnr];
321
322 #ifndef NO_FORCE
323             load3_real(faction,j3,fjx,fjy,fjz);
324 #endif
325
326             FULL_INTERACTION_(_qq,_rinv11,_rsq11,jnr);
327
328             qq        = __fxpmul(qH,charge[jnr]);
329
330 #ifndef NO_FORCE
331             fix1     -= _fscal * _dx11;
332             fiy1     -= _fscal * _dy11;
333             fiz1     -= _fscal * _dz11;
334
335             fjx      += _fscal * _dx11;
336             fjy      += _fscal * _dy11;
337             fjz      += _fscal * _dz11;
338 #endif
339
340             COUL_INTERACTION(qq,rinv23,rsq23,conv1,jnr,jnr);
341
342 #ifndef NO_FORCE
343             tx    = __fpmul(fscal,dx23);
344             ty    = __fpmul(fscal,dy23);
345             tz    = __fpmul(fscal,dz23);
346
347             fix2  = __fpnmsub(fix2,rone,tx);
348             fiy2  = __fpnmsub(fiy2,rone,ty);
349             fiz2  = __fpnmsub(fiz2,rone,tz);
350             fix3  = __fxnmsub(fix3,rone,tx);
351             fiy3  = __fxnmsub(fiy3,rone,ty);
352             fiz3  = __fxnmsub(fiz3,rone,tz);
353
354             fjx   = __fpadd(fjx,tx);
355             fjy   = __fpadd(fjy,ty);
356             fjz   = __fpadd(fjz,tz);
357
358             sum_and_store3(faction,j3,fjx,fjy,fjz);
359 #endif
360         }
361
362         ggid = gid[n];
363
364 #ifndef NO_FORCE
365         sum_and_add3(faction,ii3  ,fix1,fiy1,fiz1);
366         sum_and_add3(faction,ii3+3,fix2,fiy2,fiz2);
367         sum_and_add3(faction,ii3+6,fix3,fiy3,fiz3);
368
369         fix = __fpadd(fix1,__fpadd(fix2,fix3));
370         fiy = __fpadd(fiy1,__fpadd(fiy2,fiy3));
371         fiz = __fpadd(fiz1,__fpadd(fiz2,fiz3));
372
373         sum_and_add3(fshift,is3,fix,fiy,fiz);
374 #endif
375
376 #if COULOMB != COULOMB_NONE
377         Vc[ggid]   += __creal(vctot)   + __cimag(vctot);
378 #endif
379
380 #if VDW != VDW_NONE
381         Vvdw[ggid] += __creal(Vvdwtot)   + __cimag(Vvdwtot);
382 #endif
383     }
384          
385     *outeriter       = nri;            
386     *inneriter       = nj1;            
387 }