Merge gromacs-4-6 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_bluegene / nb_kernel_gen_bluegene.h
1 #include "types/simple.h"
2
3 #undef FULL_INTERACTION
4 #undef FULL_INTERACTION_
5 #undef COUL_INTERACTION
6 #undef COUL_INTERACTION_
7
8 #ifdef NO_FORCE
9
10  #define FULL_INTERACTION  nfcalc_interaction
11  #define FULL_INTERACTION_ nfcalc_interaction_
12  #define COUL_INTERACTION  nfcalc_coul_interaction
13  #define COUL_INTERACTION_ nfcalc_coul_interaction_
14
15 #else
16
17  #define FULL_INTERACTION  calc_interaction
18  #define FULL_INTERACTION_ calc_interaction_
19  #define COUL_INTERACTION  calc_coul_interaction
20  #define COUL_INTERACTION_ calc_coul_interaction_
21
22 #endif
23
24 void NB_KERNEL (
25                     int *    p_nri,
26                     int *    iinr,
27                     int *    jindex,
28                     int *    jjnr,
29                     int *    shift,
30                     real *   shiftvec,
31                     real *   fshift,
32                     int *    gid,
33                     real *   pos,
34                     real *   faction,
35                     real *   charge,
36                     real *   p_facel,
37                     real *   p_krf,
38                     real *   p_crf,
39                     real *   Vc,
40                     int *    type,
41                     int *    p_ntype,
42                     real *   vdwparam,
43                     real *   Vvdw,
44                     real *   p_tabscale,
45                     real *   VFtab,
46                     real *   invsqrta,
47                     real *   dvda,
48                     real *   p_gbtabscale,
49                     real *   GBtab,
50                     int *    p_nthreads,
51                     int *    count,
52                     void *   mtx,
53                     int *    outeriter,
54                     int *    inneriter,
55                     real *   work)
56 {
57     double _Complex zero  = __cmplx(0.0,0.0);
58     double _Complex rone  = __cmplx(1.0,0.0);
59     double _Complex three = __cmplx(3.0,3.0);
60     double conv1[2],conv2[2],conv3[2];
61
62     real   _facel,_tabscale,_gbtabscale,_krf,_crf;
63     int    nri,ntype,nthreads,n,ii,nj0,nj1;
64
65 #pragma disjoint(*shiftvec,*fshift,*pos,*faction,*charge,*p_facel,*p_krf,*p_crf,*Vc, \
66                  *vdwparam,*Vvdw,*p_tabscale,*VFtab,*invsqrta,*dvda,*p_gbtabscale,*GBtab,*work)
67
68     nri              = *p_nri;         
69     ntype            = *p_ntype;       
70     nthreads         = *p_nthreads;    
71     _facel           = *p_facel;       
72 #if (COULOMB == COULOMB_TAB || VDW == VDW_TAB)
73     _tabscale        = *p_tabscale;
74 #else
75     _tabscale        = 0.0;
76 #endif
77 #if COULOMB == REACTION_FIELD
78     _krf             = *p_krf;
79     _crf             = *p_crf;
80 #else
81     _krf             = 0.0;
82     _crf             = 0.0;
83 #endif
84 #if COULOMB == GENERALIZED_BORN
85     _gbtabscale      = *p_gbtabscale;
86 #else
87     _gbtabscale      = 0.0;
88 #endif
89     nj1              = 0;              
90
91     for(n=0; (n<nri); n++)
92     {
93         double _Complex ix,iy,iz;
94         
95         // initialize potential energies and forces for this paricle
96
97         double _Complex vctot    = zero;              
98         double _Complex Vvdwtot  = zero;
99         double _Complex dvdasum  = zero;
100         double _Complex fix      = zero;
101         double _Complex fiy      = zero;
102         double _Complex fiz      = zero;
103
104         // shift is the position of the n-th water group
105
106         int is3  = 3*shift[n];
107         
108         // shiftvec is the center of a water group
109
110         real _shX  = shiftvec[is3];  
111         real _shY  = shiftvec[is3+1];
112         real _shZ  = shiftvec[is3+2];
113
114         int ii  = iinr[n];        
115         int ii3 = 3*ii;
116 #if VDW == BUCKINGHAM
117         int nti = 3*ntype*type[ii];
118 #else
119         int nti = 2*ntype*type[ii];
120 #endif
121         int k,ggid;
122
123         real _iq   = _facel * charge[ii];
124 #if COULOMB == GENERALIZED_BORN
125         real _isai = invsqrta[ii];
126 #else
127         real _isai = 0.0;
128 #endif
129
130         // add the shift vector to all water atoms
131
132         real _ix = _shX + pos[ii3+0];
133         real _iy = _shY + pos[ii3+1];
134         real _iz = _shZ + pos[ii3+2];
135         
136         ix = __cmplx(_ix,_ix);
137         iy = __cmplx(_iy,_iy);
138         iz = __cmplx(_iz,_iz);
139
140         nj0 = jindex[n];      
141         nj1 = jindex[n+1];    
142
143         /* unrolled 6 times : unrolled twice for SIMDization and three times to hide dependencies */
144
145         for(k=nj0; (k<nj1-5); k+=6)
146         {
147             double _Complex dx1,dy1,dz1,fjx1,fjy1,fjz1;
148             double _Complex rsq1,rinv1;
149             double _Complex dx2,dy2,dz2,fjx2,fjy2,fjz2;
150             double _Complex rsq2,rinv2;
151             double _Complex dx3,dy3,dz3,fjx3,fjy3,fjz3;
152             double _Complex rsq3,rinv3;
153
154             double _Complex rinvsq,krsq,fscal;
155             double _Complex rt,rti,r,eps,Y,F,G,H,GHeps,VV,FF,fijC,fijD,n0,qq;
156             double _Complex isaprod,isaj,iqq,gbscale,dvdaj,vcoul,dvdatmp;
157             double _Complex c6,c12,cexp1,cexp2,rinvsix,Vvdw6,Vvdw12,Vvdwexp;
158
159             int nnn1,nnn2;
160             int jnr11,jnr21,jnr12,jnr22,jnr13,jnr23,j11,j21,j12,j22,j13,j23,tj1,tj2;
161
162             jnr11  = jjnr[k+0];
163             jnr21  = jjnr[k+1];
164             jnr12  = jjnr[k+2];
165             jnr22  = jjnr[k+3];
166             jnr13  = jjnr[k+4];
167             jnr23  = jjnr[k+5];
168
169             j11    = 3*jnr11;        
170             j21    = 3*jnr21;
171             j12    = 3*jnr12;        
172             j22    = 3*jnr22;
173             j13    = 3*jnr13;        
174             j23    = 3*jnr23;
175
176             load6_and_merge(pos,j11,j21,dx1,dy1,dz1);
177             load6_and_merge(pos,j12,j22,dx2,dy2,dz2);
178             load6_and_merge(pos,j13,j23,dx3,dy3,dz3);
179
180             dx1   = __fpsub(ix,dx1);
181             dy1   = __fpsub(iy,dy1);
182             dz1   = __fpsub(iz,dz1);
183
184             dx2   = __fpsub(ix,dx2);
185             dy2   = __fpsub(iy,dy2);
186             dz2   = __fpsub(iz,dz2);
187
188             dx3   = __fpsub(ix,dx3);
189             dy3   = __fpsub(iy,dy3);
190             dz3   = __fpsub(iz,dz3);
191
192 #if COULOMB != COULOMB_NONE
193             qq   = __fxpmul(__cmplx(charge[jnr11],charge[jnr21]),_iq);
194
195 #if COULOMB == GENERALIZED_BORN
196             dvdaj = __cmplx(dvda[jnr11],dvda[jnr21]);
197 #endif
198 #endif
199
200             rsq1  = vdist2(dx1,dy1,dz1);
201             rsq2  = vdist2(dx2,dy2,dz2);
202             rsq3  = vdist2(dx3,dy3,dz3);
203
204             rinv1 = __fprsqrte(rsq1);
205             rinv2 = __fprsqrte(rsq2);
206             rinv3 = __fprsqrte(rsq3);
207
208             rinv1 = sqrt_newton(rinv1,rsq1);
209             rinv2 = sqrt_newton(rinv2,rsq2);
210             rinv3 = sqrt_newton(rinv3,rsq3);
211
212 #ifdef GMX_DOUBLE
213
214             rinv1 = sqrt_newton(rinv1,rsq1);
215             rinv2 = sqrt_newton(rinv2,rsq2);
216             rinv3 = sqrt_newton(rinv3,rsq3);
217
218 #endif
219
220 #ifndef NO_FORCE
221             load6_and_merge(faction,j11,j21,fjx1,fjy1,fjz1);
222 #endif
223             
224             FULL_INTERACTION(qq,rinv1,rsq1,conv1,jnr11,jnr21);
225
226 #if COULOMB != COULOMB_NONE
227             qq    = __fxpmul(__cmplx(charge[jnr12],charge[jnr22]),_iq);
228
229 #if COULOMB == GENERALIZED_BORN
230             dvda[jnr11] -= __creal(dvdaj);
231             dvda[jnr21] -= __cimag(dvdaj);
232
233             dvdaj = __cmplx(dvda[jnr12],dvda[jnr22]);
234 #endif
235 #endif
236
237 #ifndef NO_FORCE
238             fix   = __fpnmsub(fix,dx1,fscal);
239             fiy   = __fpnmsub(fiy,dy1,fscal);      
240             fiz   = __fpnmsub(fiz,dz1,fscal);
241
242             fjx1  = __fpmadd(fjx1,dx1,fscal);
243             fjy1  = __fpmadd(fjy1,dy1,fscal);
244             fjz1  = __fpmadd(fjz1,dz1,fscal);
245
246             load6_and_merge(faction,j12,j22,fjx2,fjy2,fjz2);
247 #endif
248
249             FULL_INTERACTION(qq,rinv2,rsq2,conv2,jnr12,jnr22);
250
251 #if COULOMB != COULOMB_NONE
252                 qq    = __fxpmul(__cmplx(charge[jnr13],charge[jnr23]),_iq);
253                          
254 #if COULOMB == GENERALIZED_BORN
255             dvda[jnr12] -= __creal(dvdaj);
256             dvda[jnr22] -= __cimag(dvdaj);
257
258             dvdaj = __cmplx(dvda[jnr13],dvda[jnr23]);
259 #endif
260 #endif
261                         
262 #ifndef NO_FORCE
263             split_and_store6(faction,j11,j21,fjx1,fjy1,fjz1);
264
265             fix   = __fpnmsub(fix,dx2,fscal);
266             fiy   = __fpnmsub(fiy,dy2,fscal);      
267             fiz   = __fpnmsub(fiz,dz2,fscal);
268
269             fjx2  = __fpmadd(fjx2,dx2,fscal);
270             fjy2  = __fpmadd(fjy2,dy2,fscal);
271             fjz2  = __fpmadd(fjz2,dz2,fscal);
272
273             load6_and_merge(faction,j13,j23,fjx3,fjy3,fjz3);
274 #endif
275
276             FULL_INTERACTION(qq,rinv3,rsq3,conv3,jnr13,jnr23);
277
278 #if COULOMB == GENERALIZED_BORN
279             dvda[jnr13] -= __creal(dvdaj);
280             dvda[jnr23] -= __cimag(dvdaj);
281 #endif
282
283 #ifndef NO_FORCE
284             split_and_store6(faction,j12,j22,fjx2,fjy2,fjz2);
285
286             fix   = __fpnmsub(fix,dx3,fscal);
287             fiy   = __fpnmsub(fiy,dy3,fscal);      
288             fiz   = __fpnmsub(fiz,dz3,fscal);
289
290             fjx3  = __fpmadd(fjx3,dx3,fscal);
291             fjy3  = __fpmadd(fjy3,dy3,fscal);
292             fjz3  = __fpmadd(fjz3,dz3,fscal);
293
294             split_and_store6(faction,j13,j23,fjx3,fjy3,fjz3);
295 #endif
296         }
297
298         for(;(k<nj1); k++)
299         {
300             real _dx,_dy,_dz,_rsq,_rinv;
301             real _rinvsq,_krsq,_fscal;
302             real _r,_rt,_eps,_Y,_F,_G,_H,_GHeps,_VV,_FF,_fijC,_fijD,_n0,_qq;
303             real _isaprod,_isaj,_iqq,_dvdatmp,_vcoul,_gbscale;
304             real _c6,_c12,_cexp1,_cexp2,_rinvsix,_Vvdw6,_Vvdw12,_Vvdwexp;
305
306             int nnn1;
307             int jnr,j3,tj;
308
309             jnr    = jjnr[k];        
310             j3     = 3*jnr;          
311
312             _dx    = __creal(ix) - pos[j3+0];
313             _dy    = __creal(iy) - pos[j3+1];
314             _dz    = __creal(iz) - pos[j3+2];
315
316             _qq    = _iq * charge[jnr];
317
318             _rsq   = _dx*_dx + _dy*_dy + _dz*_dz;
319
320             _rinv  = __frsqrte(_rsq);
321
322             _rinv  = sqrt_newton_scalar(_rinv,_rsq);
323
324 #ifdef GMX_DOUBLE
325             
326             _rinv  = sqrt_newton_scalar(_rinv,_rsq);
327
328 #endif
329             FULL_INTERACTION_(_qq,_rinv,_rsq,jnr);
330
331 #ifndef NO_FORCE
332             fix             -= _fscal*_dx; 
333             fiy             -= _fscal*_dy; 
334             fiz             -= _fscal*_dz; 
335             faction[j3+0]   += _fscal*_dx;
336             faction[j3+1]   += _fscal*_dy;
337             faction[j3+2]   += _fscal*_dz;
338 #endif
339         }
340         
341         ggid = gid[n];
342
343 #ifndef NO_FORCE
344         sum_and_add3(faction,ii3,fix,fiy,fiz);
345         sum_and_add3(fshift ,is3,fix,fiy,fiz);
346 #endif
347
348 #if COULOMB != COULOMB_NONE
349         Vc[ggid]    += __creal(vctot)   + __cimag(vctot);
350 #if COULOMB == GENERALIZED_BORN
351         dvda[ii]    += __creal(dvdasum) + __cimag(dvdasum);
352 #endif
353 #endif
354
355 #if VDW != VDW_NONE
356         Vvdw[ggid]  += __creal(Vvdwtot) + __cimag(Vvdwtot);
357 #endif
358     }
359
360     *outeriter  = nri;            
361     *inneriter  = nj1;         
362 }
363