1 #include "types/simple.h"
3 #undef FULL_INTERACTION
4 #undef FULL_INTERACTION_
5 #undef COUL_INTERACTION
6 #undef COUL_INTERACTION_
8 #undef VDW_INTERACTION_
12 #define FULL_INTERACTION nfcalc_interaction
13 #define FULL_INTERACTION_ nfcalc_interaction_
14 #define COUL_INTERACTION nfcalc_coul_interaction
15 #define COUL_INTERACTION_ nfcalc_coul_interaction_
16 #define VDW_INTERACTION nfcalc_vdw_interaction
17 #define VDW_INTERACTION_ nfcalc_vdw_interaction_
21 #define FULL_INTERACTION calc_interaction
22 #define FULL_INTERACTION_ calc_interaction_
23 #define COUL_INTERACTION calc_coul_interaction
24 #define COUL_INTERACTION_ calc_coul_interaction_
25 #define VDW_INTERACTION calc_vdw_interaction
26 #define VDW_INTERACTION_ calc_vdw_interaction_
64 double _Complex qM,qH;
65 double _Complex zero = __cmplx(0.0,0.0);
66 double _Complex rone = __cmplx(1.0,0.0);
67 double _Complex three = __cmplx(3.0,3.0);
68 double conv1[2],conv2[2],conv3[2],conv4[2];
70 real _qM,_qH,_facel,_tabscale,_gbtabscale,_krf,_crf;
72 int nri,ntype,nthreads,n,ii,nj0,nj1,nti;
74 #pragma disjoint(*shiftvec,*fshift,*pos,*faction,*charge,*p_facel,*p_krf,*p_crf,*Vc, \
75 *vdwparam,*Vvdw,*p_tabscale,*VFtab,*invsqrta,*dvda,*p_gbtabscale,*GBtab,*work)
79 nthreads = *p_nthreads;
81 #if (COULOMB == COULOMB_TAB || VDW == VDW_TAB)
82 _tabscale = *p_tabscale;
86 #if COULOMB == REACTION_FIELD
93 #if COULOMB == GENERALIZED_BORN
94 _gbtabscale = *p_gbtabscale;
100 _qH = _facel * charge[ii+1];
101 _qM = _facel * charge[ii+3];
102 qM = __cmplx(_qM,_qM);
103 qH = __cmplx(_qH,_qH);
104 #if VDW == BUCKINGHAM
105 nti = 3*ntype*type[ii];
107 nti = 2*ntype*type[ii];
112 for(n=0; (n<nri); n++)
114 double _Complex ix1,ix2,ix3,ix4,iy1,iy2,iy3,iy4,iz1,iz2,iz3,iz4,ix23,iy23,iz23,ix14,iy14,iz14;
115 double _Complex fix,fiy,fiz;
117 // initialize potential energies and forces for this paricle
119 double _Complex vctot = zero;
120 double _Complex Vvdwtot = zero;
121 double _Complex fix1 = zero;
122 double _Complex fix2 = zero;
123 double _Complex fix3 = zero;
124 double _Complex fix4 = zero;
125 double _Complex fiy1 = zero;
126 double _Complex fiy2 = zero;
127 double _Complex fiy3 = zero;
128 double _Complex fiy4 = zero;
129 double _Complex fiz1 = zero;
130 double _Complex fiz2 = zero;
131 double _Complex fiz3 = zero;
132 double _Complex fiz4 = zero;
134 // shift is the position of the n-th water group
136 int is3 = 3*shift[n];
138 // shiftvec is the center of a water group
140 real _shX = shiftvec[is3+0];
141 real _shY = shiftvec[is3+1];
142 real _shZ = shiftvec[is3+2];
148 // add the shift vector to all water atoms
150 real _ix1 = _shX + pos[ii3+0];
151 real _iy1 = _shY + pos[ii3+1];
152 real _iz1 = _shZ + pos[ii3+2];
153 real _ix2 = _shX + pos[ii3+3];
154 real _iy2 = _shY + pos[ii3+4];
155 real _iz2 = _shZ + pos[ii3+5];
156 real _ix3 = _shX + pos[ii3+6];
157 real _iy3 = _shY + pos[ii3+7];
158 real _iz3 = _shZ + pos[ii3+8];
159 real _ix4 = _shX + pos[ii3+9];
160 real _iy4 = _shY + pos[ii3+10];
161 real _iz4 = _shZ + pos[ii3+11];
163 // clone all positions in complex variables
165 ix1 = __cmplx(_ix1,_ix1);
166 iy1 = __cmplx(_iy1,_iy1);
167 iz1 = __cmplx(_iz1,_iz1);
169 ix2 = __cmplx(_ix2,_ix2);
170 iy2 = __cmplx(_iy2,_iy2);
171 iz2 = __cmplx(_iz2,_iz2);
173 ix3 = __cmplx(_ix3,_ix3);
174 iy3 = __cmplx(_iy3,_iy3);
175 iz3 = __cmplx(_iz3,_iz3);
177 ix4 = __cmplx(_ix4,_ix4);
178 iy4 = __cmplx(_iy4,_iy4);
179 iz4 = __cmplx(_iz4,_iz4);
184 // unrolled twice for SIMDization
186 for(k=nj0; (k<nj1-1); k+=2)
188 double _Complex dx11,dx21,dx31,dx41,jx,fjx;
189 double _Complex dy11,dy21,dy31,dy41,jy,fjy;
190 double _Complex dz11,dz21,dz31,dz41,jz,fjz;
191 double _Complex rsq11,rsq21,rsq31,rsq41;
192 double _Complex rinv11,rinv21,rinv31,rinv41;
193 double _Complex rinvsq,krsq,fscal;
194 double _Complex rt,rti,r,n0,eps,Y,F,G,H,GHeps,VV,FF,fijC,fijD,qj,qq;
195 double _Complex c6,c12,cexp1,cexp2,Vvdwexp,Vvdw6,Vvdw12,rinvsix;
198 int jnr1,jnr2,j1,j2,tj1,tj2;
206 load6_and_merge(pos,j1,j2,jx,jy,jz);
208 dx11 = __fpsub(ix1,jx);
209 dy11 = __fpsub(iy1,jy);
210 dz11 = __fpsub(iz1,jz);
211 dx21 = __fpsub(ix2,jx);
212 dy21 = __fpsub(iy2,jy);
213 dz21 = __fpsub(iz2,jz);
214 dx31 = __fpsub(ix3,jx);
215 dy31 = __fpsub(iy3,jy);
216 dz31 = __fpsub(iz3,jz);
217 dx41 = __fpsub(ix4,jx);
218 dy41 = __fpsub(iy4,jy);
219 dz41 = __fpsub(iz4,jz);
221 qj = __cmplx(charge[jnr1],charge[jnr2]);
223 rsq11 = vdist2(dx11,dy11,dz11);
224 rsq21 = vdist2(dx21,dy21,dz21);
225 rsq31 = vdist2(dx31,dy31,dz31);
226 rsq41 = vdist2(dx41,dy41,dz41);
228 rinv11 = __fprsqrte(rsq11);
229 rinv21 = __fprsqrte(rsq21);
230 rinv31 = __fprsqrte(rsq31);
231 rinv41 = __fprsqrte(rsq41);
233 rinv11 = sqrt_newton(rinv11,rsq11);
234 rinv21 = sqrt_newton(rinv21,rsq21);
235 rinv31 = sqrt_newton(rinv31,rsq31);
236 rinv41 = sqrt_newton(rinv41,rsq41);
240 rinv11 = sqrt_newton(rinv11,rsq11);
241 rinv21 = sqrt_newton(rinv21,rsq21);
242 rinv31 = sqrt_newton(rinv31,rsq31);
243 rinv41 = sqrt_newton(rinv41,rsq41);
248 load6_and_merge(faction,j1,j2,fjx,fjy,fjz);
251 VDW_INTERACTION(rinv11,rsq11,conv1,jnr1,jnr2);
256 fix1 = __fpnmsub(fix1,dx11,fscal);
257 fiy1 = __fpnmsub(fiy1,dy11,fscal);
258 fiz1 = __fpnmsub(fiz1,dz11,fscal);
260 fjx = __fpmadd(fjx,dx11,fscal);
261 fjy = __fpmadd(fjy,dy11,fscal);
262 fjz = __fpmadd(fjz,dz11,fscal);
265 COUL_INTERACTION(qq,rinv21,rsq21,conv2,jnr1,jnr2);
268 fix2 = __fpnmsub(fix2,dx21,fscal);
269 fiy2 = __fpnmsub(fiy2,dy21,fscal);
270 fiz2 = __fpnmsub(fiz2,dz21,fscal);
272 fjx = __fpmadd(fjx,dx21,fscal);
273 fjy = __fpmadd(fjy,dy21,fscal);
274 fjz = __fpmadd(fjz,dz21,fscal);
277 COUL_INTERACTION(qq,rinv31,rsq31,conv3,jnr1,jnr2);
282 fix3 = __fpnmsub(fix3,dx31,fscal);
283 fiy3 = __fpnmsub(fiy3,dy31,fscal);
284 fiz3 = __fpnmsub(fiz3,dz31,fscal);
286 fjx = __fpmadd(fjx,dx31,fscal);
287 fjy = __fpmadd(fjy,dy31,fscal);
288 fjz = __fpmadd(fjz,dz31,fscal);
291 COUL_INTERACTION(qq,rinv41,rsq41,conv4,jnr1,jnr2);
294 fix4 = __fpnmsub(fix4,dx41,fscal);
295 fiy4 = __fpnmsub(fiy4,dy41,fscal);
296 fiz4 = __fpnmsub(fiz4,dz41,fscal);
298 fjx = __fpmadd(fjx,dx41,fscal);
299 fjy = __fpmadd(fjy,dy41,fscal);
300 fjz = __fpmadd(fjz,dz41,fscal);
302 split_and_store6(faction,j1,j2,fjx,fjy,fjz);
306 ix14 = __cmplx(_ix1,_ix4);
307 iy14 = __cmplx(_iy1,_iy4);
308 iz14 = __cmplx(_iz1,_iz4);
310 ix23 = __cmplx(_ix2,_ix3);
311 iy23 = __cmplx(_iy2,_iy3);
312 iz23 = __cmplx(_iz2,_iz3);
314 // actually we should not simdize the remainder loop, because it's bit slower
318 double _Complex dx23,dy23,dz23,dx14,dy14,dz14,jx,jy,jz;
319 double _Complex fjx,fjy,fjz,tx,ty,tz;
320 double _Complex rsq23,rinv23,rsq14,rinv14;
321 double _Complex rinvsq,krsq,fscal;
322 double _Complex rt,rti,r,eps,Y,F,G,H,GHeps,VV,FF,fijC,fijD,n0,qq;
324 real _rinvsq,_krsq,_fscal;
325 real _rt,_r,_eps,_Y,_F,_H,_G,_GHeps,_VV,_FF,_fijC,_fijD,_n0,_qq;
326 real _qj,_c6,_c12,_cexp1,_cexp2,_Vvdwexp,_Vvdw6,_Vvdw12,_rinvsix;
334 load3_and_clone(pos,j3,jx,jy,jz);
336 dx14 = __fpsub(ix14,jx);
337 dy14 = __fpsub(iy14,jy);
338 dz14 = __fpsub(iz14,jz);
339 dx23 = __fpsub(ix23,jx);
340 dy23 = __fpsub(iy23,jy);
341 dz23 = __fpsub(iz23,jz);
343 rsq14 = vdist2(dx14,dy14,dz14);
344 rsq23 = vdist2(dx23,dy23,dz23);
346 rinv14 = __fprsqrte(rsq14 );
347 rinv23 = __fprsqrte(rsq23 );
349 rinv14 = sqrt_newton(rinv14,rsq14);
350 rinv23 = sqrt_newton(rinv23,rsq23);
354 rinv14 = sqrt_newton(rinv14,rsq14);
355 rinv23 = sqrt_newton(rinv23,rsq23);
359 _qq = _qM * charge[jnr];
362 load3_real(faction,j3,fjx,fjy,fjz);
365 VDW_INTERACTION_(__creal(rinv14),__creal(rsq14),jnr);
368 COUL_INTERACTION_(_qq,__cimag(rinv14),__cimag(rsq14),jnr);
369 fscal = __cmplx(__creal(fscal),_fscal);
371 qq = __fxpmul(qH,charge[jnr]);
375 tx = __fpmul(fscal,dx14);
376 ty = __fpmul(fscal,dy14);
377 tz = __fpmul(fscal,dz14);
379 fix1 = __fpnmsub(fix1,rone,tx);
380 fiy1 = __fpnmsub(fiy1,rone,ty);
381 fiz1 = __fpnmsub(fiz1,rone,tz);
382 fix4 = __fxnmsub(fix4,rone,tx);
383 fiy4 = __fxnmsub(fiy4,rone,ty);
384 fiz4 = __fxnmsub(fiz4,rone,tz);
386 fjx = __fpadd(fjx,tx);
387 fjy = __fpadd(fjy,ty);
388 fjz = __fpadd(fjz,tz);
391 COUL_INTERACTION(qq,rinv23,rsq23,conv1,jnr,jnr);
395 tx = __fpmul(fscal,dx23);
396 ty = __fpmul(fscal,dy23);
397 tz = __fpmul(fscal,dz23);
399 fix2 = __fpnmsub(fix2,rone,tx);
400 fiy2 = __fpnmsub(fiy2,rone,ty);
401 fiz2 = __fpnmsub(fiz2,rone,tz);
402 fix3 = __fxnmsub(fix3,rone,tx);
403 fiy3 = __fxnmsub(fiy3,rone,ty);
404 fiz3 = __fxnmsub(fiz3,rone,tz);
406 fjx = __fpadd(fjx,tx);
407 fjy = __fpadd(fjy,ty);
408 fjz = __fpadd(fjz,tz);
410 sum_and_store3(faction,j3,fjx,fjy,fjz);
418 sum_and_add3(faction,ii3 ,fix1,fiy1,fiz1);
419 sum_and_add3(faction,ii3+3,fix2,fiy2,fiz2);
420 sum_and_add3(faction,ii3+6,fix3,fiy3,fiz3);
421 sum_and_add3(faction,ii3+9,fix4,fiy4,fiz4);
423 fix = __fpadd(__fpadd(fix1,fix4),__fpadd(fix2,fix3));
424 fiy = __fpadd(__fpadd(fiy1,fiy4),__fpadd(fiy2,fiy3));
425 fiz = __fpadd(__fpadd(fiz1,fiz4),__fpadd(fiz2,fiz3));
427 sum_and_add3(fshift,is3,fix,fiy,fiz);
430 #if COULOMB != COULOMB_NONE
431 Vc[ggid] += __creal(vctot) + __cimag(vctot);
435 Vvdw[ggid] += __creal(Vvdwtot) + __cimag(Vvdwtot);