2 * This source code is part of
6 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
7 * Copyright (c) 2001-2009, The GROMACS Development Team
9 * Gromacs is a library for molecular simulation and trajectory analysis,
10 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
11 * a full list of developers and information, check out http://www.gromacs.org
13 * This program is free software; you can redistribute it and/or modify it under
14 * the terms of the GNU Lesser General Public License as published by the Free
15 * Software Foundation; either version 2 of the License, or (at your option) any
17 * As a special exception, you may use this file as part of a free software
18 * library without restriction. Specifically, if other files instantiate
19 * templates or use macros or inline functions from this file, or you compile
20 * this file and link it with other files to produce an executable, this
21 * file does not by itself cause the resulting executable to be covered by
22 * the GNU Lesser General Public License.
24 * In plain-speak: do not worry about classes/macros/templates either - only
25 * changes to the library have to be LGPL, not an application linking with it.
27 * To help fund GROMACS development, we humbly ask that you cite
28 * the papers people have written on it - you can find them on the website!
36 #include <emmintrin.h>
39 /* #include "sse_common_single.h" */
42 * Gromacs nonbonded kernel nb_kernel332
43 * Coulomb interaction: Tabulated
44 * VdW interaction: Tabulated
45 * water optimization: pairs of SPC/TIP3P interactions
46 * Calculate forces: yes
48 void nb_kernel332_sse2_single(
81 int nri,ntype,nthreads;
82 float facel,krf,crf,tabscale,gbtabscale;
83 int n,ii,is3,ii3,k,nj0,nj1,jnr,j3,ggid;
84 int nn0,nn1,nouter,ninner;
93 float Y,F,Geps,Heps2,Fp,VV;
97 float ix1,iy1,iz1,fix1,fiy1,fiz1;
98 float ix2,iy2,iz2,fix2,fiy2,fiz2;
99 float ix3,iy3,iz3,fix3,fiy3,fiz3;
100 float jx1,jy1,jz1,fjx1,fjy1,fjz1;
101 float jx2,jy2,jz2,fjx2,fjy2,fjz2;
102 float jx3,jy3,jz3,fjx3,fjy3,fjz3;
103 float dx11,dy11,dz11,rsq11,rinv11;
104 float dx12,dy12,dz12,rsq12,rinv12;
105 float dx13,dy13,dz13,rsq13,rinv13;
106 float dx21,dy21,dz21,rsq21,rinv21;
107 float dx22,dy22,dz22,rsq22,rinv22;
108 float dx23,dy23,dz23,rsq23,rinv23;
109 float dx31,dy31,dz31,rsq31,rinv31;
110 float dx32,dy32,dz32,rsq32,rinv32;
111 float dx33,dy33,dz33,rsq33,rinv33;
112 float qO,qH,qqOO,qqOH,qqHH;
117 nthreads = *p_nthreads;
121 tabscale = *p_tabscale;
123 /* Initialize water data */
130 tj = 2*(ntype+1)*type[ii];
132 c12 = vdwparam[tj+1];
135 /* Reset outer and inner iteration counters */
139 /* Loop over thread workunits */
143 #ifdef GMX_THREAD_SHM_FDECOMP
144 tMPI_Thread_mutex_lock((tMPI_Thread_mutex_t *)mtx);
147 /* Take successively smaller chunks (at least 10 lists) */
148 nn1 = nn0+(nri-nn0)/(2*nthreads)+10;
150 tMPI_Thread_mutex_unlock((tMPI_Thread_mutex_t *)mtx);
156 /* Start outer loop over neighborlists */
158 for(n=nn0; (n<nn1); n++)
161 /* Load shift vector for this list */
164 shY = shiftvec[is3+1];
165 shZ = shiftvec[is3+2];
167 /* Load limits for loop over neighbors */
171 /* Get outer coordinate index */
175 /* Load i atom data, add shift vector */
176 ix1 = shX + pos[ii3+0];
177 iy1 = shY + pos[ii3+1];
178 iz1 = shZ + pos[ii3+2];
179 ix2 = shX + pos[ii3+3];
180 iy2 = shY + pos[ii3+4];
181 iz2 = shZ + pos[ii3+5];
182 ix3 = shX + pos[ii3+6];
183 iy3 = shY + pos[ii3+7];
184 iz3 = shZ + pos[ii3+8];
186 /* Zero the potential energy for this list */
190 /* Clear i atom forces */
201 for(k=nj0; (k<nj1); k++)
204 /* Get j neighbor index, and coordinate index */
208 /* load j atom coordinates */
219 /* Calculate distance */
223 rsq11 = dx11*dx11+dy11*dy11+dz11*dz11;
227 rsq12 = dx12*dx12+dy12*dy12+dz12*dz12;
231 rsq13 = dx13*dx13+dy13*dy13+dz13*dz13;
235 rsq21 = dx21*dx21+dy21*dy21+dz21*dz21;
239 rsq22 = dx22*dx22+dy22*dy22+dz22*dz22;
243 rsq23 = dx23*dx23+dy23*dy23+dz23*dz23;
247 rsq31 = dx31*dx31+dy31*dy31+dz31*dz31;
251 rsq32 = dx32*dx32+dy32*dy32+dz32*dz32;
255 rsq33 = dx33*dx33+dy33*dy33+dz33*dz33;
257 /* Calculate 1/r and 1/r2 */
258 rinv11 = 1.0/sqrt(rsq11);
259 rinv12 = 1.0/sqrt(rsq12);
260 rinv13 = 1.0/sqrt(rsq13);
261 rinv21 = 1.0/sqrt(rsq21);
262 rinv22 = 1.0/sqrt(rsq22);
263 rinv23 = 1.0/sqrt(rsq23);
264 rinv31 = 1.0/sqrt(rsq31);
265 rinv32 = 1.0/sqrt(rsq32);
266 rinv33 = 1.0/sqrt(rsq33);
268 /* Load parameters for j atom */
271 /* Calculate table index */
274 /* Calculate table index */
281 /* Tabulated coulomb interaction */
284 Geps = eps*VFtab[nnn+2];
285 Heps2 = eps2*VFtab[nnn+3];
288 FF = Fp+Geps+2.0*Heps2;
291 vctot = vctot + vcoul;
293 /* Tabulated VdW interaction - dispersion */
297 Geps = eps*VFtab[nnn+2];
298 Heps2 = eps2*VFtab[nnn+3];
301 FF = Fp+Geps+2.0*Heps2;
305 /* Tabulated VdW interaction - repulsion */
309 Geps = eps*VFtab[nnn+2];
310 Heps2 = eps2*VFtab[nnn+3];
313 FF = Fp+Geps+2.0*Heps2;
316 Vvdwtot = Vvdwtot+ Vvdw6 + Vvdw12;
317 fscal = -((fijC+fijD+fijR)*tabscale)*rinv11;
319 /* Calculate temporary vectorial force */
324 /* Increment i atom force */
329 /* Decrement j atom force */
330 fjx1 = faction[j3+0] - tx;
331 fjy1 = faction[j3+1] - ty;
332 fjz1 = faction[j3+2] - tz;
334 /* Load parameters for j atom */
337 /* Calculate table index */
340 /* Calculate table index */
347 /* Tabulated coulomb interaction */
350 Geps = eps*VFtab[nnn+2];
351 Heps2 = eps2*VFtab[nnn+3];
354 FF = Fp+Geps+2.0*Heps2;
357 vctot = vctot + vcoul;
358 fscal = -((fijC)*tabscale)*rinv12;
360 /* Calculate temporary vectorial force */
365 /* Increment i atom force */
370 /* Decrement j atom force */
371 fjx2 = faction[j3+3] - tx;
372 fjy2 = faction[j3+4] - ty;
373 fjz2 = faction[j3+5] - tz;
375 /* Load parameters for j atom */
378 /* Calculate table index */
381 /* Calculate table index */
388 /* Tabulated coulomb interaction */
391 Geps = eps*VFtab[nnn+2];
392 Heps2 = eps2*VFtab[nnn+3];
395 FF = Fp+Geps+2.0*Heps2;
398 vctot = vctot + vcoul;
399 fscal = -((fijC)*tabscale)*rinv13;
401 /* Calculate temporary vectorial force */
406 /* Increment i atom force */
411 /* Decrement j atom force */
412 fjx3 = faction[j3+6] - tx;
413 fjy3 = faction[j3+7] - ty;
414 fjz3 = faction[j3+8] - tz;
416 /* Load parameters for j atom */
419 /* Calculate table index */
422 /* Calculate table index */
429 /* Tabulated coulomb interaction */
432 Geps = eps*VFtab[nnn+2];
433 Heps2 = eps2*VFtab[nnn+3];
436 FF = Fp+Geps+2.0*Heps2;
439 vctot = vctot + vcoul;
440 fscal = -((fijC)*tabscale)*rinv21;
442 /* Calculate temporary vectorial force */
447 /* Increment i atom force */
452 /* Decrement j atom force */
457 /* Load parameters for j atom */
460 /* Calculate table index */
463 /* Calculate table index */
470 /* Tabulated coulomb interaction */
473 Geps = eps*VFtab[nnn+2];
474 Heps2 = eps2*VFtab[nnn+3];
477 FF = Fp+Geps+2.0*Heps2;
480 vctot = vctot + vcoul;
481 fscal = -((fijC)*tabscale)*rinv22;
483 /* Calculate temporary vectorial force */
488 /* Increment i atom force */
493 /* Decrement j atom force */
498 /* Load parameters for j atom */
501 /* Calculate table index */
504 /* Calculate table index */
511 /* Tabulated coulomb interaction */
514 Geps = eps*VFtab[nnn+2];
515 Heps2 = eps2*VFtab[nnn+3];
518 FF = Fp+Geps+2.0*Heps2;
521 vctot = vctot + vcoul;
522 fscal = -((fijC)*tabscale)*rinv23;
524 /* Calculate temporary vectorial force */
529 /* Increment i atom force */
534 /* Decrement j atom force */
539 /* Load parameters for j atom */
542 /* Calculate table index */
545 /* Calculate table index */
552 /* Tabulated coulomb interaction */
555 Geps = eps*VFtab[nnn+2];
556 Heps2 = eps2*VFtab[nnn+3];
559 FF = Fp+Geps+2.0*Heps2;
562 vctot = vctot + vcoul;
563 fscal = -((fijC)*tabscale)*rinv31;
565 /* Calculate temporary vectorial force */
570 /* Increment i atom force */
575 /* Decrement j atom force */
576 faction[j3+0] = fjx1 - tx;
577 faction[j3+1] = fjy1 - ty;
578 faction[j3+2] = fjz1 - tz;
580 /* Load parameters for j atom */
583 /* Calculate table index */
586 /* Calculate table index */
593 /* Tabulated coulomb interaction */
596 Geps = eps*VFtab[nnn+2];
597 Heps2 = eps2*VFtab[nnn+3];
600 FF = Fp+Geps+2.0*Heps2;
603 vctot = vctot + vcoul;
604 fscal = -((fijC)*tabscale)*rinv32;
606 /* Calculate temporary vectorial force */
611 /* Increment i atom force */
616 /* Decrement j atom force */
617 faction[j3+3] = fjx2 - tx;
618 faction[j3+4] = fjy2 - ty;
619 faction[j3+5] = fjz2 - tz;
621 /* Load parameters for j atom */
624 /* Calculate table index */
627 /* Calculate table index */
634 /* Tabulated coulomb interaction */
637 Geps = eps*VFtab[nnn+2];
638 Heps2 = eps2*VFtab[nnn+3];
641 FF = Fp+Geps+2.0*Heps2;
644 vctot = vctot + vcoul;
645 fscal = -((fijC)*tabscale)*rinv33;
647 /* Calculate temporary vectorial force */
652 /* Increment i atom force */
657 /* Decrement j atom force */
658 faction[j3+6] = fjx3 - tx;
659 faction[j3+7] = fjy3 - ty;
660 faction[j3+8] = fjz3 - tz;
662 /* Inner loop uses 395 flops/iteration */
666 /* Add i forces to mem and shifted force list */
667 faction[ii3+0] = faction[ii3+0] + fix1;
668 faction[ii3+1] = faction[ii3+1] + fiy1;
669 faction[ii3+2] = faction[ii3+2] + fiz1;
670 faction[ii3+3] = faction[ii3+3] + fix2;
671 faction[ii3+4] = faction[ii3+4] + fiy2;
672 faction[ii3+5] = faction[ii3+5] + fiz2;
673 faction[ii3+6] = faction[ii3+6] + fix3;
674 faction[ii3+7] = faction[ii3+7] + fiy3;
675 faction[ii3+8] = faction[ii3+8] + fiz3;
676 fshift[is3] = fshift[is3]+fix1+fix2+fix3;
677 fshift[is3+1] = fshift[is3+1]+fiy1+fiy2+fiy3;
678 fshift[is3+2] = fshift[is3+2]+fiz1+fiz2+fiz3;
680 /* Add potential energies to the group for this list */
682 Vc[ggid] = Vc[ggid] + vctot;
683 Vvdw[ggid] = Vvdw[ggid] + Vvdwtot;
685 /* Increment number of inner iterations */
686 ninner = ninner + nj1 - nj0;
688 /* Outer loop uses 29 flops/iteration */
692 /* Increment number of outer iterations */
693 nouter = nouter + nn1 - nn0;
698 /* Write outer/inner iteration count to pointers */
704 int nb_kernel332_sse2_single_dummy;