2 * This source code is part of
6 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
7 * Copyright (c) 2001-2009, The GROMACS Development Team
9 * Gromacs is a library for molecular simulation and trajectory analysis,
10 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
11 * a full list of developers and information, check out http://www.gromacs.org
13 * This program is free software; you can redistribute it and/or modify it under
14 * the terms of the GNU Lesser General Public License as published by the Free
15 * Software Foundation; either version 2 of the License, or (at your option) any
17 * As a special exception, you may use this file as part of a free software
18 * library without restriction. Specifically, if other files instantiate
19 * templates or use macros or inline functions from this file, or you compile
20 * this file and link it with other files to produce an executable, this
21 * file does not by itself cause the resulting executable to be covered by
22 * the GNU Lesser General Public License.
24 * In plain-speak: do not worry about classes/macros/templates either - only
25 * changes to the library have to be LGPL, not an application linking with it.
27 * To help fund GROMACS development, we humbly ask that you cite
28 * the papers people have written on it - you can find them on the website!
39 #include <emmintrin.h>
41 #include "sse_common_single.h"
45 * Gromacs nonbonded kernel nb_kernel112
46 * Coulomb interaction: Normal Coulomb
47 * VdW interaction: Lennard-Jones
48 * water optimization: pairs of SPC/TIP3P interactions
49 * Calculate forces: yes
51 void nb_kernel110_sse2_single(
84 int nri,ntype,nthreads,offset;
85 float facel,krf,crf,tabscale,gbtabscale;
86 int n,ii,is3,ii3,k,nj0,nj1,jnr,j3,ggid;
90 int j3A,j3B,j3C,j3D,jnrA,jnrB,jnrC,jnrD;
91 float tfloat1[4],tfloat2[4],tfloat3[4];
93 __m128 mask1 = _mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
94 __m128 mask2 = _mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
95 __m128 mask3 = _mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
98 __m128 qqOO,qqOH,qqHH,c6,c12,Vvdwtot,vctot;
99 __m128 dx11,dy11,dz11,dx12,dy12,dz12,dx13,dy13,dz13;
100 __m128 dx21,dy21,dz21,dx22,dy22,dz22,dx23,dy23,dz23;
101 __m128 dx31,dy31,dz31,dx32,dy32,dz32,dx33,dy33,dz33;
102 __m128 fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3;
103 __m128 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3;
104 __m128 rsq11,rsq12,rsq13,rsq21,rsq22,rsq23,rsq31,rsq32,rsq33;
106 __m128 ix1,iy1,iz1,ix2,iy2,iz2,ix3,iy3,iz3;
107 __m128 jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3;
109 __m128 rinv11,rinv12,rinv13,rinv21,rinv22,rinv23,rinv31,rinv32,rinv33;
110 __m128 fscal11,fscal12,fscal13,fscal21,fscal22,fscal23,fscal31,fscal32,fscal33;
112 __m128 half = _mm_set1_ps(0.5);
113 __m128 three = _mm_set1_ps(3.0);
114 __m128 six = _mm_set1_ps(6.0);
115 __m128 twelve = _mm_set1_ps(12.0);
119 nthreads = *p_nthreads;
123 tabscale = *p_tabscale;
127 qqOO = _mm_set1_ps(facel*qO*qO);
128 qqOH = _mm_set1_ps(facel*qO*qH);
129 qqHH = _mm_set1_ps(facel*qH*qH);
130 tj = 2*(ntype+1)*type[ii];
131 c6 = _mm_set1_ps(vdwparam[tj]);
132 c12 = _mm_set1_ps(vdwparam[tj+1]);
136 for(n=0; (n<nri); n++)
140 shY = shiftvec[is3+1];
141 shZ = shiftvec[is3+2];
147 ix1 = _mm_set1_ps(shX+pos[ii3+0]);
148 iy1 = _mm_set1_ps(shY+pos[ii3+1]);
149 iz1 = _mm_set1_ps(shZ+pos[ii3+2]);
151 vctot = _mm_setzero_ps();
152 Vvdwtot = _mm_setzero_ps();
153 fix1 = _mm_setzero_ps();
154 fiy1 = _mm_setzero_ps();
155 fiz1 = _mm_setzero_ps();
157 for(k=nj0; k<nj1-3; k+=4)
169 GMX_MM_LOAD_4JCOORD_1ATOM_PS(pos+j3A,pos+j3B,pos+j3C,pos+j3D,jx1,jy1,jz1);
171 dx11 = _mm_sub_ps(ix1,jx1);
172 dy11 = _mm_sub_ps(iy1,jy1);
173 dz11 = _mm_sub_ps(iz1,jz1);
175 rsq11 = gmx_mm_scalarprod_ps(dx11,dy11,dz11);
178 rinv11 = gmx_mm_invsqrt_ps(rsq11);
180 /***********************************/
181 /* INTERACTION SECTION STARTS HERE */
182 /***********************************/
184 fscal11 = _mm_add_ps(gmx_mm_int_coulomb_ps(rinv11,qqOO,&vctot),
185 gmx_mm_int_lj_ps(rinv11,c6,c12,&Vvdwtot));
186 fscal11 = _mm_mul_ps(fscal11,_mm_mul_ps(rinv11,rinv11)); /* *=rinvsq */
188 /***********************************/
189 /* INTERACTION SECTION ENDS HERE */
190 /***********************************/
192 /* Calculate vectorial forces from fscal */
194 tx = _mm_mul_ps(fscal11,dx11);
195 ty = _mm_mul_ps(fscal11,dy11);
196 tz = _mm_mul_ps(fscal11,dz11);
197 fix1 = _mm_add_ps(fix1,tx);
198 fiy1 = _mm_add_ps(fiy1,ty);
199 fiz1 = _mm_add_ps(fiz1,tz);
204 /* Store j forces back */
205 GMX_MM_UPDATE_4JCOORD_1ATOM_PS(faction+j3A,faction+j3B,faction+j3C,faction+j3D,fjx1,fjy1,fjz1);
208 /* Treat offset particles here */
220 GMX_MM_LOAD_3JCOORD_1ATOM_PS(pos+j3A,pos+j3B,pos+j3C,jx1,jy1,jz1);
228 GMX_MM_LOAD_2JCOORD_1ATOM_PS(pos+j3A,pos+j3B,jx1,jy1,jz1);
234 GMX_MM_LOAD_1JCOORD_1ATOM_PS(pos+j3A,jx1,jy1,jz1);
237 dx11 = _mm_sub_ps(ix1,jx1);
238 dy11 = _mm_sub_ps(iy1,jy1);
239 dz11 = _mm_sub_ps(iz1,jz1);
241 rsq11 = gmx_mm_scalarprod_ps(dx11,dy11,dz11);
244 rinv11 = _mm_and_ps(mask,gmx_mm_invsqrt_ps(rsq11));
246 /***********************************/
247 /* INTERACTION SECTION STARTS HERE */
248 /***********************************/
250 fscal11 = _mm_add_ps(gmx_mm_int_coulomb_ps(rinv11,qqOO,&vctot),
251 gmx_mm_int_lj_ps(rinv11,c6,c12,&Vvdwtot));
252 fscal11 = _mm_mul_ps(fscal11,_mm_mul_ps(rinv11,rinv11)); /* *=rinvsq */
254 /***********************************/
255 /* INTERACTION SECTION ENDS HERE */
256 /***********************************/
258 /* Calculate vectorial forces */
259 tx = _mm_mul_ps(fscal11,dx11);
260 ty = _mm_mul_ps(fscal11,dy11);
261 tz = _mm_mul_ps(fscal11,dz11);
262 fix1 = _mm_add_ps(fix1,tx);
263 fiy1 = _mm_add_ps(fiy1,ty);
264 fiz1 = _mm_add_ps(fiz1,tz);
271 GMX_MM_UPDATE_3JCOORD_1ATOM_PS(faction+j3A,faction+j3B,faction+j3C,fjx1,fjy1,fjz1);
275 GMX_MM_UPDATE_2JCOORD_1ATOM_PS(faction+j3A,faction+j3B,fjx1,fjy1,fjz1);
279 GMX_MM_UPDATE_1JCOORD_1ATOM_PS(faction+j3A,fjx1,fjy1,fjz1);
283 gmx_mm_update_iforce_1atom_ps(&fix1,&fiy1,&fiz1,faction+ii3,fshift+is3);
286 GMX_MM_UPDATE_2POT_PS(vctot,Vc+ggid,Vvdwtot,Vvdw+ggid);
294 int nb_kernel110_sse2_single_dummy;