2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_sse2_single.h"
48 #include "kernelutil_x86_sse2_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse2_single
52 * Electrostatics interaction: Ewald
53 * VdW interaction: LennardJones
54 * Geometry: Water4-Particle
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse2_single
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
94 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
95 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
96 __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
97 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
100 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
103 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
104 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
106 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
108 __m128 dummy_mask,cutoff_mask;
109 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
110 __m128 one = _mm_set1_ps(1.0);
111 __m128 two = _mm_set1_ps(2.0);
117 jindex = nlist->jindex;
119 shiftidx = nlist->shift;
121 shiftvec = fr->shift_vec[0];
122 fshift = fr->fshift[0];
123 facel = _mm_set1_ps(fr->epsfac);
124 charge = mdatoms->chargeA;
125 nvdwtype = fr->ntype;
127 vdwtype = mdatoms->typeA;
129 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
130 ewtab = fr->ic->tabq_coul_FDV0;
131 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
132 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
134 /* Setup water-specific parameters */
135 inr = nlist->iinr[0];
136 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
137 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
138 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
139 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
141 /* Avoid stupid compiler warnings */
142 jnrA = jnrB = jnrC = jnrD = 0;
151 for(iidx=0;iidx<4*DIM;iidx++)
156 /* Start outer loop over neighborlists */
157 for(iidx=0; iidx<nri; iidx++)
159 /* Load shift vector for this list */
160 i_shift_offset = DIM*shiftidx[iidx];
162 /* Load limits for loop over neighbors */
163 j_index_start = jindex[iidx];
164 j_index_end = jindex[iidx+1];
166 /* Get outer coordinate index */
168 i_coord_offset = DIM*inr;
170 /* Load i particle coords and add shift vector */
171 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
172 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
174 fix0 = _mm_setzero_ps();
175 fiy0 = _mm_setzero_ps();
176 fiz0 = _mm_setzero_ps();
177 fix1 = _mm_setzero_ps();
178 fiy1 = _mm_setzero_ps();
179 fiz1 = _mm_setzero_ps();
180 fix2 = _mm_setzero_ps();
181 fiy2 = _mm_setzero_ps();
182 fiz2 = _mm_setzero_ps();
183 fix3 = _mm_setzero_ps();
184 fiy3 = _mm_setzero_ps();
185 fiz3 = _mm_setzero_ps();
187 /* Reset potential sums */
188 velecsum = _mm_setzero_ps();
189 vvdwsum = _mm_setzero_ps();
191 /* Start inner kernel loop */
192 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
195 /* Get j neighbor index, and coordinate index */
200 j_coord_offsetA = DIM*jnrA;
201 j_coord_offsetB = DIM*jnrB;
202 j_coord_offsetC = DIM*jnrC;
203 j_coord_offsetD = DIM*jnrD;
205 /* load j atom coordinates */
206 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
207 x+j_coord_offsetC,x+j_coord_offsetD,
210 /* Calculate displacement vector */
211 dx00 = _mm_sub_ps(ix0,jx0);
212 dy00 = _mm_sub_ps(iy0,jy0);
213 dz00 = _mm_sub_ps(iz0,jz0);
214 dx10 = _mm_sub_ps(ix1,jx0);
215 dy10 = _mm_sub_ps(iy1,jy0);
216 dz10 = _mm_sub_ps(iz1,jz0);
217 dx20 = _mm_sub_ps(ix2,jx0);
218 dy20 = _mm_sub_ps(iy2,jy0);
219 dz20 = _mm_sub_ps(iz2,jz0);
220 dx30 = _mm_sub_ps(ix3,jx0);
221 dy30 = _mm_sub_ps(iy3,jy0);
222 dz30 = _mm_sub_ps(iz3,jz0);
224 /* Calculate squared distance and things based on it */
225 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
226 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
227 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
228 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
230 rinv10 = gmx_mm_invsqrt_ps(rsq10);
231 rinv20 = gmx_mm_invsqrt_ps(rsq20);
232 rinv30 = gmx_mm_invsqrt_ps(rsq30);
234 rinvsq00 = gmx_mm_inv_ps(rsq00);
235 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
236 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
237 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
239 /* Load parameters for j particles */
240 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
241 charge+jnrC+0,charge+jnrD+0);
242 vdwjidx0A = 2*vdwtype[jnrA+0];
243 vdwjidx0B = 2*vdwtype[jnrB+0];
244 vdwjidx0C = 2*vdwtype[jnrC+0];
245 vdwjidx0D = 2*vdwtype[jnrD+0];
247 fjx0 = _mm_setzero_ps();
248 fjy0 = _mm_setzero_ps();
249 fjz0 = _mm_setzero_ps();
251 /**************************
252 * CALCULATE INTERACTIONS *
253 **************************/
255 /* Compute parameters for interactions between i and j atoms */
256 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
257 vdwparam+vdwioffset0+vdwjidx0B,
258 vdwparam+vdwioffset0+vdwjidx0C,
259 vdwparam+vdwioffset0+vdwjidx0D,
262 /* LENNARD-JONES DISPERSION/REPULSION */
264 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
265 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
266 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
267 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
268 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
270 /* Update potential sum for this i atom from the interaction with this j atom. */
271 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
275 /* Calculate temporary vectorial force */
276 tx = _mm_mul_ps(fscal,dx00);
277 ty = _mm_mul_ps(fscal,dy00);
278 tz = _mm_mul_ps(fscal,dz00);
280 /* Update vectorial force */
281 fix0 = _mm_add_ps(fix0,tx);
282 fiy0 = _mm_add_ps(fiy0,ty);
283 fiz0 = _mm_add_ps(fiz0,tz);
285 fjx0 = _mm_add_ps(fjx0,tx);
286 fjy0 = _mm_add_ps(fjy0,ty);
287 fjz0 = _mm_add_ps(fjz0,tz);
289 /**************************
290 * CALCULATE INTERACTIONS *
291 **************************/
293 r10 = _mm_mul_ps(rsq10,rinv10);
295 /* Compute parameters for interactions between i and j atoms */
296 qq10 = _mm_mul_ps(iq1,jq0);
298 /* EWALD ELECTROSTATICS */
300 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
301 ewrt = _mm_mul_ps(r10,ewtabscale);
302 ewitab = _mm_cvttps_epi32(ewrt);
303 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
304 ewitab = _mm_slli_epi32(ewitab,2);
305 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
306 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
307 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
308 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
309 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
310 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
311 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
312 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
313 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
315 /* Update potential sum for this i atom from the interaction with this j atom. */
316 velecsum = _mm_add_ps(velecsum,velec);
320 /* Calculate temporary vectorial force */
321 tx = _mm_mul_ps(fscal,dx10);
322 ty = _mm_mul_ps(fscal,dy10);
323 tz = _mm_mul_ps(fscal,dz10);
325 /* Update vectorial force */
326 fix1 = _mm_add_ps(fix1,tx);
327 fiy1 = _mm_add_ps(fiy1,ty);
328 fiz1 = _mm_add_ps(fiz1,tz);
330 fjx0 = _mm_add_ps(fjx0,tx);
331 fjy0 = _mm_add_ps(fjy0,ty);
332 fjz0 = _mm_add_ps(fjz0,tz);
334 /**************************
335 * CALCULATE INTERACTIONS *
336 **************************/
338 r20 = _mm_mul_ps(rsq20,rinv20);
340 /* Compute parameters for interactions between i and j atoms */
341 qq20 = _mm_mul_ps(iq2,jq0);
343 /* EWALD ELECTROSTATICS */
345 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
346 ewrt = _mm_mul_ps(r20,ewtabscale);
347 ewitab = _mm_cvttps_epi32(ewrt);
348 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
349 ewitab = _mm_slli_epi32(ewitab,2);
350 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
351 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
352 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
353 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
354 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
355 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
356 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
357 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
358 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
360 /* Update potential sum for this i atom from the interaction with this j atom. */
361 velecsum = _mm_add_ps(velecsum,velec);
365 /* Calculate temporary vectorial force */
366 tx = _mm_mul_ps(fscal,dx20);
367 ty = _mm_mul_ps(fscal,dy20);
368 tz = _mm_mul_ps(fscal,dz20);
370 /* Update vectorial force */
371 fix2 = _mm_add_ps(fix2,tx);
372 fiy2 = _mm_add_ps(fiy2,ty);
373 fiz2 = _mm_add_ps(fiz2,tz);
375 fjx0 = _mm_add_ps(fjx0,tx);
376 fjy0 = _mm_add_ps(fjy0,ty);
377 fjz0 = _mm_add_ps(fjz0,tz);
379 /**************************
380 * CALCULATE INTERACTIONS *
381 **************************/
383 r30 = _mm_mul_ps(rsq30,rinv30);
385 /* Compute parameters for interactions between i and j atoms */
386 qq30 = _mm_mul_ps(iq3,jq0);
388 /* EWALD ELECTROSTATICS */
390 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
391 ewrt = _mm_mul_ps(r30,ewtabscale);
392 ewitab = _mm_cvttps_epi32(ewrt);
393 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
394 ewitab = _mm_slli_epi32(ewitab,2);
395 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
396 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
397 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
398 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
399 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
400 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
401 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
402 velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
403 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
405 /* Update potential sum for this i atom from the interaction with this j atom. */
406 velecsum = _mm_add_ps(velecsum,velec);
410 /* Calculate temporary vectorial force */
411 tx = _mm_mul_ps(fscal,dx30);
412 ty = _mm_mul_ps(fscal,dy30);
413 tz = _mm_mul_ps(fscal,dz30);
415 /* Update vectorial force */
416 fix3 = _mm_add_ps(fix3,tx);
417 fiy3 = _mm_add_ps(fiy3,ty);
418 fiz3 = _mm_add_ps(fiz3,tz);
420 fjx0 = _mm_add_ps(fjx0,tx);
421 fjy0 = _mm_add_ps(fjy0,ty);
422 fjz0 = _mm_add_ps(fjz0,tz);
424 fjptrA = f+j_coord_offsetA;
425 fjptrB = f+j_coord_offsetB;
426 fjptrC = f+j_coord_offsetC;
427 fjptrD = f+j_coord_offsetD;
429 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
431 /* Inner loop uses 155 flops */
437 /* Get j neighbor index, and coordinate index */
438 jnrlistA = jjnr[jidx];
439 jnrlistB = jjnr[jidx+1];
440 jnrlistC = jjnr[jidx+2];
441 jnrlistD = jjnr[jidx+3];
442 /* Sign of each element will be negative for non-real atoms.
443 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
444 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
446 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
447 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
448 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
449 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
450 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
451 j_coord_offsetA = DIM*jnrA;
452 j_coord_offsetB = DIM*jnrB;
453 j_coord_offsetC = DIM*jnrC;
454 j_coord_offsetD = DIM*jnrD;
456 /* load j atom coordinates */
457 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
458 x+j_coord_offsetC,x+j_coord_offsetD,
461 /* Calculate displacement vector */
462 dx00 = _mm_sub_ps(ix0,jx0);
463 dy00 = _mm_sub_ps(iy0,jy0);
464 dz00 = _mm_sub_ps(iz0,jz0);
465 dx10 = _mm_sub_ps(ix1,jx0);
466 dy10 = _mm_sub_ps(iy1,jy0);
467 dz10 = _mm_sub_ps(iz1,jz0);
468 dx20 = _mm_sub_ps(ix2,jx0);
469 dy20 = _mm_sub_ps(iy2,jy0);
470 dz20 = _mm_sub_ps(iz2,jz0);
471 dx30 = _mm_sub_ps(ix3,jx0);
472 dy30 = _mm_sub_ps(iy3,jy0);
473 dz30 = _mm_sub_ps(iz3,jz0);
475 /* Calculate squared distance and things based on it */
476 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
477 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
478 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
479 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
481 rinv10 = gmx_mm_invsqrt_ps(rsq10);
482 rinv20 = gmx_mm_invsqrt_ps(rsq20);
483 rinv30 = gmx_mm_invsqrt_ps(rsq30);
485 rinvsq00 = gmx_mm_inv_ps(rsq00);
486 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
487 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
488 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
490 /* Load parameters for j particles */
491 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
492 charge+jnrC+0,charge+jnrD+0);
493 vdwjidx0A = 2*vdwtype[jnrA+0];
494 vdwjidx0B = 2*vdwtype[jnrB+0];
495 vdwjidx0C = 2*vdwtype[jnrC+0];
496 vdwjidx0D = 2*vdwtype[jnrD+0];
498 fjx0 = _mm_setzero_ps();
499 fjy0 = _mm_setzero_ps();
500 fjz0 = _mm_setzero_ps();
502 /**************************
503 * CALCULATE INTERACTIONS *
504 **************************/
506 /* Compute parameters for interactions between i and j atoms */
507 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
508 vdwparam+vdwioffset0+vdwjidx0B,
509 vdwparam+vdwioffset0+vdwjidx0C,
510 vdwparam+vdwioffset0+vdwjidx0D,
513 /* LENNARD-JONES DISPERSION/REPULSION */
515 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
516 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
517 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
518 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
519 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
521 /* Update potential sum for this i atom from the interaction with this j atom. */
522 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
523 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
527 fscal = _mm_andnot_ps(dummy_mask,fscal);
529 /* Calculate temporary vectorial force */
530 tx = _mm_mul_ps(fscal,dx00);
531 ty = _mm_mul_ps(fscal,dy00);
532 tz = _mm_mul_ps(fscal,dz00);
534 /* Update vectorial force */
535 fix0 = _mm_add_ps(fix0,tx);
536 fiy0 = _mm_add_ps(fiy0,ty);
537 fiz0 = _mm_add_ps(fiz0,tz);
539 fjx0 = _mm_add_ps(fjx0,tx);
540 fjy0 = _mm_add_ps(fjy0,ty);
541 fjz0 = _mm_add_ps(fjz0,tz);
543 /**************************
544 * CALCULATE INTERACTIONS *
545 **************************/
547 r10 = _mm_mul_ps(rsq10,rinv10);
548 r10 = _mm_andnot_ps(dummy_mask,r10);
550 /* Compute parameters for interactions between i and j atoms */
551 qq10 = _mm_mul_ps(iq1,jq0);
553 /* EWALD ELECTROSTATICS */
555 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
556 ewrt = _mm_mul_ps(r10,ewtabscale);
557 ewitab = _mm_cvttps_epi32(ewrt);
558 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
559 ewitab = _mm_slli_epi32(ewitab,2);
560 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
561 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
562 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
563 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
564 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
565 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
566 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
567 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
568 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
570 /* Update potential sum for this i atom from the interaction with this j atom. */
571 velec = _mm_andnot_ps(dummy_mask,velec);
572 velecsum = _mm_add_ps(velecsum,velec);
576 fscal = _mm_andnot_ps(dummy_mask,fscal);
578 /* Calculate temporary vectorial force */
579 tx = _mm_mul_ps(fscal,dx10);
580 ty = _mm_mul_ps(fscal,dy10);
581 tz = _mm_mul_ps(fscal,dz10);
583 /* Update vectorial force */
584 fix1 = _mm_add_ps(fix1,tx);
585 fiy1 = _mm_add_ps(fiy1,ty);
586 fiz1 = _mm_add_ps(fiz1,tz);
588 fjx0 = _mm_add_ps(fjx0,tx);
589 fjy0 = _mm_add_ps(fjy0,ty);
590 fjz0 = _mm_add_ps(fjz0,tz);
592 /**************************
593 * CALCULATE INTERACTIONS *
594 **************************/
596 r20 = _mm_mul_ps(rsq20,rinv20);
597 r20 = _mm_andnot_ps(dummy_mask,r20);
599 /* Compute parameters for interactions between i and j atoms */
600 qq20 = _mm_mul_ps(iq2,jq0);
602 /* EWALD ELECTROSTATICS */
604 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
605 ewrt = _mm_mul_ps(r20,ewtabscale);
606 ewitab = _mm_cvttps_epi32(ewrt);
607 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
608 ewitab = _mm_slli_epi32(ewitab,2);
609 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
610 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
611 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
612 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
613 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
614 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
615 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
616 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
617 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
619 /* Update potential sum for this i atom from the interaction with this j atom. */
620 velec = _mm_andnot_ps(dummy_mask,velec);
621 velecsum = _mm_add_ps(velecsum,velec);
625 fscal = _mm_andnot_ps(dummy_mask,fscal);
627 /* Calculate temporary vectorial force */
628 tx = _mm_mul_ps(fscal,dx20);
629 ty = _mm_mul_ps(fscal,dy20);
630 tz = _mm_mul_ps(fscal,dz20);
632 /* Update vectorial force */
633 fix2 = _mm_add_ps(fix2,tx);
634 fiy2 = _mm_add_ps(fiy2,ty);
635 fiz2 = _mm_add_ps(fiz2,tz);
637 fjx0 = _mm_add_ps(fjx0,tx);
638 fjy0 = _mm_add_ps(fjy0,ty);
639 fjz0 = _mm_add_ps(fjz0,tz);
641 /**************************
642 * CALCULATE INTERACTIONS *
643 **************************/
645 r30 = _mm_mul_ps(rsq30,rinv30);
646 r30 = _mm_andnot_ps(dummy_mask,r30);
648 /* Compute parameters for interactions between i and j atoms */
649 qq30 = _mm_mul_ps(iq3,jq0);
651 /* EWALD ELECTROSTATICS */
653 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
654 ewrt = _mm_mul_ps(r30,ewtabscale);
655 ewitab = _mm_cvttps_epi32(ewrt);
656 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
657 ewitab = _mm_slli_epi32(ewitab,2);
658 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
659 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
660 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
661 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
662 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
663 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
664 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
665 velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
666 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
668 /* Update potential sum for this i atom from the interaction with this j atom. */
669 velec = _mm_andnot_ps(dummy_mask,velec);
670 velecsum = _mm_add_ps(velecsum,velec);
674 fscal = _mm_andnot_ps(dummy_mask,fscal);
676 /* Calculate temporary vectorial force */
677 tx = _mm_mul_ps(fscal,dx30);
678 ty = _mm_mul_ps(fscal,dy30);
679 tz = _mm_mul_ps(fscal,dz30);
681 /* Update vectorial force */
682 fix3 = _mm_add_ps(fix3,tx);
683 fiy3 = _mm_add_ps(fiy3,ty);
684 fiz3 = _mm_add_ps(fiz3,tz);
686 fjx0 = _mm_add_ps(fjx0,tx);
687 fjy0 = _mm_add_ps(fjy0,ty);
688 fjz0 = _mm_add_ps(fjz0,tz);
690 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
691 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
692 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
693 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
695 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
697 /* Inner loop uses 158 flops */
700 /* End of innermost loop */
702 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
703 f+i_coord_offset,fshift+i_shift_offset);
706 /* Update potential energies */
707 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
708 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
710 /* Increment number of inner iterations */
711 inneriter += j_index_end - j_index_start;
713 /* Outer loop uses 26 flops */
716 /* Increment number of outer iterations */
719 /* Update outer/inner flops */
721 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*158);
724 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse2_single
725 * Electrostatics interaction: Ewald
726 * VdW interaction: LennardJones
727 * Geometry: Water4-Particle
728 * Calculate force/pot: Force
731 nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse2_single
732 (t_nblist * gmx_restrict nlist,
733 rvec * gmx_restrict xx,
734 rvec * gmx_restrict ff,
735 t_forcerec * gmx_restrict fr,
736 t_mdatoms * gmx_restrict mdatoms,
737 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
738 t_nrnb * gmx_restrict nrnb)
740 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
741 * just 0 for non-waters.
742 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
743 * jnr indices corresponding to data put in the four positions in the SIMD register.
745 int i_shift_offset,i_coord_offset,outeriter,inneriter;
746 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
747 int jnrA,jnrB,jnrC,jnrD;
748 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
749 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
750 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
752 real *shiftvec,*fshift,*x,*f;
753 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
755 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
757 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
759 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
761 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
763 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
764 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
765 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
766 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
767 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
768 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
769 __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
770 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
773 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
776 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
777 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
779 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
781 __m128 dummy_mask,cutoff_mask;
782 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
783 __m128 one = _mm_set1_ps(1.0);
784 __m128 two = _mm_set1_ps(2.0);
790 jindex = nlist->jindex;
792 shiftidx = nlist->shift;
794 shiftvec = fr->shift_vec[0];
795 fshift = fr->fshift[0];
796 facel = _mm_set1_ps(fr->epsfac);
797 charge = mdatoms->chargeA;
798 nvdwtype = fr->ntype;
800 vdwtype = mdatoms->typeA;
802 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
803 ewtab = fr->ic->tabq_coul_F;
804 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
805 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
807 /* Setup water-specific parameters */
808 inr = nlist->iinr[0];
809 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
810 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
811 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
812 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
814 /* Avoid stupid compiler warnings */
815 jnrA = jnrB = jnrC = jnrD = 0;
824 for(iidx=0;iidx<4*DIM;iidx++)
829 /* Start outer loop over neighborlists */
830 for(iidx=0; iidx<nri; iidx++)
832 /* Load shift vector for this list */
833 i_shift_offset = DIM*shiftidx[iidx];
835 /* Load limits for loop over neighbors */
836 j_index_start = jindex[iidx];
837 j_index_end = jindex[iidx+1];
839 /* Get outer coordinate index */
841 i_coord_offset = DIM*inr;
843 /* Load i particle coords and add shift vector */
844 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
845 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
847 fix0 = _mm_setzero_ps();
848 fiy0 = _mm_setzero_ps();
849 fiz0 = _mm_setzero_ps();
850 fix1 = _mm_setzero_ps();
851 fiy1 = _mm_setzero_ps();
852 fiz1 = _mm_setzero_ps();
853 fix2 = _mm_setzero_ps();
854 fiy2 = _mm_setzero_ps();
855 fiz2 = _mm_setzero_ps();
856 fix3 = _mm_setzero_ps();
857 fiy3 = _mm_setzero_ps();
858 fiz3 = _mm_setzero_ps();
860 /* Start inner kernel loop */
861 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
864 /* Get j neighbor index, and coordinate index */
869 j_coord_offsetA = DIM*jnrA;
870 j_coord_offsetB = DIM*jnrB;
871 j_coord_offsetC = DIM*jnrC;
872 j_coord_offsetD = DIM*jnrD;
874 /* load j atom coordinates */
875 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
876 x+j_coord_offsetC,x+j_coord_offsetD,
879 /* Calculate displacement vector */
880 dx00 = _mm_sub_ps(ix0,jx0);
881 dy00 = _mm_sub_ps(iy0,jy0);
882 dz00 = _mm_sub_ps(iz0,jz0);
883 dx10 = _mm_sub_ps(ix1,jx0);
884 dy10 = _mm_sub_ps(iy1,jy0);
885 dz10 = _mm_sub_ps(iz1,jz0);
886 dx20 = _mm_sub_ps(ix2,jx0);
887 dy20 = _mm_sub_ps(iy2,jy0);
888 dz20 = _mm_sub_ps(iz2,jz0);
889 dx30 = _mm_sub_ps(ix3,jx0);
890 dy30 = _mm_sub_ps(iy3,jy0);
891 dz30 = _mm_sub_ps(iz3,jz0);
893 /* Calculate squared distance and things based on it */
894 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
895 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
896 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
897 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
899 rinv10 = gmx_mm_invsqrt_ps(rsq10);
900 rinv20 = gmx_mm_invsqrt_ps(rsq20);
901 rinv30 = gmx_mm_invsqrt_ps(rsq30);
903 rinvsq00 = gmx_mm_inv_ps(rsq00);
904 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
905 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
906 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
908 /* Load parameters for j particles */
909 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
910 charge+jnrC+0,charge+jnrD+0);
911 vdwjidx0A = 2*vdwtype[jnrA+0];
912 vdwjidx0B = 2*vdwtype[jnrB+0];
913 vdwjidx0C = 2*vdwtype[jnrC+0];
914 vdwjidx0D = 2*vdwtype[jnrD+0];
916 fjx0 = _mm_setzero_ps();
917 fjy0 = _mm_setzero_ps();
918 fjz0 = _mm_setzero_ps();
920 /**************************
921 * CALCULATE INTERACTIONS *
922 **************************/
924 /* Compute parameters for interactions between i and j atoms */
925 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
926 vdwparam+vdwioffset0+vdwjidx0B,
927 vdwparam+vdwioffset0+vdwjidx0C,
928 vdwparam+vdwioffset0+vdwjidx0D,
931 /* LENNARD-JONES DISPERSION/REPULSION */
933 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
934 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
938 /* Calculate temporary vectorial force */
939 tx = _mm_mul_ps(fscal,dx00);
940 ty = _mm_mul_ps(fscal,dy00);
941 tz = _mm_mul_ps(fscal,dz00);
943 /* Update vectorial force */
944 fix0 = _mm_add_ps(fix0,tx);
945 fiy0 = _mm_add_ps(fiy0,ty);
946 fiz0 = _mm_add_ps(fiz0,tz);
948 fjx0 = _mm_add_ps(fjx0,tx);
949 fjy0 = _mm_add_ps(fjy0,ty);
950 fjz0 = _mm_add_ps(fjz0,tz);
952 /**************************
953 * CALCULATE INTERACTIONS *
954 **************************/
956 r10 = _mm_mul_ps(rsq10,rinv10);
958 /* Compute parameters for interactions between i and j atoms */
959 qq10 = _mm_mul_ps(iq1,jq0);
961 /* EWALD ELECTROSTATICS */
963 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
964 ewrt = _mm_mul_ps(r10,ewtabscale);
965 ewitab = _mm_cvttps_epi32(ewrt);
966 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
967 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
968 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
970 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
971 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
975 /* Calculate temporary vectorial force */
976 tx = _mm_mul_ps(fscal,dx10);
977 ty = _mm_mul_ps(fscal,dy10);
978 tz = _mm_mul_ps(fscal,dz10);
980 /* Update vectorial force */
981 fix1 = _mm_add_ps(fix1,tx);
982 fiy1 = _mm_add_ps(fiy1,ty);
983 fiz1 = _mm_add_ps(fiz1,tz);
985 fjx0 = _mm_add_ps(fjx0,tx);
986 fjy0 = _mm_add_ps(fjy0,ty);
987 fjz0 = _mm_add_ps(fjz0,tz);
989 /**************************
990 * CALCULATE INTERACTIONS *
991 **************************/
993 r20 = _mm_mul_ps(rsq20,rinv20);
995 /* Compute parameters for interactions between i and j atoms */
996 qq20 = _mm_mul_ps(iq2,jq0);
998 /* EWALD ELECTROSTATICS */
1000 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1001 ewrt = _mm_mul_ps(r20,ewtabscale);
1002 ewitab = _mm_cvttps_epi32(ewrt);
1003 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1004 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1005 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1007 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1008 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1012 /* Calculate temporary vectorial force */
1013 tx = _mm_mul_ps(fscal,dx20);
1014 ty = _mm_mul_ps(fscal,dy20);
1015 tz = _mm_mul_ps(fscal,dz20);
1017 /* Update vectorial force */
1018 fix2 = _mm_add_ps(fix2,tx);
1019 fiy2 = _mm_add_ps(fiy2,ty);
1020 fiz2 = _mm_add_ps(fiz2,tz);
1022 fjx0 = _mm_add_ps(fjx0,tx);
1023 fjy0 = _mm_add_ps(fjy0,ty);
1024 fjz0 = _mm_add_ps(fjz0,tz);
1026 /**************************
1027 * CALCULATE INTERACTIONS *
1028 **************************/
1030 r30 = _mm_mul_ps(rsq30,rinv30);
1032 /* Compute parameters for interactions between i and j atoms */
1033 qq30 = _mm_mul_ps(iq3,jq0);
1035 /* EWALD ELECTROSTATICS */
1037 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1038 ewrt = _mm_mul_ps(r30,ewtabscale);
1039 ewitab = _mm_cvttps_epi32(ewrt);
1040 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1041 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1042 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1044 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1045 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
1049 /* Calculate temporary vectorial force */
1050 tx = _mm_mul_ps(fscal,dx30);
1051 ty = _mm_mul_ps(fscal,dy30);
1052 tz = _mm_mul_ps(fscal,dz30);
1054 /* Update vectorial force */
1055 fix3 = _mm_add_ps(fix3,tx);
1056 fiy3 = _mm_add_ps(fiy3,ty);
1057 fiz3 = _mm_add_ps(fiz3,tz);
1059 fjx0 = _mm_add_ps(fjx0,tx);
1060 fjy0 = _mm_add_ps(fjy0,ty);
1061 fjz0 = _mm_add_ps(fjz0,tz);
1063 fjptrA = f+j_coord_offsetA;
1064 fjptrB = f+j_coord_offsetB;
1065 fjptrC = f+j_coord_offsetC;
1066 fjptrD = f+j_coord_offsetD;
1068 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1070 /* Inner loop uses 135 flops */
1073 if(jidx<j_index_end)
1076 /* Get j neighbor index, and coordinate index */
1077 jnrlistA = jjnr[jidx];
1078 jnrlistB = jjnr[jidx+1];
1079 jnrlistC = jjnr[jidx+2];
1080 jnrlistD = jjnr[jidx+3];
1081 /* Sign of each element will be negative for non-real atoms.
1082 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1083 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1085 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1086 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1087 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1088 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1089 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1090 j_coord_offsetA = DIM*jnrA;
1091 j_coord_offsetB = DIM*jnrB;
1092 j_coord_offsetC = DIM*jnrC;
1093 j_coord_offsetD = DIM*jnrD;
1095 /* load j atom coordinates */
1096 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1097 x+j_coord_offsetC,x+j_coord_offsetD,
1100 /* Calculate displacement vector */
1101 dx00 = _mm_sub_ps(ix0,jx0);
1102 dy00 = _mm_sub_ps(iy0,jy0);
1103 dz00 = _mm_sub_ps(iz0,jz0);
1104 dx10 = _mm_sub_ps(ix1,jx0);
1105 dy10 = _mm_sub_ps(iy1,jy0);
1106 dz10 = _mm_sub_ps(iz1,jz0);
1107 dx20 = _mm_sub_ps(ix2,jx0);
1108 dy20 = _mm_sub_ps(iy2,jy0);
1109 dz20 = _mm_sub_ps(iz2,jz0);
1110 dx30 = _mm_sub_ps(ix3,jx0);
1111 dy30 = _mm_sub_ps(iy3,jy0);
1112 dz30 = _mm_sub_ps(iz3,jz0);
1114 /* Calculate squared distance and things based on it */
1115 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1116 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1117 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1118 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
1120 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1121 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1122 rinv30 = gmx_mm_invsqrt_ps(rsq30);
1124 rinvsq00 = gmx_mm_inv_ps(rsq00);
1125 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1126 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1127 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
1129 /* Load parameters for j particles */
1130 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1131 charge+jnrC+0,charge+jnrD+0);
1132 vdwjidx0A = 2*vdwtype[jnrA+0];
1133 vdwjidx0B = 2*vdwtype[jnrB+0];
1134 vdwjidx0C = 2*vdwtype[jnrC+0];
1135 vdwjidx0D = 2*vdwtype[jnrD+0];
1137 fjx0 = _mm_setzero_ps();
1138 fjy0 = _mm_setzero_ps();
1139 fjz0 = _mm_setzero_ps();
1141 /**************************
1142 * CALCULATE INTERACTIONS *
1143 **************************/
1145 /* Compute parameters for interactions between i and j atoms */
1146 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
1147 vdwparam+vdwioffset0+vdwjidx0B,
1148 vdwparam+vdwioffset0+vdwjidx0C,
1149 vdwparam+vdwioffset0+vdwjidx0D,
1152 /* LENNARD-JONES DISPERSION/REPULSION */
1154 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1155 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1159 fscal = _mm_andnot_ps(dummy_mask,fscal);
1161 /* Calculate temporary vectorial force */
1162 tx = _mm_mul_ps(fscal,dx00);
1163 ty = _mm_mul_ps(fscal,dy00);
1164 tz = _mm_mul_ps(fscal,dz00);
1166 /* Update vectorial force */
1167 fix0 = _mm_add_ps(fix0,tx);
1168 fiy0 = _mm_add_ps(fiy0,ty);
1169 fiz0 = _mm_add_ps(fiz0,tz);
1171 fjx0 = _mm_add_ps(fjx0,tx);
1172 fjy0 = _mm_add_ps(fjy0,ty);
1173 fjz0 = _mm_add_ps(fjz0,tz);
1175 /**************************
1176 * CALCULATE INTERACTIONS *
1177 **************************/
1179 r10 = _mm_mul_ps(rsq10,rinv10);
1180 r10 = _mm_andnot_ps(dummy_mask,r10);
1182 /* Compute parameters for interactions between i and j atoms */
1183 qq10 = _mm_mul_ps(iq1,jq0);
1185 /* EWALD ELECTROSTATICS */
1187 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1188 ewrt = _mm_mul_ps(r10,ewtabscale);
1189 ewitab = _mm_cvttps_epi32(ewrt);
1190 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1191 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1192 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1194 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1195 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1199 fscal = _mm_andnot_ps(dummy_mask,fscal);
1201 /* Calculate temporary vectorial force */
1202 tx = _mm_mul_ps(fscal,dx10);
1203 ty = _mm_mul_ps(fscal,dy10);
1204 tz = _mm_mul_ps(fscal,dz10);
1206 /* Update vectorial force */
1207 fix1 = _mm_add_ps(fix1,tx);
1208 fiy1 = _mm_add_ps(fiy1,ty);
1209 fiz1 = _mm_add_ps(fiz1,tz);
1211 fjx0 = _mm_add_ps(fjx0,tx);
1212 fjy0 = _mm_add_ps(fjy0,ty);
1213 fjz0 = _mm_add_ps(fjz0,tz);
1215 /**************************
1216 * CALCULATE INTERACTIONS *
1217 **************************/
1219 r20 = _mm_mul_ps(rsq20,rinv20);
1220 r20 = _mm_andnot_ps(dummy_mask,r20);
1222 /* Compute parameters for interactions between i and j atoms */
1223 qq20 = _mm_mul_ps(iq2,jq0);
1225 /* EWALD ELECTROSTATICS */
1227 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1228 ewrt = _mm_mul_ps(r20,ewtabscale);
1229 ewitab = _mm_cvttps_epi32(ewrt);
1230 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1231 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1232 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1234 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1235 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1239 fscal = _mm_andnot_ps(dummy_mask,fscal);
1241 /* Calculate temporary vectorial force */
1242 tx = _mm_mul_ps(fscal,dx20);
1243 ty = _mm_mul_ps(fscal,dy20);
1244 tz = _mm_mul_ps(fscal,dz20);
1246 /* Update vectorial force */
1247 fix2 = _mm_add_ps(fix2,tx);
1248 fiy2 = _mm_add_ps(fiy2,ty);
1249 fiz2 = _mm_add_ps(fiz2,tz);
1251 fjx0 = _mm_add_ps(fjx0,tx);
1252 fjy0 = _mm_add_ps(fjy0,ty);
1253 fjz0 = _mm_add_ps(fjz0,tz);
1255 /**************************
1256 * CALCULATE INTERACTIONS *
1257 **************************/
1259 r30 = _mm_mul_ps(rsq30,rinv30);
1260 r30 = _mm_andnot_ps(dummy_mask,r30);
1262 /* Compute parameters for interactions between i and j atoms */
1263 qq30 = _mm_mul_ps(iq3,jq0);
1265 /* EWALD ELECTROSTATICS */
1267 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1268 ewrt = _mm_mul_ps(r30,ewtabscale);
1269 ewitab = _mm_cvttps_epi32(ewrt);
1270 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1271 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1272 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1274 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1275 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
1279 fscal = _mm_andnot_ps(dummy_mask,fscal);
1281 /* Calculate temporary vectorial force */
1282 tx = _mm_mul_ps(fscal,dx30);
1283 ty = _mm_mul_ps(fscal,dy30);
1284 tz = _mm_mul_ps(fscal,dz30);
1286 /* Update vectorial force */
1287 fix3 = _mm_add_ps(fix3,tx);
1288 fiy3 = _mm_add_ps(fiy3,ty);
1289 fiz3 = _mm_add_ps(fiz3,tz);
1291 fjx0 = _mm_add_ps(fjx0,tx);
1292 fjy0 = _mm_add_ps(fjy0,ty);
1293 fjz0 = _mm_add_ps(fjz0,tz);
1295 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1296 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1297 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1298 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1300 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1302 /* Inner loop uses 138 flops */
1305 /* End of innermost loop */
1307 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1308 f+i_coord_offset,fshift+i_shift_offset);
1310 /* Increment number of inner iterations */
1311 inneriter += j_index_end - j_index_start;
1313 /* Outer loop uses 24 flops */
1316 /* Increment number of outer iterations */
1319 /* Update outer/inner flops */
1321 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*138);