2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "gromacs/math/vec.h"
49 #include "gromacs/simd/math_x86_sse4_1_double.h"
50 #include "kernelutil_x86_sse4_1_double.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse4_1_double
54 * Electrostatics interaction: Ewald
55 * VdW interaction: None
56 * Geometry: Water4-Particle
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse4_1_double
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
77 int j_coord_offsetA,j_coord_offsetB;
78 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80 real *shiftvec,*fshift,*x,*f;
81 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
85 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
87 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
88 int vdwjidx0A,vdwjidx0B;
89 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
91 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
92 __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
93 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
96 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
98 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
99 real rswitch_scalar,d_scalar;
100 __m128d dummy_mask,cutoff_mask;
101 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
102 __m128d one = _mm_set1_pd(1.0);
103 __m128d two = _mm_set1_pd(2.0);
109 jindex = nlist->jindex;
111 shiftidx = nlist->shift;
113 shiftvec = fr->shift_vec[0];
114 fshift = fr->fshift[0];
115 facel = _mm_set1_pd(fr->epsfac);
116 charge = mdatoms->chargeA;
118 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
119 ewtab = fr->ic->tabq_coul_FDV0;
120 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
121 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
123 /* Setup water-specific parameters */
124 inr = nlist->iinr[0];
125 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
126 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
127 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
129 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
130 rcutoff_scalar = fr->rcoulomb;
131 rcutoff = _mm_set1_pd(rcutoff_scalar);
132 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
134 rswitch_scalar = fr->rcoulomb_switch;
135 rswitch = _mm_set1_pd(rswitch_scalar);
136 /* Setup switch parameters */
137 d_scalar = rcutoff_scalar-rswitch_scalar;
138 d = _mm_set1_pd(d_scalar);
139 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
140 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
141 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
142 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
143 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
144 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
146 /* Avoid stupid compiler warnings */
154 /* Start outer loop over neighborlists */
155 for(iidx=0; iidx<nri; iidx++)
157 /* Load shift vector for this list */
158 i_shift_offset = DIM*shiftidx[iidx];
160 /* Load limits for loop over neighbors */
161 j_index_start = jindex[iidx];
162 j_index_end = jindex[iidx+1];
164 /* Get outer coordinate index */
166 i_coord_offset = DIM*inr;
168 /* Load i particle coords and add shift vector */
169 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
170 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
172 fix1 = _mm_setzero_pd();
173 fiy1 = _mm_setzero_pd();
174 fiz1 = _mm_setzero_pd();
175 fix2 = _mm_setzero_pd();
176 fiy2 = _mm_setzero_pd();
177 fiz2 = _mm_setzero_pd();
178 fix3 = _mm_setzero_pd();
179 fiy3 = _mm_setzero_pd();
180 fiz3 = _mm_setzero_pd();
182 /* Reset potential sums */
183 velecsum = _mm_setzero_pd();
185 /* Start inner kernel loop */
186 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
189 /* Get j neighbor index, and coordinate index */
192 j_coord_offsetA = DIM*jnrA;
193 j_coord_offsetB = DIM*jnrB;
195 /* load j atom coordinates */
196 gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
199 /* Calculate displacement vector */
200 dx10 = _mm_sub_pd(ix1,jx0);
201 dy10 = _mm_sub_pd(iy1,jy0);
202 dz10 = _mm_sub_pd(iz1,jz0);
203 dx20 = _mm_sub_pd(ix2,jx0);
204 dy20 = _mm_sub_pd(iy2,jy0);
205 dz20 = _mm_sub_pd(iz2,jz0);
206 dx30 = _mm_sub_pd(ix3,jx0);
207 dy30 = _mm_sub_pd(iy3,jy0);
208 dz30 = _mm_sub_pd(iz3,jz0);
210 /* Calculate squared distance and things based on it */
211 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
212 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
213 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
215 rinv10 = gmx_mm_invsqrt_pd(rsq10);
216 rinv20 = gmx_mm_invsqrt_pd(rsq20);
217 rinv30 = gmx_mm_invsqrt_pd(rsq30);
219 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
220 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
221 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
223 /* Load parameters for j particles */
224 jq0 = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
226 fjx0 = _mm_setzero_pd();
227 fjy0 = _mm_setzero_pd();
228 fjz0 = _mm_setzero_pd();
230 /**************************
231 * CALCULATE INTERACTIONS *
232 **************************/
234 if (gmx_mm_any_lt(rsq10,rcutoff2))
237 r10 = _mm_mul_pd(rsq10,rinv10);
239 /* Compute parameters for interactions between i and j atoms */
240 qq10 = _mm_mul_pd(iq1,jq0);
242 /* EWALD ELECTROSTATICS */
244 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
245 ewrt = _mm_mul_pd(r10,ewtabscale);
246 ewitab = _mm_cvttpd_epi32(ewrt);
247 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
248 ewitab = _mm_slli_epi32(ewitab,2);
249 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
250 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
251 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
252 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
253 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
254 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
255 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
256 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
257 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
258 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
260 d = _mm_sub_pd(r10,rswitch);
261 d = _mm_max_pd(d,_mm_setzero_pd());
262 d2 = _mm_mul_pd(d,d);
263 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
265 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
267 /* Evaluate switch function */
268 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
269 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
270 velec = _mm_mul_pd(velec,sw);
271 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
273 /* Update potential sum for this i atom from the interaction with this j atom. */
274 velec = _mm_and_pd(velec,cutoff_mask);
275 velecsum = _mm_add_pd(velecsum,velec);
279 fscal = _mm_and_pd(fscal,cutoff_mask);
281 /* Calculate temporary vectorial force */
282 tx = _mm_mul_pd(fscal,dx10);
283 ty = _mm_mul_pd(fscal,dy10);
284 tz = _mm_mul_pd(fscal,dz10);
286 /* Update vectorial force */
287 fix1 = _mm_add_pd(fix1,tx);
288 fiy1 = _mm_add_pd(fiy1,ty);
289 fiz1 = _mm_add_pd(fiz1,tz);
291 fjx0 = _mm_add_pd(fjx0,tx);
292 fjy0 = _mm_add_pd(fjy0,ty);
293 fjz0 = _mm_add_pd(fjz0,tz);
297 /**************************
298 * CALCULATE INTERACTIONS *
299 **************************/
301 if (gmx_mm_any_lt(rsq20,rcutoff2))
304 r20 = _mm_mul_pd(rsq20,rinv20);
306 /* Compute parameters for interactions between i and j atoms */
307 qq20 = _mm_mul_pd(iq2,jq0);
309 /* EWALD ELECTROSTATICS */
311 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
312 ewrt = _mm_mul_pd(r20,ewtabscale);
313 ewitab = _mm_cvttpd_epi32(ewrt);
314 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
315 ewitab = _mm_slli_epi32(ewitab,2);
316 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
317 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
318 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
319 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
320 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
321 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
322 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
323 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
324 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
325 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
327 d = _mm_sub_pd(r20,rswitch);
328 d = _mm_max_pd(d,_mm_setzero_pd());
329 d2 = _mm_mul_pd(d,d);
330 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
332 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
334 /* Evaluate switch function */
335 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
336 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
337 velec = _mm_mul_pd(velec,sw);
338 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
340 /* Update potential sum for this i atom from the interaction with this j atom. */
341 velec = _mm_and_pd(velec,cutoff_mask);
342 velecsum = _mm_add_pd(velecsum,velec);
346 fscal = _mm_and_pd(fscal,cutoff_mask);
348 /* Calculate temporary vectorial force */
349 tx = _mm_mul_pd(fscal,dx20);
350 ty = _mm_mul_pd(fscal,dy20);
351 tz = _mm_mul_pd(fscal,dz20);
353 /* Update vectorial force */
354 fix2 = _mm_add_pd(fix2,tx);
355 fiy2 = _mm_add_pd(fiy2,ty);
356 fiz2 = _mm_add_pd(fiz2,tz);
358 fjx0 = _mm_add_pd(fjx0,tx);
359 fjy0 = _mm_add_pd(fjy0,ty);
360 fjz0 = _mm_add_pd(fjz0,tz);
364 /**************************
365 * CALCULATE INTERACTIONS *
366 **************************/
368 if (gmx_mm_any_lt(rsq30,rcutoff2))
371 r30 = _mm_mul_pd(rsq30,rinv30);
373 /* Compute parameters for interactions between i and j atoms */
374 qq30 = _mm_mul_pd(iq3,jq0);
376 /* EWALD ELECTROSTATICS */
378 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
379 ewrt = _mm_mul_pd(r30,ewtabscale);
380 ewitab = _mm_cvttpd_epi32(ewrt);
381 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
382 ewitab = _mm_slli_epi32(ewitab,2);
383 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
384 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
385 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
386 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
387 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
388 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
389 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
390 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
391 velec = _mm_mul_pd(qq30,_mm_sub_pd(rinv30,velec));
392 felec = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
394 d = _mm_sub_pd(r30,rswitch);
395 d = _mm_max_pd(d,_mm_setzero_pd());
396 d2 = _mm_mul_pd(d,d);
397 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
399 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
401 /* Evaluate switch function */
402 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
403 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv30,_mm_mul_pd(velec,dsw)) );
404 velec = _mm_mul_pd(velec,sw);
405 cutoff_mask = _mm_cmplt_pd(rsq30,rcutoff2);
407 /* Update potential sum for this i atom from the interaction with this j atom. */
408 velec = _mm_and_pd(velec,cutoff_mask);
409 velecsum = _mm_add_pd(velecsum,velec);
413 fscal = _mm_and_pd(fscal,cutoff_mask);
415 /* Calculate temporary vectorial force */
416 tx = _mm_mul_pd(fscal,dx30);
417 ty = _mm_mul_pd(fscal,dy30);
418 tz = _mm_mul_pd(fscal,dz30);
420 /* Update vectorial force */
421 fix3 = _mm_add_pd(fix3,tx);
422 fiy3 = _mm_add_pd(fiy3,ty);
423 fiz3 = _mm_add_pd(fiz3,tz);
425 fjx0 = _mm_add_pd(fjx0,tx);
426 fjy0 = _mm_add_pd(fjy0,ty);
427 fjz0 = _mm_add_pd(fjz0,tz);
431 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
433 /* Inner loop uses 198 flops */
440 j_coord_offsetA = DIM*jnrA;
442 /* load j atom coordinates */
443 gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
446 /* Calculate displacement vector */
447 dx10 = _mm_sub_pd(ix1,jx0);
448 dy10 = _mm_sub_pd(iy1,jy0);
449 dz10 = _mm_sub_pd(iz1,jz0);
450 dx20 = _mm_sub_pd(ix2,jx0);
451 dy20 = _mm_sub_pd(iy2,jy0);
452 dz20 = _mm_sub_pd(iz2,jz0);
453 dx30 = _mm_sub_pd(ix3,jx0);
454 dy30 = _mm_sub_pd(iy3,jy0);
455 dz30 = _mm_sub_pd(iz3,jz0);
457 /* Calculate squared distance and things based on it */
458 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
459 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
460 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
462 rinv10 = gmx_mm_invsqrt_pd(rsq10);
463 rinv20 = gmx_mm_invsqrt_pd(rsq20);
464 rinv30 = gmx_mm_invsqrt_pd(rsq30);
466 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
467 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
468 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
470 /* Load parameters for j particles */
471 jq0 = _mm_load_sd(charge+jnrA+0);
473 fjx0 = _mm_setzero_pd();
474 fjy0 = _mm_setzero_pd();
475 fjz0 = _mm_setzero_pd();
477 /**************************
478 * CALCULATE INTERACTIONS *
479 **************************/
481 if (gmx_mm_any_lt(rsq10,rcutoff2))
484 r10 = _mm_mul_pd(rsq10,rinv10);
486 /* Compute parameters for interactions between i and j atoms */
487 qq10 = _mm_mul_pd(iq1,jq0);
489 /* EWALD ELECTROSTATICS */
491 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
492 ewrt = _mm_mul_pd(r10,ewtabscale);
493 ewitab = _mm_cvttpd_epi32(ewrt);
494 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
495 ewitab = _mm_slli_epi32(ewitab,2);
496 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
497 ewtabD = _mm_setzero_pd();
498 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
499 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
500 ewtabFn = _mm_setzero_pd();
501 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
502 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
503 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
504 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
505 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
507 d = _mm_sub_pd(r10,rswitch);
508 d = _mm_max_pd(d,_mm_setzero_pd());
509 d2 = _mm_mul_pd(d,d);
510 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
512 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
514 /* Evaluate switch function */
515 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
516 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
517 velec = _mm_mul_pd(velec,sw);
518 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
520 /* Update potential sum for this i atom from the interaction with this j atom. */
521 velec = _mm_and_pd(velec,cutoff_mask);
522 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
523 velecsum = _mm_add_pd(velecsum,velec);
527 fscal = _mm_and_pd(fscal,cutoff_mask);
529 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
531 /* Calculate temporary vectorial force */
532 tx = _mm_mul_pd(fscal,dx10);
533 ty = _mm_mul_pd(fscal,dy10);
534 tz = _mm_mul_pd(fscal,dz10);
536 /* Update vectorial force */
537 fix1 = _mm_add_pd(fix1,tx);
538 fiy1 = _mm_add_pd(fiy1,ty);
539 fiz1 = _mm_add_pd(fiz1,tz);
541 fjx0 = _mm_add_pd(fjx0,tx);
542 fjy0 = _mm_add_pd(fjy0,ty);
543 fjz0 = _mm_add_pd(fjz0,tz);
547 /**************************
548 * CALCULATE INTERACTIONS *
549 **************************/
551 if (gmx_mm_any_lt(rsq20,rcutoff2))
554 r20 = _mm_mul_pd(rsq20,rinv20);
556 /* Compute parameters for interactions between i and j atoms */
557 qq20 = _mm_mul_pd(iq2,jq0);
559 /* EWALD ELECTROSTATICS */
561 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
562 ewrt = _mm_mul_pd(r20,ewtabscale);
563 ewitab = _mm_cvttpd_epi32(ewrt);
564 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
565 ewitab = _mm_slli_epi32(ewitab,2);
566 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
567 ewtabD = _mm_setzero_pd();
568 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
569 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
570 ewtabFn = _mm_setzero_pd();
571 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
572 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
573 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
574 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
575 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
577 d = _mm_sub_pd(r20,rswitch);
578 d = _mm_max_pd(d,_mm_setzero_pd());
579 d2 = _mm_mul_pd(d,d);
580 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
582 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
584 /* Evaluate switch function */
585 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
586 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
587 velec = _mm_mul_pd(velec,sw);
588 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
590 /* Update potential sum for this i atom from the interaction with this j atom. */
591 velec = _mm_and_pd(velec,cutoff_mask);
592 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
593 velecsum = _mm_add_pd(velecsum,velec);
597 fscal = _mm_and_pd(fscal,cutoff_mask);
599 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
601 /* Calculate temporary vectorial force */
602 tx = _mm_mul_pd(fscal,dx20);
603 ty = _mm_mul_pd(fscal,dy20);
604 tz = _mm_mul_pd(fscal,dz20);
606 /* Update vectorial force */
607 fix2 = _mm_add_pd(fix2,tx);
608 fiy2 = _mm_add_pd(fiy2,ty);
609 fiz2 = _mm_add_pd(fiz2,tz);
611 fjx0 = _mm_add_pd(fjx0,tx);
612 fjy0 = _mm_add_pd(fjy0,ty);
613 fjz0 = _mm_add_pd(fjz0,tz);
617 /**************************
618 * CALCULATE INTERACTIONS *
619 **************************/
621 if (gmx_mm_any_lt(rsq30,rcutoff2))
624 r30 = _mm_mul_pd(rsq30,rinv30);
626 /* Compute parameters for interactions between i and j atoms */
627 qq30 = _mm_mul_pd(iq3,jq0);
629 /* EWALD ELECTROSTATICS */
631 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
632 ewrt = _mm_mul_pd(r30,ewtabscale);
633 ewitab = _mm_cvttpd_epi32(ewrt);
634 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
635 ewitab = _mm_slli_epi32(ewitab,2);
636 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
637 ewtabD = _mm_setzero_pd();
638 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
639 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
640 ewtabFn = _mm_setzero_pd();
641 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
642 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
643 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
644 velec = _mm_mul_pd(qq30,_mm_sub_pd(rinv30,velec));
645 felec = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
647 d = _mm_sub_pd(r30,rswitch);
648 d = _mm_max_pd(d,_mm_setzero_pd());
649 d2 = _mm_mul_pd(d,d);
650 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
652 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
654 /* Evaluate switch function */
655 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
656 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv30,_mm_mul_pd(velec,dsw)) );
657 velec = _mm_mul_pd(velec,sw);
658 cutoff_mask = _mm_cmplt_pd(rsq30,rcutoff2);
660 /* Update potential sum for this i atom from the interaction with this j atom. */
661 velec = _mm_and_pd(velec,cutoff_mask);
662 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
663 velecsum = _mm_add_pd(velecsum,velec);
667 fscal = _mm_and_pd(fscal,cutoff_mask);
669 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
671 /* Calculate temporary vectorial force */
672 tx = _mm_mul_pd(fscal,dx30);
673 ty = _mm_mul_pd(fscal,dy30);
674 tz = _mm_mul_pd(fscal,dz30);
676 /* Update vectorial force */
677 fix3 = _mm_add_pd(fix3,tx);
678 fiy3 = _mm_add_pd(fiy3,ty);
679 fiz3 = _mm_add_pd(fiz3,tz);
681 fjx0 = _mm_add_pd(fjx0,tx);
682 fjy0 = _mm_add_pd(fjy0,ty);
683 fjz0 = _mm_add_pd(fjz0,tz);
687 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
689 /* Inner loop uses 198 flops */
692 /* End of innermost loop */
694 gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
695 f+i_coord_offset+DIM,fshift+i_shift_offset);
698 /* Update potential energies */
699 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
701 /* Increment number of inner iterations */
702 inneriter += j_index_end - j_index_start;
704 /* Outer loop uses 19 flops */
707 /* Increment number of outer iterations */
710 /* Update outer/inner flops */
712 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*198);
715 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse4_1_double
716 * Electrostatics interaction: Ewald
717 * VdW interaction: None
718 * Geometry: Water4-Particle
719 * Calculate force/pot: Force
722 nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse4_1_double
723 (t_nblist * gmx_restrict nlist,
724 rvec * gmx_restrict xx,
725 rvec * gmx_restrict ff,
726 t_forcerec * gmx_restrict fr,
727 t_mdatoms * gmx_restrict mdatoms,
728 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
729 t_nrnb * gmx_restrict nrnb)
731 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
732 * just 0 for non-waters.
733 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
734 * jnr indices corresponding to data put in the four positions in the SIMD register.
736 int i_shift_offset,i_coord_offset,outeriter,inneriter;
737 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
739 int j_coord_offsetA,j_coord_offsetB;
740 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
742 real *shiftvec,*fshift,*x,*f;
743 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
745 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
747 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
749 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
750 int vdwjidx0A,vdwjidx0B;
751 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
752 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
753 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
754 __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
755 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
758 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
760 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
761 real rswitch_scalar,d_scalar;
762 __m128d dummy_mask,cutoff_mask;
763 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
764 __m128d one = _mm_set1_pd(1.0);
765 __m128d two = _mm_set1_pd(2.0);
771 jindex = nlist->jindex;
773 shiftidx = nlist->shift;
775 shiftvec = fr->shift_vec[0];
776 fshift = fr->fshift[0];
777 facel = _mm_set1_pd(fr->epsfac);
778 charge = mdatoms->chargeA;
780 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
781 ewtab = fr->ic->tabq_coul_FDV0;
782 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
783 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
785 /* Setup water-specific parameters */
786 inr = nlist->iinr[0];
787 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
788 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
789 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
791 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
792 rcutoff_scalar = fr->rcoulomb;
793 rcutoff = _mm_set1_pd(rcutoff_scalar);
794 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
796 rswitch_scalar = fr->rcoulomb_switch;
797 rswitch = _mm_set1_pd(rswitch_scalar);
798 /* Setup switch parameters */
799 d_scalar = rcutoff_scalar-rswitch_scalar;
800 d = _mm_set1_pd(d_scalar);
801 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
802 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
803 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
804 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
805 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
806 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
808 /* Avoid stupid compiler warnings */
816 /* Start outer loop over neighborlists */
817 for(iidx=0; iidx<nri; iidx++)
819 /* Load shift vector for this list */
820 i_shift_offset = DIM*shiftidx[iidx];
822 /* Load limits for loop over neighbors */
823 j_index_start = jindex[iidx];
824 j_index_end = jindex[iidx+1];
826 /* Get outer coordinate index */
828 i_coord_offset = DIM*inr;
830 /* Load i particle coords and add shift vector */
831 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
832 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
834 fix1 = _mm_setzero_pd();
835 fiy1 = _mm_setzero_pd();
836 fiz1 = _mm_setzero_pd();
837 fix2 = _mm_setzero_pd();
838 fiy2 = _mm_setzero_pd();
839 fiz2 = _mm_setzero_pd();
840 fix3 = _mm_setzero_pd();
841 fiy3 = _mm_setzero_pd();
842 fiz3 = _mm_setzero_pd();
844 /* Start inner kernel loop */
845 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
848 /* Get j neighbor index, and coordinate index */
851 j_coord_offsetA = DIM*jnrA;
852 j_coord_offsetB = DIM*jnrB;
854 /* load j atom coordinates */
855 gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
858 /* Calculate displacement vector */
859 dx10 = _mm_sub_pd(ix1,jx0);
860 dy10 = _mm_sub_pd(iy1,jy0);
861 dz10 = _mm_sub_pd(iz1,jz0);
862 dx20 = _mm_sub_pd(ix2,jx0);
863 dy20 = _mm_sub_pd(iy2,jy0);
864 dz20 = _mm_sub_pd(iz2,jz0);
865 dx30 = _mm_sub_pd(ix3,jx0);
866 dy30 = _mm_sub_pd(iy3,jy0);
867 dz30 = _mm_sub_pd(iz3,jz0);
869 /* Calculate squared distance and things based on it */
870 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
871 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
872 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
874 rinv10 = gmx_mm_invsqrt_pd(rsq10);
875 rinv20 = gmx_mm_invsqrt_pd(rsq20);
876 rinv30 = gmx_mm_invsqrt_pd(rsq30);
878 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
879 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
880 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
882 /* Load parameters for j particles */
883 jq0 = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
885 fjx0 = _mm_setzero_pd();
886 fjy0 = _mm_setzero_pd();
887 fjz0 = _mm_setzero_pd();
889 /**************************
890 * CALCULATE INTERACTIONS *
891 **************************/
893 if (gmx_mm_any_lt(rsq10,rcutoff2))
896 r10 = _mm_mul_pd(rsq10,rinv10);
898 /* Compute parameters for interactions between i and j atoms */
899 qq10 = _mm_mul_pd(iq1,jq0);
901 /* EWALD ELECTROSTATICS */
903 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
904 ewrt = _mm_mul_pd(r10,ewtabscale);
905 ewitab = _mm_cvttpd_epi32(ewrt);
906 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
907 ewitab = _mm_slli_epi32(ewitab,2);
908 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
909 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
910 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
911 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
912 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
913 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
914 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
915 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
916 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
917 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
919 d = _mm_sub_pd(r10,rswitch);
920 d = _mm_max_pd(d,_mm_setzero_pd());
921 d2 = _mm_mul_pd(d,d);
922 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
924 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
926 /* Evaluate switch function */
927 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
928 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
929 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
933 fscal = _mm_and_pd(fscal,cutoff_mask);
935 /* Calculate temporary vectorial force */
936 tx = _mm_mul_pd(fscal,dx10);
937 ty = _mm_mul_pd(fscal,dy10);
938 tz = _mm_mul_pd(fscal,dz10);
940 /* Update vectorial force */
941 fix1 = _mm_add_pd(fix1,tx);
942 fiy1 = _mm_add_pd(fiy1,ty);
943 fiz1 = _mm_add_pd(fiz1,tz);
945 fjx0 = _mm_add_pd(fjx0,tx);
946 fjy0 = _mm_add_pd(fjy0,ty);
947 fjz0 = _mm_add_pd(fjz0,tz);
951 /**************************
952 * CALCULATE INTERACTIONS *
953 **************************/
955 if (gmx_mm_any_lt(rsq20,rcutoff2))
958 r20 = _mm_mul_pd(rsq20,rinv20);
960 /* Compute parameters for interactions between i and j atoms */
961 qq20 = _mm_mul_pd(iq2,jq0);
963 /* EWALD ELECTROSTATICS */
965 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
966 ewrt = _mm_mul_pd(r20,ewtabscale);
967 ewitab = _mm_cvttpd_epi32(ewrt);
968 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
969 ewitab = _mm_slli_epi32(ewitab,2);
970 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
971 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
972 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
973 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
974 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
975 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
976 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
977 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
978 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
979 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
981 d = _mm_sub_pd(r20,rswitch);
982 d = _mm_max_pd(d,_mm_setzero_pd());
983 d2 = _mm_mul_pd(d,d);
984 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
986 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
988 /* Evaluate switch function */
989 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
990 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
991 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
995 fscal = _mm_and_pd(fscal,cutoff_mask);
997 /* Calculate temporary vectorial force */
998 tx = _mm_mul_pd(fscal,dx20);
999 ty = _mm_mul_pd(fscal,dy20);
1000 tz = _mm_mul_pd(fscal,dz20);
1002 /* Update vectorial force */
1003 fix2 = _mm_add_pd(fix2,tx);
1004 fiy2 = _mm_add_pd(fiy2,ty);
1005 fiz2 = _mm_add_pd(fiz2,tz);
1007 fjx0 = _mm_add_pd(fjx0,tx);
1008 fjy0 = _mm_add_pd(fjy0,ty);
1009 fjz0 = _mm_add_pd(fjz0,tz);
1013 /**************************
1014 * CALCULATE INTERACTIONS *
1015 **************************/
1017 if (gmx_mm_any_lt(rsq30,rcutoff2))
1020 r30 = _mm_mul_pd(rsq30,rinv30);
1022 /* Compute parameters for interactions between i and j atoms */
1023 qq30 = _mm_mul_pd(iq3,jq0);
1025 /* EWALD ELECTROSTATICS */
1027 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1028 ewrt = _mm_mul_pd(r30,ewtabscale);
1029 ewitab = _mm_cvttpd_epi32(ewrt);
1030 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1031 ewitab = _mm_slli_epi32(ewitab,2);
1032 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1033 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1034 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1035 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1036 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
1037 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1038 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1039 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1040 velec = _mm_mul_pd(qq30,_mm_sub_pd(rinv30,velec));
1041 felec = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
1043 d = _mm_sub_pd(r30,rswitch);
1044 d = _mm_max_pd(d,_mm_setzero_pd());
1045 d2 = _mm_mul_pd(d,d);
1046 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1048 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1050 /* Evaluate switch function */
1051 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1052 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv30,_mm_mul_pd(velec,dsw)) );
1053 cutoff_mask = _mm_cmplt_pd(rsq30,rcutoff2);
1057 fscal = _mm_and_pd(fscal,cutoff_mask);
1059 /* Calculate temporary vectorial force */
1060 tx = _mm_mul_pd(fscal,dx30);
1061 ty = _mm_mul_pd(fscal,dy30);
1062 tz = _mm_mul_pd(fscal,dz30);
1064 /* Update vectorial force */
1065 fix3 = _mm_add_pd(fix3,tx);
1066 fiy3 = _mm_add_pd(fiy3,ty);
1067 fiz3 = _mm_add_pd(fiz3,tz);
1069 fjx0 = _mm_add_pd(fjx0,tx);
1070 fjy0 = _mm_add_pd(fjy0,ty);
1071 fjz0 = _mm_add_pd(fjz0,tz);
1075 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
1077 /* Inner loop uses 189 flops */
1080 if(jidx<j_index_end)
1084 j_coord_offsetA = DIM*jnrA;
1086 /* load j atom coordinates */
1087 gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1090 /* Calculate displacement vector */
1091 dx10 = _mm_sub_pd(ix1,jx0);
1092 dy10 = _mm_sub_pd(iy1,jy0);
1093 dz10 = _mm_sub_pd(iz1,jz0);
1094 dx20 = _mm_sub_pd(ix2,jx0);
1095 dy20 = _mm_sub_pd(iy2,jy0);
1096 dz20 = _mm_sub_pd(iz2,jz0);
1097 dx30 = _mm_sub_pd(ix3,jx0);
1098 dy30 = _mm_sub_pd(iy3,jy0);
1099 dz30 = _mm_sub_pd(iz3,jz0);
1101 /* Calculate squared distance and things based on it */
1102 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1103 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1104 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
1106 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1107 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1108 rinv30 = gmx_mm_invsqrt_pd(rsq30);
1110 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1111 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1112 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
1114 /* Load parameters for j particles */
1115 jq0 = _mm_load_sd(charge+jnrA+0);
1117 fjx0 = _mm_setzero_pd();
1118 fjy0 = _mm_setzero_pd();
1119 fjz0 = _mm_setzero_pd();
1121 /**************************
1122 * CALCULATE INTERACTIONS *
1123 **************************/
1125 if (gmx_mm_any_lt(rsq10,rcutoff2))
1128 r10 = _mm_mul_pd(rsq10,rinv10);
1130 /* Compute parameters for interactions between i and j atoms */
1131 qq10 = _mm_mul_pd(iq1,jq0);
1133 /* EWALD ELECTROSTATICS */
1135 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1136 ewrt = _mm_mul_pd(r10,ewtabscale);
1137 ewitab = _mm_cvttpd_epi32(ewrt);
1138 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1139 ewitab = _mm_slli_epi32(ewitab,2);
1140 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1141 ewtabD = _mm_setzero_pd();
1142 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1143 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1144 ewtabFn = _mm_setzero_pd();
1145 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1146 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1147 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1148 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
1149 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1151 d = _mm_sub_pd(r10,rswitch);
1152 d = _mm_max_pd(d,_mm_setzero_pd());
1153 d2 = _mm_mul_pd(d,d);
1154 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1156 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1158 /* Evaluate switch function */
1159 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1160 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
1161 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
1165 fscal = _mm_and_pd(fscal,cutoff_mask);
1167 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1169 /* Calculate temporary vectorial force */
1170 tx = _mm_mul_pd(fscal,dx10);
1171 ty = _mm_mul_pd(fscal,dy10);
1172 tz = _mm_mul_pd(fscal,dz10);
1174 /* Update vectorial force */
1175 fix1 = _mm_add_pd(fix1,tx);
1176 fiy1 = _mm_add_pd(fiy1,ty);
1177 fiz1 = _mm_add_pd(fiz1,tz);
1179 fjx0 = _mm_add_pd(fjx0,tx);
1180 fjy0 = _mm_add_pd(fjy0,ty);
1181 fjz0 = _mm_add_pd(fjz0,tz);
1185 /**************************
1186 * CALCULATE INTERACTIONS *
1187 **************************/
1189 if (gmx_mm_any_lt(rsq20,rcutoff2))
1192 r20 = _mm_mul_pd(rsq20,rinv20);
1194 /* Compute parameters for interactions between i and j atoms */
1195 qq20 = _mm_mul_pd(iq2,jq0);
1197 /* EWALD ELECTROSTATICS */
1199 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1200 ewrt = _mm_mul_pd(r20,ewtabscale);
1201 ewitab = _mm_cvttpd_epi32(ewrt);
1202 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1203 ewitab = _mm_slli_epi32(ewitab,2);
1204 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1205 ewtabD = _mm_setzero_pd();
1206 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1207 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1208 ewtabFn = _mm_setzero_pd();
1209 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1210 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1211 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1212 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
1213 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1215 d = _mm_sub_pd(r20,rswitch);
1216 d = _mm_max_pd(d,_mm_setzero_pd());
1217 d2 = _mm_mul_pd(d,d);
1218 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1220 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1222 /* Evaluate switch function */
1223 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1224 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
1225 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
1229 fscal = _mm_and_pd(fscal,cutoff_mask);
1231 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1233 /* Calculate temporary vectorial force */
1234 tx = _mm_mul_pd(fscal,dx20);
1235 ty = _mm_mul_pd(fscal,dy20);
1236 tz = _mm_mul_pd(fscal,dz20);
1238 /* Update vectorial force */
1239 fix2 = _mm_add_pd(fix2,tx);
1240 fiy2 = _mm_add_pd(fiy2,ty);
1241 fiz2 = _mm_add_pd(fiz2,tz);
1243 fjx0 = _mm_add_pd(fjx0,tx);
1244 fjy0 = _mm_add_pd(fjy0,ty);
1245 fjz0 = _mm_add_pd(fjz0,tz);
1249 /**************************
1250 * CALCULATE INTERACTIONS *
1251 **************************/
1253 if (gmx_mm_any_lt(rsq30,rcutoff2))
1256 r30 = _mm_mul_pd(rsq30,rinv30);
1258 /* Compute parameters for interactions between i and j atoms */
1259 qq30 = _mm_mul_pd(iq3,jq0);
1261 /* EWALD ELECTROSTATICS */
1263 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1264 ewrt = _mm_mul_pd(r30,ewtabscale);
1265 ewitab = _mm_cvttpd_epi32(ewrt);
1266 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1267 ewitab = _mm_slli_epi32(ewitab,2);
1268 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1269 ewtabD = _mm_setzero_pd();
1270 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1271 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1272 ewtabFn = _mm_setzero_pd();
1273 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1274 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1275 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1276 velec = _mm_mul_pd(qq30,_mm_sub_pd(rinv30,velec));
1277 felec = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
1279 d = _mm_sub_pd(r30,rswitch);
1280 d = _mm_max_pd(d,_mm_setzero_pd());
1281 d2 = _mm_mul_pd(d,d);
1282 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1284 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1286 /* Evaluate switch function */
1287 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1288 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv30,_mm_mul_pd(velec,dsw)) );
1289 cutoff_mask = _mm_cmplt_pd(rsq30,rcutoff2);
1293 fscal = _mm_and_pd(fscal,cutoff_mask);
1295 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1297 /* Calculate temporary vectorial force */
1298 tx = _mm_mul_pd(fscal,dx30);
1299 ty = _mm_mul_pd(fscal,dy30);
1300 tz = _mm_mul_pd(fscal,dz30);
1302 /* Update vectorial force */
1303 fix3 = _mm_add_pd(fix3,tx);
1304 fiy3 = _mm_add_pd(fiy3,ty);
1305 fiz3 = _mm_add_pd(fiz3,tz);
1307 fjx0 = _mm_add_pd(fjx0,tx);
1308 fjy0 = _mm_add_pd(fjy0,ty);
1309 fjz0 = _mm_add_pd(fjz0,tz);
1313 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
1315 /* Inner loop uses 189 flops */
1318 /* End of innermost loop */
1320 gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1321 f+i_coord_offset+DIM,fshift+i_shift_offset);
1323 /* Increment number of inner iterations */
1324 inneriter += j_index_end - j_index_start;
1326 /* Outer loop uses 18 flops */
1329 /* Increment number of outer iterations */
1332 /* Update outer/inner flops */
1334 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*189);