2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_sse2_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse2_double
51 * Electrostatics interaction: ReactionField
52 * VdW interaction: LennardJones
53 * Geometry: Water4-Water4
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse2_double
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int j_coord_offsetA,j_coord_offsetB;
75 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
77 real *shiftvec,*fshift,*x,*f;
78 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
80 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
82 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
84 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
86 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
87 int vdwjidx0A,vdwjidx0B;
88 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
89 int vdwjidx1A,vdwjidx1B;
90 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
91 int vdwjidx2A,vdwjidx2B;
92 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
93 int vdwjidx3A,vdwjidx3B;
94 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
95 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
97 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
98 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
99 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
100 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
101 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
102 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
103 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
104 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
105 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
108 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
111 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
112 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
113 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
114 real rswitch_scalar,d_scalar;
115 __m128d dummy_mask,cutoff_mask;
116 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
117 __m128d one = _mm_set1_pd(1.0);
118 __m128d two = _mm_set1_pd(2.0);
124 jindex = nlist->jindex;
126 shiftidx = nlist->shift;
128 shiftvec = fr->shift_vec[0];
129 fshift = fr->fshift[0];
130 facel = _mm_set1_pd(fr->ic->epsfac);
131 charge = mdatoms->chargeA;
132 krf = _mm_set1_pd(fr->ic->k_rf);
133 krf2 = _mm_set1_pd(fr->ic->k_rf*2.0);
134 crf = _mm_set1_pd(fr->ic->c_rf);
135 nvdwtype = fr->ntype;
137 vdwtype = mdatoms->typeA;
139 /* Setup water-specific parameters */
140 inr = nlist->iinr[0];
141 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
142 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
143 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
144 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
146 jq1 = _mm_set1_pd(charge[inr+1]);
147 jq2 = _mm_set1_pd(charge[inr+2]);
148 jq3 = _mm_set1_pd(charge[inr+3]);
149 vdwjidx0A = 2*vdwtype[inr+0];
150 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
151 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
152 qq11 = _mm_mul_pd(iq1,jq1);
153 qq12 = _mm_mul_pd(iq1,jq2);
154 qq13 = _mm_mul_pd(iq1,jq3);
155 qq21 = _mm_mul_pd(iq2,jq1);
156 qq22 = _mm_mul_pd(iq2,jq2);
157 qq23 = _mm_mul_pd(iq2,jq3);
158 qq31 = _mm_mul_pd(iq3,jq1);
159 qq32 = _mm_mul_pd(iq3,jq2);
160 qq33 = _mm_mul_pd(iq3,jq3);
162 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
163 rcutoff_scalar = fr->ic->rcoulomb;
164 rcutoff = _mm_set1_pd(rcutoff_scalar);
165 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
167 rswitch_scalar = fr->ic->rvdw_switch;
168 rswitch = _mm_set1_pd(rswitch_scalar);
169 /* Setup switch parameters */
170 d_scalar = rcutoff_scalar-rswitch_scalar;
171 d = _mm_set1_pd(d_scalar);
172 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
173 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
174 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
175 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
176 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
177 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
179 /* Avoid stupid compiler warnings */
187 /* Start outer loop over neighborlists */
188 for(iidx=0; iidx<nri; iidx++)
190 /* Load shift vector for this list */
191 i_shift_offset = DIM*shiftidx[iidx];
193 /* Load limits for loop over neighbors */
194 j_index_start = jindex[iidx];
195 j_index_end = jindex[iidx+1];
197 /* Get outer coordinate index */
199 i_coord_offset = DIM*inr;
201 /* Load i particle coords and add shift vector */
202 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
203 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
205 fix0 = _mm_setzero_pd();
206 fiy0 = _mm_setzero_pd();
207 fiz0 = _mm_setzero_pd();
208 fix1 = _mm_setzero_pd();
209 fiy1 = _mm_setzero_pd();
210 fiz1 = _mm_setzero_pd();
211 fix2 = _mm_setzero_pd();
212 fiy2 = _mm_setzero_pd();
213 fiz2 = _mm_setzero_pd();
214 fix3 = _mm_setzero_pd();
215 fiy3 = _mm_setzero_pd();
216 fiz3 = _mm_setzero_pd();
218 /* Reset potential sums */
219 velecsum = _mm_setzero_pd();
220 vvdwsum = _mm_setzero_pd();
222 /* Start inner kernel loop */
223 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
226 /* Get j neighbor index, and coordinate index */
229 j_coord_offsetA = DIM*jnrA;
230 j_coord_offsetB = DIM*jnrB;
232 /* load j atom coordinates */
233 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
234 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
235 &jy2,&jz2,&jx3,&jy3,&jz3);
237 /* Calculate displacement vector */
238 dx00 = _mm_sub_pd(ix0,jx0);
239 dy00 = _mm_sub_pd(iy0,jy0);
240 dz00 = _mm_sub_pd(iz0,jz0);
241 dx11 = _mm_sub_pd(ix1,jx1);
242 dy11 = _mm_sub_pd(iy1,jy1);
243 dz11 = _mm_sub_pd(iz1,jz1);
244 dx12 = _mm_sub_pd(ix1,jx2);
245 dy12 = _mm_sub_pd(iy1,jy2);
246 dz12 = _mm_sub_pd(iz1,jz2);
247 dx13 = _mm_sub_pd(ix1,jx3);
248 dy13 = _mm_sub_pd(iy1,jy3);
249 dz13 = _mm_sub_pd(iz1,jz3);
250 dx21 = _mm_sub_pd(ix2,jx1);
251 dy21 = _mm_sub_pd(iy2,jy1);
252 dz21 = _mm_sub_pd(iz2,jz1);
253 dx22 = _mm_sub_pd(ix2,jx2);
254 dy22 = _mm_sub_pd(iy2,jy2);
255 dz22 = _mm_sub_pd(iz2,jz2);
256 dx23 = _mm_sub_pd(ix2,jx3);
257 dy23 = _mm_sub_pd(iy2,jy3);
258 dz23 = _mm_sub_pd(iz2,jz3);
259 dx31 = _mm_sub_pd(ix3,jx1);
260 dy31 = _mm_sub_pd(iy3,jy1);
261 dz31 = _mm_sub_pd(iz3,jz1);
262 dx32 = _mm_sub_pd(ix3,jx2);
263 dy32 = _mm_sub_pd(iy3,jy2);
264 dz32 = _mm_sub_pd(iz3,jz2);
265 dx33 = _mm_sub_pd(ix3,jx3);
266 dy33 = _mm_sub_pd(iy3,jy3);
267 dz33 = _mm_sub_pd(iz3,jz3);
269 /* Calculate squared distance and things based on it */
270 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
271 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
272 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
273 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
274 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
275 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
276 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
277 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
278 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
279 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
281 rinv00 = sse2_invsqrt_d(rsq00);
282 rinv11 = sse2_invsqrt_d(rsq11);
283 rinv12 = sse2_invsqrt_d(rsq12);
284 rinv13 = sse2_invsqrt_d(rsq13);
285 rinv21 = sse2_invsqrt_d(rsq21);
286 rinv22 = sse2_invsqrt_d(rsq22);
287 rinv23 = sse2_invsqrt_d(rsq23);
288 rinv31 = sse2_invsqrt_d(rsq31);
289 rinv32 = sse2_invsqrt_d(rsq32);
290 rinv33 = sse2_invsqrt_d(rsq33);
292 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
293 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
294 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
295 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
296 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
297 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
298 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
299 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
300 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
301 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
303 fjx0 = _mm_setzero_pd();
304 fjy0 = _mm_setzero_pd();
305 fjz0 = _mm_setzero_pd();
306 fjx1 = _mm_setzero_pd();
307 fjy1 = _mm_setzero_pd();
308 fjz1 = _mm_setzero_pd();
309 fjx2 = _mm_setzero_pd();
310 fjy2 = _mm_setzero_pd();
311 fjz2 = _mm_setzero_pd();
312 fjx3 = _mm_setzero_pd();
313 fjy3 = _mm_setzero_pd();
314 fjz3 = _mm_setzero_pd();
316 /**************************
317 * CALCULATE INTERACTIONS *
318 **************************/
320 if (gmx_mm_any_lt(rsq00,rcutoff2))
323 r00 = _mm_mul_pd(rsq00,rinv00);
325 /* LENNARD-JONES DISPERSION/REPULSION */
327 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
328 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
329 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
330 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
331 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
333 d = _mm_sub_pd(r00,rswitch);
334 d = _mm_max_pd(d,_mm_setzero_pd());
335 d2 = _mm_mul_pd(d,d);
336 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
338 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
340 /* Evaluate switch function */
341 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
342 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
343 vvdw = _mm_mul_pd(vvdw,sw);
344 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
346 /* Update potential sum for this i atom from the interaction with this j atom. */
347 vvdw = _mm_and_pd(vvdw,cutoff_mask);
348 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
352 fscal = _mm_and_pd(fscal,cutoff_mask);
354 /* Calculate temporary vectorial force */
355 tx = _mm_mul_pd(fscal,dx00);
356 ty = _mm_mul_pd(fscal,dy00);
357 tz = _mm_mul_pd(fscal,dz00);
359 /* Update vectorial force */
360 fix0 = _mm_add_pd(fix0,tx);
361 fiy0 = _mm_add_pd(fiy0,ty);
362 fiz0 = _mm_add_pd(fiz0,tz);
364 fjx0 = _mm_add_pd(fjx0,tx);
365 fjy0 = _mm_add_pd(fjy0,ty);
366 fjz0 = _mm_add_pd(fjz0,tz);
370 /**************************
371 * CALCULATE INTERACTIONS *
372 **************************/
374 if (gmx_mm_any_lt(rsq11,rcutoff2))
377 /* REACTION-FIELD ELECTROSTATICS */
378 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_add_pd(rinv11,_mm_mul_pd(krf,rsq11)),crf));
379 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
381 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
383 /* Update potential sum for this i atom from the interaction with this j atom. */
384 velec = _mm_and_pd(velec,cutoff_mask);
385 velecsum = _mm_add_pd(velecsum,velec);
389 fscal = _mm_and_pd(fscal,cutoff_mask);
391 /* Calculate temporary vectorial force */
392 tx = _mm_mul_pd(fscal,dx11);
393 ty = _mm_mul_pd(fscal,dy11);
394 tz = _mm_mul_pd(fscal,dz11);
396 /* Update vectorial force */
397 fix1 = _mm_add_pd(fix1,tx);
398 fiy1 = _mm_add_pd(fiy1,ty);
399 fiz1 = _mm_add_pd(fiz1,tz);
401 fjx1 = _mm_add_pd(fjx1,tx);
402 fjy1 = _mm_add_pd(fjy1,ty);
403 fjz1 = _mm_add_pd(fjz1,tz);
407 /**************************
408 * CALCULATE INTERACTIONS *
409 **************************/
411 if (gmx_mm_any_lt(rsq12,rcutoff2))
414 /* REACTION-FIELD ELECTROSTATICS */
415 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_add_pd(rinv12,_mm_mul_pd(krf,rsq12)),crf));
416 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
418 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
420 /* Update potential sum for this i atom from the interaction with this j atom. */
421 velec = _mm_and_pd(velec,cutoff_mask);
422 velecsum = _mm_add_pd(velecsum,velec);
426 fscal = _mm_and_pd(fscal,cutoff_mask);
428 /* Calculate temporary vectorial force */
429 tx = _mm_mul_pd(fscal,dx12);
430 ty = _mm_mul_pd(fscal,dy12);
431 tz = _mm_mul_pd(fscal,dz12);
433 /* Update vectorial force */
434 fix1 = _mm_add_pd(fix1,tx);
435 fiy1 = _mm_add_pd(fiy1,ty);
436 fiz1 = _mm_add_pd(fiz1,tz);
438 fjx2 = _mm_add_pd(fjx2,tx);
439 fjy2 = _mm_add_pd(fjy2,ty);
440 fjz2 = _mm_add_pd(fjz2,tz);
444 /**************************
445 * CALCULATE INTERACTIONS *
446 **************************/
448 if (gmx_mm_any_lt(rsq13,rcutoff2))
451 /* REACTION-FIELD ELECTROSTATICS */
452 velec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_add_pd(rinv13,_mm_mul_pd(krf,rsq13)),crf));
453 felec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_mul_pd(rinv13,rinvsq13),krf2));
455 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
457 /* Update potential sum for this i atom from the interaction with this j atom. */
458 velec = _mm_and_pd(velec,cutoff_mask);
459 velecsum = _mm_add_pd(velecsum,velec);
463 fscal = _mm_and_pd(fscal,cutoff_mask);
465 /* Calculate temporary vectorial force */
466 tx = _mm_mul_pd(fscal,dx13);
467 ty = _mm_mul_pd(fscal,dy13);
468 tz = _mm_mul_pd(fscal,dz13);
470 /* Update vectorial force */
471 fix1 = _mm_add_pd(fix1,tx);
472 fiy1 = _mm_add_pd(fiy1,ty);
473 fiz1 = _mm_add_pd(fiz1,tz);
475 fjx3 = _mm_add_pd(fjx3,tx);
476 fjy3 = _mm_add_pd(fjy3,ty);
477 fjz3 = _mm_add_pd(fjz3,tz);
481 /**************************
482 * CALCULATE INTERACTIONS *
483 **************************/
485 if (gmx_mm_any_lt(rsq21,rcutoff2))
488 /* REACTION-FIELD ELECTROSTATICS */
489 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_add_pd(rinv21,_mm_mul_pd(krf,rsq21)),crf));
490 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
492 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
494 /* Update potential sum for this i atom from the interaction with this j atom. */
495 velec = _mm_and_pd(velec,cutoff_mask);
496 velecsum = _mm_add_pd(velecsum,velec);
500 fscal = _mm_and_pd(fscal,cutoff_mask);
502 /* Calculate temporary vectorial force */
503 tx = _mm_mul_pd(fscal,dx21);
504 ty = _mm_mul_pd(fscal,dy21);
505 tz = _mm_mul_pd(fscal,dz21);
507 /* Update vectorial force */
508 fix2 = _mm_add_pd(fix2,tx);
509 fiy2 = _mm_add_pd(fiy2,ty);
510 fiz2 = _mm_add_pd(fiz2,tz);
512 fjx1 = _mm_add_pd(fjx1,tx);
513 fjy1 = _mm_add_pd(fjy1,ty);
514 fjz1 = _mm_add_pd(fjz1,tz);
518 /**************************
519 * CALCULATE INTERACTIONS *
520 **************************/
522 if (gmx_mm_any_lt(rsq22,rcutoff2))
525 /* REACTION-FIELD ELECTROSTATICS */
526 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_add_pd(rinv22,_mm_mul_pd(krf,rsq22)),crf));
527 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
529 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
531 /* Update potential sum for this i atom from the interaction with this j atom. */
532 velec = _mm_and_pd(velec,cutoff_mask);
533 velecsum = _mm_add_pd(velecsum,velec);
537 fscal = _mm_and_pd(fscal,cutoff_mask);
539 /* Calculate temporary vectorial force */
540 tx = _mm_mul_pd(fscal,dx22);
541 ty = _mm_mul_pd(fscal,dy22);
542 tz = _mm_mul_pd(fscal,dz22);
544 /* Update vectorial force */
545 fix2 = _mm_add_pd(fix2,tx);
546 fiy2 = _mm_add_pd(fiy2,ty);
547 fiz2 = _mm_add_pd(fiz2,tz);
549 fjx2 = _mm_add_pd(fjx2,tx);
550 fjy2 = _mm_add_pd(fjy2,ty);
551 fjz2 = _mm_add_pd(fjz2,tz);
555 /**************************
556 * CALCULATE INTERACTIONS *
557 **************************/
559 if (gmx_mm_any_lt(rsq23,rcutoff2))
562 /* REACTION-FIELD ELECTROSTATICS */
563 velec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_add_pd(rinv23,_mm_mul_pd(krf,rsq23)),crf));
564 felec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_mul_pd(rinv23,rinvsq23),krf2));
566 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
568 /* Update potential sum for this i atom from the interaction with this j atom. */
569 velec = _mm_and_pd(velec,cutoff_mask);
570 velecsum = _mm_add_pd(velecsum,velec);
574 fscal = _mm_and_pd(fscal,cutoff_mask);
576 /* Calculate temporary vectorial force */
577 tx = _mm_mul_pd(fscal,dx23);
578 ty = _mm_mul_pd(fscal,dy23);
579 tz = _mm_mul_pd(fscal,dz23);
581 /* Update vectorial force */
582 fix2 = _mm_add_pd(fix2,tx);
583 fiy2 = _mm_add_pd(fiy2,ty);
584 fiz2 = _mm_add_pd(fiz2,tz);
586 fjx3 = _mm_add_pd(fjx3,tx);
587 fjy3 = _mm_add_pd(fjy3,ty);
588 fjz3 = _mm_add_pd(fjz3,tz);
592 /**************************
593 * CALCULATE INTERACTIONS *
594 **************************/
596 if (gmx_mm_any_lt(rsq31,rcutoff2))
599 /* REACTION-FIELD ELECTROSTATICS */
600 velec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_add_pd(rinv31,_mm_mul_pd(krf,rsq31)),crf));
601 felec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_mul_pd(rinv31,rinvsq31),krf2));
603 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
605 /* Update potential sum for this i atom from the interaction with this j atom. */
606 velec = _mm_and_pd(velec,cutoff_mask);
607 velecsum = _mm_add_pd(velecsum,velec);
611 fscal = _mm_and_pd(fscal,cutoff_mask);
613 /* Calculate temporary vectorial force */
614 tx = _mm_mul_pd(fscal,dx31);
615 ty = _mm_mul_pd(fscal,dy31);
616 tz = _mm_mul_pd(fscal,dz31);
618 /* Update vectorial force */
619 fix3 = _mm_add_pd(fix3,tx);
620 fiy3 = _mm_add_pd(fiy3,ty);
621 fiz3 = _mm_add_pd(fiz3,tz);
623 fjx1 = _mm_add_pd(fjx1,tx);
624 fjy1 = _mm_add_pd(fjy1,ty);
625 fjz1 = _mm_add_pd(fjz1,tz);
629 /**************************
630 * CALCULATE INTERACTIONS *
631 **************************/
633 if (gmx_mm_any_lt(rsq32,rcutoff2))
636 /* REACTION-FIELD ELECTROSTATICS */
637 velec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_add_pd(rinv32,_mm_mul_pd(krf,rsq32)),crf));
638 felec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_mul_pd(rinv32,rinvsq32),krf2));
640 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
642 /* Update potential sum for this i atom from the interaction with this j atom. */
643 velec = _mm_and_pd(velec,cutoff_mask);
644 velecsum = _mm_add_pd(velecsum,velec);
648 fscal = _mm_and_pd(fscal,cutoff_mask);
650 /* Calculate temporary vectorial force */
651 tx = _mm_mul_pd(fscal,dx32);
652 ty = _mm_mul_pd(fscal,dy32);
653 tz = _mm_mul_pd(fscal,dz32);
655 /* Update vectorial force */
656 fix3 = _mm_add_pd(fix3,tx);
657 fiy3 = _mm_add_pd(fiy3,ty);
658 fiz3 = _mm_add_pd(fiz3,tz);
660 fjx2 = _mm_add_pd(fjx2,tx);
661 fjy2 = _mm_add_pd(fjy2,ty);
662 fjz2 = _mm_add_pd(fjz2,tz);
666 /**************************
667 * CALCULATE INTERACTIONS *
668 **************************/
670 if (gmx_mm_any_lt(rsq33,rcutoff2))
673 /* REACTION-FIELD ELECTROSTATICS */
674 velec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_add_pd(rinv33,_mm_mul_pd(krf,rsq33)),crf));
675 felec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_mul_pd(rinv33,rinvsq33),krf2));
677 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
679 /* Update potential sum for this i atom from the interaction with this j atom. */
680 velec = _mm_and_pd(velec,cutoff_mask);
681 velecsum = _mm_add_pd(velecsum,velec);
685 fscal = _mm_and_pd(fscal,cutoff_mask);
687 /* Calculate temporary vectorial force */
688 tx = _mm_mul_pd(fscal,dx33);
689 ty = _mm_mul_pd(fscal,dy33);
690 tz = _mm_mul_pd(fscal,dz33);
692 /* Update vectorial force */
693 fix3 = _mm_add_pd(fix3,tx);
694 fiy3 = _mm_add_pd(fiy3,ty);
695 fiz3 = _mm_add_pd(fiz3,tz);
697 fjx3 = _mm_add_pd(fjx3,tx);
698 fjy3 = _mm_add_pd(fjy3,ty);
699 fjz3 = _mm_add_pd(fjz3,tz);
703 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
705 /* Inner loop uses 386 flops */
712 j_coord_offsetA = DIM*jnrA;
714 /* load j atom coordinates */
715 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
716 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
717 &jy2,&jz2,&jx3,&jy3,&jz3);
719 /* Calculate displacement vector */
720 dx00 = _mm_sub_pd(ix0,jx0);
721 dy00 = _mm_sub_pd(iy0,jy0);
722 dz00 = _mm_sub_pd(iz0,jz0);
723 dx11 = _mm_sub_pd(ix1,jx1);
724 dy11 = _mm_sub_pd(iy1,jy1);
725 dz11 = _mm_sub_pd(iz1,jz1);
726 dx12 = _mm_sub_pd(ix1,jx2);
727 dy12 = _mm_sub_pd(iy1,jy2);
728 dz12 = _mm_sub_pd(iz1,jz2);
729 dx13 = _mm_sub_pd(ix1,jx3);
730 dy13 = _mm_sub_pd(iy1,jy3);
731 dz13 = _mm_sub_pd(iz1,jz3);
732 dx21 = _mm_sub_pd(ix2,jx1);
733 dy21 = _mm_sub_pd(iy2,jy1);
734 dz21 = _mm_sub_pd(iz2,jz1);
735 dx22 = _mm_sub_pd(ix2,jx2);
736 dy22 = _mm_sub_pd(iy2,jy2);
737 dz22 = _mm_sub_pd(iz2,jz2);
738 dx23 = _mm_sub_pd(ix2,jx3);
739 dy23 = _mm_sub_pd(iy2,jy3);
740 dz23 = _mm_sub_pd(iz2,jz3);
741 dx31 = _mm_sub_pd(ix3,jx1);
742 dy31 = _mm_sub_pd(iy3,jy1);
743 dz31 = _mm_sub_pd(iz3,jz1);
744 dx32 = _mm_sub_pd(ix3,jx2);
745 dy32 = _mm_sub_pd(iy3,jy2);
746 dz32 = _mm_sub_pd(iz3,jz2);
747 dx33 = _mm_sub_pd(ix3,jx3);
748 dy33 = _mm_sub_pd(iy3,jy3);
749 dz33 = _mm_sub_pd(iz3,jz3);
751 /* Calculate squared distance and things based on it */
752 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
753 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
754 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
755 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
756 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
757 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
758 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
759 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
760 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
761 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
763 rinv00 = sse2_invsqrt_d(rsq00);
764 rinv11 = sse2_invsqrt_d(rsq11);
765 rinv12 = sse2_invsqrt_d(rsq12);
766 rinv13 = sse2_invsqrt_d(rsq13);
767 rinv21 = sse2_invsqrt_d(rsq21);
768 rinv22 = sse2_invsqrt_d(rsq22);
769 rinv23 = sse2_invsqrt_d(rsq23);
770 rinv31 = sse2_invsqrt_d(rsq31);
771 rinv32 = sse2_invsqrt_d(rsq32);
772 rinv33 = sse2_invsqrt_d(rsq33);
774 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
775 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
776 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
777 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
778 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
779 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
780 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
781 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
782 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
783 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
785 fjx0 = _mm_setzero_pd();
786 fjy0 = _mm_setzero_pd();
787 fjz0 = _mm_setzero_pd();
788 fjx1 = _mm_setzero_pd();
789 fjy1 = _mm_setzero_pd();
790 fjz1 = _mm_setzero_pd();
791 fjx2 = _mm_setzero_pd();
792 fjy2 = _mm_setzero_pd();
793 fjz2 = _mm_setzero_pd();
794 fjx3 = _mm_setzero_pd();
795 fjy3 = _mm_setzero_pd();
796 fjz3 = _mm_setzero_pd();
798 /**************************
799 * CALCULATE INTERACTIONS *
800 **************************/
802 if (gmx_mm_any_lt(rsq00,rcutoff2))
805 r00 = _mm_mul_pd(rsq00,rinv00);
807 /* LENNARD-JONES DISPERSION/REPULSION */
809 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
810 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
811 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
812 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
813 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
815 d = _mm_sub_pd(r00,rswitch);
816 d = _mm_max_pd(d,_mm_setzero_pd());
817 d2 = _mm_mul_pd(d,d);
818 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
820 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
822 /* Evaluate switch function */
823 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
824 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
825 vvdw = _mm_mul_pd(vvdw,sw);
826 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
828 /* Update potential sum for this i atom from the interaction with this j atom. */
829 vvdw = _mm_and_pd(vvdw,cutoff_mask);
830 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
831 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
835 fscal = _mm_and_pd(fscal,cutoff_mask);
837 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
839 /* Calculate temporary vectorial force */
840 tx = _mm_mul_pd(fscal,dx00);
841 ty = _mm_mul_pd(fscal,dy00);
842 tz = _mm_mul_pd(fscal,dz00);
844 /* Update vectorial force */
845 fix0 = _mm_add_pd(fix0,tx);
846 fiy0 = _mm_add_pd(fiy0,ty);
847 fiz0 = _mm_add_pd(fiz0,tz);
849 fjx0 = _mm_add_pd(fjx0,tx);
850 fjy0 = _mm_add_pd(fjy0,ty);
851 fjz0 = _mm_add_pd(fjz0,tz);
855 /**************************
856 * CALCULATE INTERACTIONS *
857 **************************/
859 if (gmx_mm_any_lt(rsq11,rcutoff2))
862 /* REACTION-FIELD ELECTROSTATICS */
863 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_add_pd(rinv11,_mm_mul_pd(krf,rsq11)),crf));
864 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
866 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
868 /* Update potential sum for this i atom from the interaction with this j atom. */
869 velec = _mm_and_pd(velec,cutoff_mask);
870 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
871 velecsum = _mm_add_pd(velecsum,velec);
875 fscal = _mm_and_pd(fscal,cutoff_mask);
877 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
879 /* Calculate temporary vectorial force */
880 tx = _mm_mul_pd(fscal,dx11);
881 ty = _mm_mul_pd(fscal,dy11);
882 tz = _mm_mul_pd(fscal,dz11);
884 /* Update vectorial force */
885 fix1 = _mm_add_pd(fix1,tx);
886 fiy1 = _mm_add_pd(fiy1,ty);
887 fiz1 = _mm_add_pd(fiz1,tz);
889 fjx1 = _mm_add_pd(fjx1,tx);
890 fjy1 = _mm_add_pd(fjy1,ty);
891 fjz1 = _mm_add_pd(fjz1,tz);
895 /**************************
896 * CALCULATE INTERACTIONS *
897 **************************/
899 if (gmx_mm_any_lt(rsq12,rcutoff2))
902 /* REACTION-FIELD ELECTROSTATICS */
903 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_add_pd(rinv12,_mm_mul_pd(krf,rsq12)),crf));
904 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
906 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
908 /* Update potential sum for this i atom from the interaction with this j atom. */
909 velec = _mm_and_pd(velec,cutoff_mask);
910 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
911 velecsum = _mm_add_pd(velecsum,velec);
915 fscal = _mm_and_pd(fscal,cutoff_mask);
917 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
919 /* Calculate temporary vectorial force */
920 tx = _mm_mul_pd(fscal,dx12);
921 ty = _mm_mul_pd(fscal,dy12);
922 tz = _mm_mul_pd(fscal,dz12);
924 /* Update vectorial force */
925 fix1 = _mm_add_pd(fix1,tx);
926 fiy1 = _mm_add_pd(fiy1,ty);
927 fiz1 = _mm_add_pd(fiz1,tz);
929 fjx2 = _mm_add_pd(fjx2,tx);
930 fjy2 = _mm_add_pd(fjy2,ty);
931 fjz2 = _mm_add_pd(fjz2,tz);
935 /**************************
936 * CALCULATE INTERACTIONS *
937 **************************/
939 if (gmx_mm_any_lt(rsq13,rcutoff2))
942 /* REACTION-FIELD ELECTROSTATICS */
943 velec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_add_pd(rinv13,_mm_mul_pd(krf,rsq13)),crf));
944 felec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_mul_pd(rinv13,rinvsq13),krf2));
946 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
948 /* Update potential sum for this i atom from the interaction with this j atom. */
949 velec = _mm_and_pd(velec,cutoff_mask);
950 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
951 velecsum = _mm_add_pd(velecsum,velec);
955 fscal = _mm_and_pd(fscal,cutoff_mask);
957 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
959 /* Calculate temporary vectorial force */
960 tx = _mm_mul_pd(fscal,dx13);
961 ty = _mm_mul_pd(fscal,dy13);
962 tz = _mm_mul_pd(fscal,dz13);
964 /* Update vectorial force */
965 fix1 = _mm_add_pd(fix1,tx);
966 fiy1 = _mm_add_pd(fiy1,ty);
967 fiz1 = _mm_add_pd(fiz1,tz);
969 fjx3 = _mm_add_pd(fjx3,tx);
970 fjy3 = _mm_add_pd(fjy3,ty);
971 fjz3 = _mm_add_pd(fjz3,tz);
975 /**************************
976 * CALCULATE INTERACTIONS *
977 **************************/
979 if (gmx_mm_any_lt(rsq21,rcutoff2))
982 /* REACTION-FIELD ELECTROSTATICS */
983 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_add_pd(rinv21,_mm_mul_pd(krf,rsq21)),crf));
984 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
986 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
988 /* Update potential sum for this i atom from the interaction with this j atom. */
989 velec = _mm_and_pd(velec,cutoff_mask);
990 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
991 velecsum = _mm_add_pd(velecsum,velec);
995 fscal = _mm_and_pd(fscal,cutoff_mask);
997 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
999 /* Calculate temporary vectorial force */
1000 tx = _mm_mul_pd(fscal,dx21);
1001 ty = _mm_mul_pd(fscal,dy21);
1002 tz = _mm_mul_pd(fscal,dz21);
1004 /* Update vectorial force */
1005 fix2 = _mm_add_pd(fix2,tx);
1006 fiy2 = _mm_add_pd(fiy2,ty);
1007 fiz2 = _mm_add_pd(fiz2,tz);
1009 fjx1 = _mm_add_pd(fjx1,tx);
1010 fjy1 = _mm_add_pd(fjy1,ty);
1011 fjz1 = _mm_add_pd(fjz1,tz);
1015 /**************************
1016 * CALCULATE INTERACTIONS *
1017 **************************/
1019 if (gmx_mm_any_lt(rsq22,rcutoff2))
1022 /* REACTION-FIELD ELECTROSTATICS */
1023 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_add_pd(rinv22,_mm_mul_pd(krf,rsq22)),crf));
1024 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
1026 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1028 /* Update potential sum for this i atom from the interaction with this j atom. */
1029 velec = _mm_and_pd(velec,cutoff_mask);
1030 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1031 velecsum = _mm_add_pd(velecsum,velec);
1035 fscal = _mm_and_pd(fscal,cutoff_mask);
1037 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1039 /* Calculate temporary vectorial force */
1040 tx = _mm_mul_pd(fscal,dx22);
1041 ty = _mm_mul_pd(fscal,dy22);
1042 tz = _mm_mul_pd(fscal,dz22);
1044 /* Update vectorial force */
1045 fix2 = _mm_add_pd(fix2,tx);
1046 fiy2 = _mm_add_pd(fiy2,ty);
1047 fiz2 = _mm_add_pd(fiz2,tz);
1049 fjx2 = _mm_add_pd(fjx2,tx);
1050 fjy2 = _mm_add_pd(fjy2,ty);
1051 fjz2 = _mm_add_pd(fjz2,tz);
1055 /**************************
1056 * CALCULATE INTERACTIONS *
1057 **************************/
1059 if (gmx_mm_any_lt(rsq23,rcutoff2))
1062 /* REACTION-FIELD ELECTROSTATICS */
1063 velec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_add_pd(rinv23,_mm_mul_pd(krf,rsq23)),crf));
1064 felec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_mul_pd(rinv23,rinvsq23),krf2));
1066 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
1068 /* Update potential sum for this i atom from the interaction with this j atom. */
1069 velec = _mm_and_pd(velec,cutoff_mask);
1070 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1071 velecsum = _mm_add_pd(velecsum,velec);
1075 fscal = _mm_and_pd(fscal,cutoff_mask);
1077 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1079 /* Calculate temporary vectorial force */
1080 tx = _mm_mul_pd(fscal,dx23);
1081 ty = _mm_mul_pd(fscal,dy23);
1082 tz = _mm_mul_pd(fscal,dz23);
1084 /* Update vectorial force */
1085 fix2 = _mm_add_pd(fix2,tx);
1086 fiy2 = _mm_add_pd(fiy2,ty);
1087 fiz2 = _mm_add_pd(fiz2,tz);
1089 fjx3 = _mm_add_pd(fjx3,tx);
1090 fjy3 = _mm_add_pd(fjy3,ty);
1091 fjz3 = _mm_add_pd(fjz3,tz);
1095 /**************************
1096 * CALCULATE INTERACTIONS *
1097 **************************/
1099 if (gmx_mm_any_lt(rsq31,rcutoff2))
1102 /* REACTION-FIELD ELECTROSTATICS */
1103 velec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_add_pd(rinv31,_mm_mul_pd(krf,rsq31)),crf));
1104 felec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_mul_pd(rinv31,rinvsq31),krf2));
1106 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
1108 /* Update potential sum for this i atom from the interaction with this j atom. */
1109 velec = _mm_and_pd(velec,cutoff_mask);
1110 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1111 velecsum = _mm_add_pd(velecsum,velec);
1115 fscal = _mm_and_pd(fscal,cutoff_mask);
1117 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1119 /* Calculate temporary vectorial force */
1120 tx = _mm_mul_pd(fscal,dx31);
1121 ty = _mm_mul_pd(fscal,dy31);
1122 tz = _mm_mul_pd(fscal,dz31);
1124 /* Update vectorial force */
1125 fix3 = _mm_add_pd(fix3,tx);
1126 fiy3 = _mm_add_pd(fiy3,ty);
1127 fiz3 = _mm_add_pd(fiz3,tz);
1129 fjx1 = _mm_add_pd(fjx1,tx);
1130 fjy1 = _mm_add_pd(fjy1,ty);
1131 fjz1 = _mm_add_pd(fjz1,tz);
1135 /**************************
1136 * CALCULATE INTERACTIONS *
1137 **************************/
1139 if (gmx_mm_any_lt(rsq32,rcutoff2))
1142 /* REACTION-FIELD ELECTROSTATICS */
1143 velec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_add_pd(rinv32,_mm_mul_pd(krf,rsq32)),crf));
1144 felec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_mul_pd(rinv32,rinvsq32),krf2));
1146 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
1148 /* Update potential sum for this i atom from the interaction with this j atom. */
1149 velec = _mm_and_pd(velec,cutoff_mask);
1150 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1151 velecsum = _mm_add_pd(velecsum,velec);
1155 fscal = _mm_and_pd(fscal,cutoff_mask);
1157 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1159 /* Calculate temporary vectorial force */
1160 tx = _mm_mul_pd(fscal,dx32);
1161 ty = _mm_mul_pd(fscal,dy32);
1162 tz = _mm_mul_pd(fscal,dz32);
1164 /* Update vectorial force */
1165 fix3 = _mm_add_pd(fix3,tx);
1166 fiy3 = _mm_add_pd(fiy3,ty);
1167 fiz3 = _mm_add_pd(fiz3,tz);
1169 fjx2 = _mm_add_pd(fjx2,tx);
1170 fjy2 = _mm_add_pd(fjy2,ty);
1171 fjz2 = _mm_add_pd(fjz2,tz);
1175 /**************************
1176 * CALCULATE INTERACTIONS *
1177 **************************/
1179 if (gmx_mm_any_lt(rsq33,rcutoff2))
1182 /* REACTION-FIELD ELECTROSTATICS */
1183 velec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_add_pd(rinv33,_mm_mul_pd(krf,rsq33)),crf));
1184 felec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_mul_pd(rinv33,rinvsq33),krf2));
1186 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
1188 /* Update potential sum for this i atom from the interaction with this j atom. */
1189 velec = _mm_and_pd(velec,cutoff_mask);
1190 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1191 velecsum = _mm_add_pd(velecsum,velec);
1195 fscal = _mm_and_pd(fscal,cutoff_mask);
1197 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1199 /* Calculate temporary vectorial force */
1200 tx = _mm_mul_pd(fscal,dx33);
1201 ty = _mm_mul_pd(fscal,dy33);
1202 tz = _mm_mul_pd(fscal,dz33);
1204 /* Update vectorial force */
1205 fix3 = _mm_add_pd(fix3,tx);
1206 fiy3 = _mm_add_pd(fiy3,ty);
1207 fiz3 = _mm_add_pd(fiz3,tz);
1209 fjx3 = _mm_add_pd(fjx3,tx);
1210 fjy3 = _mm_add_pd(fjy3,ty);
1211 fjz3 = _mm_add_pd(fjz3,tz);
1215 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1217 /* Inner loop uses 386 flops */
1220 /* End of innermost loop */
1222 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1223 f+i_coord_offset,fshift+i_shift_offset);
1226 /* Update potential energies */
1227 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1228 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1230 /* Increment number of inner iterations */
1231 inneriter += j_index_end - j_index_start;
1233 /* Outer loop uses 26 flops */
1236 /* Increment number of outer iterations */
1239 /* Update outer/inner flops */
1241 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*386);
1244 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse2_double
1245 * Electrostatics interaction: ReactionField
1246 * VdW interaction: LennardJones
1247 * Geometry: Water4-Water4
1248 * Calculate force/pot: Force
1251 nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse2_double
1252 (t_nblist * gmx_restrict nlist,
1253 rvec * gmx_restrict xx,
1254 rvec * gmx_restrict ff,
1255 struct t_forcerec * gmx_restrict fr,
1256 t_mdatoms * gmx_restrict mdatoms,
1257 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1258 t_nrnb * gmx_restrict nrnb)
1260 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1261 * just 0 for non-waters.
1262 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1263 * jnr indices corresponding to data put in the four positions in the SIMD register.
1265 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1266 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1268 int j_coord_offsetA,j_coord_offsetB;
1269 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1270 real rcutoff_scalar;
1271 real *shiftvec,*fshift,*x,*f;
1272 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1274 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1276 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1278 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1280 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1281 int vdwjidx0A,vdwjidx0B;
1282 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1283 int vdwjidx1A,vdwjidx1B;
1284 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1285 int vdwjidx2A,vdwjidx2B;
1286 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1287 int vdwjidx3A,vdwjidx3B;
1288 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1289 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1290 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1291 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1292 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1293 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1294 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1295 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1296 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1297 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1298 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1299 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1302 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1305 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1306 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1307 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1308 real rswitch_scalar,d_scalar;
1309 __m128d dummy_mask,cutoff_mask;
1310 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1311 __m128d one = _mm_set1_pd(1.0);
1312 __m128d two = _mm_set1_pd(2.0);
1318 jindex = nlist->jindex;
1320 shiftidx = nlist->shift;
1322 shiftvec = fr->shift_vec[0];
1323 fshift = fr->fshift[0];
1324 facel = _mm_set1_pd(fr->ic->epsfac);
1325 charge = mdatoms->chargeA;
1326 krf = _mm_set1_pd(fr->ic->k_rf);
1327 krf2 = _mm_set1_pd(fr->ic->k_rf*2.0);
1328 crf = _mm_set1_pd(fr->ic->c_rf);
1329 nvdwtype = fr->ntype;
1330 vdwparam = fr->nbfp;
1331 vdwtype = mdatoms->typeA;
1333 /* Setup water-specific parameters */
1334 inr = nlist->iinr[0];
1335 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1336 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1337 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1338 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1340 jq1 = _mm_set1_pd(charge[inr+1]);
1341 jq2 = _mm_set1_pd(charge[inr+2]);
1342 jq3 = _mm_set1_pd(charge[inr+3]);
1343 vdwjidx0A = 2*vdwtype[inr+0];
1344 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1345 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1346 qq11 = _mm_mul_pd(iq1,jq1);
1347 qq12 = _mm_mul_pd(iq1,jq2);
1348 qq13 = _mm_mul_pd(iq1,jq3);
1349 qq21 = _mm_mul_pd(iq2,jq1);
1350 qq22 = _mm_mul_pd(iq2,jq2);
1351 qq23 = _mm_mul_pd(iq2,jq3);
1352 qq31 = _mm_mul_pd(iq3,jq1);
1353 qq32 = _mm_mul_pd(iq3,jq2);
1354 qq33 = _mm_mul_pd(iq3,jq3);
1356 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1357 rcutoff_scalar = fr->ic->rcoulomb;
1358 rcutoff = _mm_set1_pd(rcutoff_scalar);
1359 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
1361 rswitch_scalar = fr->ic->rvdw_switch;
1362 rswitch = _mm_set1_pd(rswitch_scalar);
1363 /* Setup switch parameters */
1364 d_scalar = rcutoff_scalar-rswitch_scalar;
1365 d = _mm_set1_pd(d_scalar);
1366 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1367 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1368 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1369 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1370 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1371 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1373 /* Avoid stupid compiler warnings */
1375 j_coord_offsetA = 0;
1376 j_coord_offsetB = 0;
1381 /* Start outer loop over neighborlists */
1382 for(iidx=0; iidx<nri; iidx++)
1384 /* Load shift vector for this list */
1385 i_shift_offset = DIM*shiftidx[iidx];
1387 /* Load limits for loop over neighbors */
1388 j_index_start = jindex[iidx];
1389 j_index_end = jindex[iidx+1];
1391 /* Get outer coordinate index */
1393 i_coord_offset = DIM*inr;
1395 /* Load i particle coords and add shift vector */
1396 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1397 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1399 fix0 = _mm_setzero_pd();
1400 fiy0 = _mm_setzero_pd();
1401 fiz0 = _mm_setzero_pd();
1402 fix1 = _mm_setzero_pd();
1403 fiy1 = _mm_setzero_pd();
1404 fiz1 = _mm_setzero_pd();
1405 fix2 = _mm_setzero_pd();
1406 fiy2 = _mm_setzero_pd();
1407 fiz2 = _mm_setzero_pd();
1408 fix3 = _mm_setzero_pd();
1409 fiy3 = _mm_setzero_pd();
1410 fiz3 = _mm_setzero_pd();
1412 /* Start inner kernel loop */
1413 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1416 /* Get j neighbor index, and coordinate index */
1418 jnrB = jjnr[jidx+1];
1419 j_coord_offsetA = DIM*jnrA;
1420 j_coord_offsetB = DIM*jnrB;
1422 /* load j atom coordinates */
1423 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1424 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1425 &jy2,&jz2,&jx3,&jy3,&jz3);
1427 /* Calculate displacement vector */
1428 dx00 = _mm_sub_pd(ix0,jx0);
1429 dy00 = _mm_sub_pd(iy0,jy0);
1430 dz00 = _mm_sub_pd(iz0,jz0);
1431 dx11 = _mm_sub_pd(ix1,jx1);
1432 dy11 = _mm_sub_pd(iy1,jy1);
1433 dz11 = _mm_sub_pd(iz1,jz1);
1434 dx12 = _mm_sub_pd(ix1,jx2);
1435 dy12 = _mm_sub_pd(iy1,jy2);
1436 dz12 = _mm_sub_pd(iz1,jz2);
1437 dx13 = _mm_sub_pd(ix1,jx3);
1438 dy13 = _mm_sub_pd(iy1,jy3);
1439 dz13 = _mm_sub_pd(iz1,jz3);
1440 dx21 = _mm_sub_pd(ix2,jx1);
1441 dy21 = _mm_sub_pd(iy2,jy1);
1442 dz21 = _mm_sub_pd(iz2,jz1);
1443 dx22 = _mm_sub_pd(ix2,jx2);
1444 dy22 = _mm_sub_pd(iy2,jy2);
1445 dz22 = _mm_sub_pd(iz2,jz2);
1446 dx23 = _mm_sub_pd(ix2,jx3);
1447 dy23 = _mm_sub_pd(iy2,jy3);
1448 dz23 = _mm_sub_pd(iz2,jz3);
1449 dx31 = _mm_sub_pd(ix3,jx1);
1450 dy31 = _mm_sub_pd(iy3,jy1);
1451 dz31 = _mm_sub_pd(iz3,jz1);
1452 dx32 = _mm_sub_pd(ix3,jx2);
1453 dy32 = _mm_sub_pd(iy3,jy2);
1454 dz32 = _mm_sub_pd(iz3,jz2);
1455 dx33 = _mm_sub_pd(ix3,jx3);
1456 dy33 = _mm_sub_pd(iy3,jy3);
1457 dz33 = _mm_sub_pd(iz3,jz3);
1459 /* Calculate squared distance and things based on it */
1460 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1461 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1462 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1463 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1464 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1465 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1466 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1467 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1468 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1469 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1471 rinv00 = sse2_invsqrt_d(rsq00);
1472 rinv11 = sse2_invsqrt_d(rsq11);
1473 rinv12 = sse2_invsqrt_d(rsq12);
1474 rinv13 = sse2_invsqrt_d(rsq13);
1475 rinv21 = sse2_invsqrt_d(rsq21);
1476 rinv22 = sse2_invsqrt_d(rsq22);
1477 rinv23 = sse2_invsqrt_d(rsq23);
1478 rinv31 = sse2_invsqrt_d(rsq31);
1479 rinv32 = sse2_invsqrt_d(rsq32);
1480 rinv33 = sse2_invsqrt_d(rsq33);
1482 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1483 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1484 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1485 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
1486 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1487 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1488 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
1489 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
1490 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
1491 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
1493 fjx0 = _mm_setzero_pd();
1494 fjy0 = _mm_setzero_pd();
1495 fjz0 = _mm_setzero_pd();
1496 fjx1 = _mm_setzero_pd();
1497 fjy1 = _mm_setzero_pd();
1498 fjz1 = _mm_setzero_pd();
1499 fjx2 = _mm_setzero_pd();
1500 fjy2 = _mm_setzero_pd();
1501 fjz2 = _mm_setzero_pd();
1502 fjx3 = _mm_setzero_pd();
1503 fjy3 = _mm_setzero_pd();
1504 fjz3 = _mm_setzero_pd();
1506 /**************************
1507 * CALCULATE INTERACTIONS *
1508 **************************/
1510 if (gmx_mm_any_lt(rsq00,rcutoff2))
1513 r00 = _mm_mul_pd(rsq00,rinv00);
1515 /* LENNARD-JONES DISPERSION/REPULSION */
1517 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1518 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
1519 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
1520 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
1521 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
1523 d = _mm_sub_pd(r00,rswitch);
1524 d = _mm_max_pd(d,_mm_setzero_pd());
1525 d2 = _mm_mul_pd(d,d);
1526 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1528 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1530 /* Evaluate switch function */
1531 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1532 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
1533 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1537 fscal = _mm_and_pd(fscal,cutoff_mask);
1539 /* Calculate temporary vectorial force */
1540 tx = _mm_mul_pd(fscal,dx00);
1541 ty = _mm_mul_pd(fscal,dy00);
1542 tz = _mm_mul_pd(fscal,dz00);
1544 /* Update vectorial force */
1545 fix0 = _mm_add_pd(fix0,tx);
1546 fiy0 = _mm_add_pd(fiy0,ty);
1547 fiz0 = _mm_add_pd(fiz0,tz);
1549 fjx0 = _mm_add_pd(fjx0,tx);
1550 fjy0 = _mm_add_pd(fjy0,ty);
1551 fjz0 = _mm_add_pd(fjz0,tz);
1555 /**************************
1556 * CALCULATE INTERACTIONS *
1557 **************************/
1559 if (gmx_mm_any_lt(rsq11,rcutoff2))
1562 /* REACTION-FIELD ELECTROSTATICS */
1563 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
1565 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1569 fscal = _mm_and_pd(fscal,cutoff_mask);
1571 /* Calculate temporary vectorial force */
1572 tx = _mm_mul_pd(fscal,dx11);
1573 ty = _mm_mul_pd(fscal,dy11);
1574 tz = _mm_mul_pd(fscal,dz11);
1576 /* Update vectorial force */
1577 fix1 = _mm_add_pd(fix1,tx);
1578 fiy1 = _mm_add_pd(fiy1,ty);
1579 fiz1 = _mm_add_pd(fiz1,tz);
1581 fjx1 = _mm_add_pd(fjx1,tx);
1582 fjy1 = _mm_add_pd(fjy1,ty);
1583 fjz1 = _mm_add_pd(fjz1,tz);
1587 /**************************
1588 * CALCULATE INTERACTIONS *
1589 **************************/
1591 if (gmx_mm_any_lt(rsq12,rcutoff2))
1594 /* REACTION-FIELD ELECTROSTATICS */
1595 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
1597 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1601 fscal = _mm_and_pd(fscal,cutoff_mask);
1603 /* Calculate temporary vectorial force */
1604 tx = _mm_mul_pd(fscal,dx12);
1605 ty = _mm_mul_pd(fscal,dy12);
1606 tz = _mm_mul_pd(fscal,dz12);
1608 /* Update vectorial force */
1609 fix1 = _mm_add_pd(fix1,tx);
1610 fiy1 = _mm_add_pd(fiy1,ty);
1611 fiz1 = _mm_add_pd(fiz1,tz);
1613 fjx2 = _mm_add_pd(fjx2,tx);
1614 fjy2 = _mm_add_pd(fjy2,ty);
1615 fjz2 = _mm_add_pd(fjz2,tz);
1619 /**************************
1620 * CALCULATE INTERACTIONS *
1621 **************************/
1623 if (gmx_mm_any_lt(rsq13,rcutoff2))
1626 /* REACTION-FIELD ELECTROSTATICS */
1627 felec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_mul_pd(rinv13,rinvsq13),krf2));
1629 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
1633 fscal = _mm_and_pd(fscal,cutoff_mask);
1635 /* Calculate temporary vectorial force */
1636 tx = _mm_mul_pd(fscal,dx13);
1637 ty = _mm_mul_pd(fscal,dy13);
1638 tz = _mm_mul_pd(fscal,dz13);
1640 /* Update vectorial force */
1641 fix1 = _mm_add_pd(fix1,tx);
1642 fiy1 = _mm_add_pd(fiy1,ty);
1643 fiz1 = _mm_add_pd(fiz1,tz);
1645 fjx3 = _mm_add_pd(fjx3,tx);
1646 fjy3 = _mm_add_pd(fjy3,ty);
1647 fjz3 = _mm_add_pd(fjz3,tz);
1651 /**************************
1652 * CALCULATE INTERACTIONS *
1653 **************************/
1655 if (gmx_mm_any_lt(rsq21,rcutoff2))
1658 /* REACTION-FIELD ELECTROSTATICS */
1659 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
1661 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1665 fscal = _mm_and_pd(fscal,cutoff_mask);
1667 /* Calculate temporary vectorial force */
1668 tx = _mm_mul_pd(fscal,dx21);
1669 ty = _mm_mul_pd(fscal,dy21);
1670 tz = _mm_mul_pd(fscal,dz21);
1672 /* Update vectorial force */
1673 fix2 = _mm_add_pd(fix2,tx);
1674 fiy2 = _mm_add_pd(fiy2,ty);
1675 fiz2 = _mm_add_pd(fiz2,tz);
1677 fjx1 = _mm_add_pd(fjx1,tx);
1678 fjy1 = _mm_add_pd(fjy1,ty);
1679 fjz1 = _mm_add_pd(fjz1,tz);
1683 /**************************
1684 * CALCULATE INTERACTIONS *
1685 **************************/
1687 if (gmx_mm_any_lt(rsq22,rcutoff2))
1690 /* REACTION-FIELD ELECTROSTATICS */
1691 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
1693 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1697 fscal = _mm_and_pd(fscal,cutoff_mask);
1699 /* Calculate temporary vectorial force */
1700 tx = _mm_mul_pd(fscal,dx22);
1701 ty = _mm_mul_pd(fscal,dy22);
1702 tz = _mm_mul_pd(fscal,dz22);
1704 /* Update vectorial force */
1705 fix2 = _mm_add_pd(fix2,tx);
1706 fiy2 = _mm_add_pd(fiy2,ty);
1707 fiz2 = _mm_add_pd(fiz2,tz);
1709 fjx2 = _mm_add_pd(fjx2,tx);
1710 fjy2 = _mm_add_pd(fjy2,ty);
1711 fjz2 = _mm_add_pd(fjz2,tz);
1715 /**************************
1716 * CALCULATE INTERACTIONS *
1717 **************************/
1719 if (gmx_mm_any_lt(rsq23,rcutoff2))
1722 /* REACTION-FIELD ELECTROSTATICS */
1723 felec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_mul_pd(rinv23,rinvsq23),krf2));
1725 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
1729 fscal = _mm_and_pd(fscal,cutoff_mask);
1731 /* Calculate temporary vectorial force */
1732 tx = _mm_mul_pd(fscal,dx23);
1733 ty = _mm_mul_pd(fscal,dy23);
1734 tz = _mm_mul_pd(fscal,dz23);
1736 /* Update vectorial force */
1737 fix2 = _mm_add_pd(fix2,tx);
1738 fiy2 = _mm_add_pd(fiy2,ty);
1739 fiz2 = _mm_add_pd(fiz2,tz);
1741 fjx3 = _mm_add_pd(fjx3,tx);
1742 fjy3 = _mm_add_pd(fjy3,ty);
1743 fjz3 = _mm_add_pd(fjz3,tz);
1747 /**************************
1748 * CALCULATE INTERACTIONS *
1749 **************************/
1751 if (gmx_mm_any_lt(rsq31,rcutoff2))
1754 /* REACTION-FIELD ELECTROSTATICS */
1755 felec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_mul_pd(rinv31,rinvsq31),krf2));
1757 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
1761 fscal = _mm_and_pd(fscal,cutoff_mask);
1763 /* Calculate temporary vectorial force */
1764 tx = _mm_mul_pd(fscal,dx31);
1765 ty = _mm_mul_pd(fscal,dy31);
1766 tz = _mm_mul_pd(fscal,dz31);
1768 /* Update vectorial force */
1769 fix3 = _mm_add_pd(fix3,tx);
1770 fiy3 = _mm_add_pd(fiy3,ty);
1771 fiz3 = _mm_add_pd(fiz3,tz);
1773 fjx1 = _mm_add_pd(fjx1,tx);
1774 fjy1 = _mm_add_pd(fjy1,ty);
1775 fjz1 = _mm_add_pd(fjz1,tz);
1779 /**************************
1780 * CALCULATE INTERACTIONS *
1781 **************************/
1783 if (gmx_mm_any_lt(rsq32,rcutoff2))
1786 /* REACTION-FIELD ELECTROSTATICS */
1787 felec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_mul_pd(rinv32,rinvsq32),krf2));
1789 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
1793 fscal = _mm_and_pd(fscal,cutoff_mask);
1795 /* Calculate temporary vectorial force */
1796 tx = _mm_mul_pd(fscal,dx32);
1797 ty = _mm_mul_pd(fscal,dy32);
1798 tz = _mm_mul_pd(fscal,dz32);
1800 /* Update vectorial force */
1801 fix3 = _mm_add_pd(fix3,tx);
1802 fiy3 = _mm_add_pd(fiy3,ty);
1803 fiz3 = _mm_add_pd(fiz3,tz);
1805 fjx2 = _mm_add_pd(fjx2,tx);
1806 fjy2 = _mm_add_pd(fjy2,ty);
1807 fjz2 = _mm_add_pd(fjz2,tz);
1811 /**************************
1812 * CALCULATE INTERACTIONS *
1813 **************************/
1815 if (gmx_mm_any_lt(rsq33,rcutoff2))
1818 /* REACTION-FIELD ELECTROSTATICS */
1819 felec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_mul_pd(rinv33,rinvsq33),krf2));
1821 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
1825 fscal = _mm_and_pd(fscal,cutoff_mask);
1827 /* Calculate temporary vectorial force */
1828 tx = _mm_mul_pd(fscal,dx33);
1829 ty = _mm_mul_pd(fscal,dy33);
1830 tz = _mm_mul_pd(fscal,dz33);
1832 /* Update vectorial force */
1833 fix3 = _mm_add_pd(fix3,tx);
1834 fiy3 = _mm_add_pd(fiy3,ty);
1835 fiz3 = _mm_add_pd(fiz3,tz);
1837 fjx3 = _mm_add_pd(fjx3,tx);
1838 fjy3 = _mm_add_pd(fjy3,ty);
1839 fjz3 = _mm_add_pd(fjz3,tz);
1843 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1845 /* Inner loop uses 329 flops */
1848 if(jidx<j_index_end)
1852 j_coord_offsetA = DIM*jnrA;
1854 /* load j atom coordinates */
1855 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1856 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1857 &jy2,&jz2,&jx3,&jy3,&jz3);
1859 /* Calculate displacement vector */
1860 dx00 = _mm_sub_pd(ix0,jx0);
1861 dy00 = _mm_sub_pd(iy0,jy0);
1862 dz00 = _mm_sub_pd(iz0,jz0);
1863 dx11 = _mm_sub_pd(ix1,jx1);
1864 dy11 = _mm_sub_pd(iy1,jy1);
1865 dz11 = _mm_sub_pd(iz1,jz1);
1866 dx12 = _mm_sub_pd(ix1,jx2);
1867 dy12 = _mm_sub_pd(iy1,jy2);
1868 dz12 = _mm_sub_pd(iz1,jz2);
1869 dx13 = _mm_sub_pd(ix1,jx3);
1870 dy13 = _mm_sub_pd(iy1,jy3);
1871 dz13 = _mm_sub_pd(iz1,jz3);
1872 dx21 = _mm_sub_pd(ix2,jx1);
1873 dy21 = _mm_sub_pd(iy2,jy1);
1874 dz21 = _mm_sub_pd(iz2,jz1);
1875 dx22 = _mm_sub_pd(ix2,jx2);
1876 dy22 = _mm_sub_pd(iy2,jy2);
1877 dz22 = _mm_sub_pd(iz2,jz2);
1878 dx23 = _mm_sub_pd(ix2,jx3);
1879 dy23 = _mm_sub_pd(iy2,jy3);
1880 dz23 = _mm_sub_pd(iz2,jz3);
1881 dx31 = _mm_sub_pd(ix3,jx1);
1882 dy31 = _mm_sub_pd(iy3,jy1);
1883 dz31 = _mm_sub_pd(iz3,jz1);
1884 dx32 = _mm_sub_pd(ix3,jx2);
1885 dy32 = _mm_sub_pd(iy3,jy2);
1886 dz32 = _mm_sub_pd(iz3,jz2);
1887 dx33 = _mm_sub_pd(ix3,jx3);
1888 dy33 = _mm_sub_pd(iy3,jy3);
1889 dz33 = _mm_sub_pd(iz3,jz3);
1891 /* Calculate squared distance and things based on it */
1892 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1893 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1894 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1895 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1896 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1897 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1898 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1899 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1900 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1901 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1903 rinv00 = sse2_invsqrt_d(rsq00);
1904 rinv11 = sse2_invsqrt_d(rsq11);
1905 rinv12 = sse2_invsqrt_d(rsq12);
1906 rinv13 = sse2_invsqrt_d(rsq13);
1907 rinv21 = sse2_invsqrt_d(rsq21);
1908 rinv22 = sse2_invsqrt_d(rsq22);
1909 rinv23 = sse2_invsqrt_d(rsq23);
1910 rinv31 = sse2_invsqrt_d(rsq31);
1911 rinv32 = sse2_invsqrt_d(rsq32);
1912 rinv33 = sse2_invsqrt_d(rsq33);
1914 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1915 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1916 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1917 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
1918 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1919 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1920 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
1921 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
1922 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
1923 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
1925 fjx0 = _mm_setzero_pd();
1926 fjy0 = _mm_setzero_pd();
1927 fjz0 = _mm_setzero_pd();
1928 fjx1 = _mm_setzero_pd();
1929 fjy1 = _mm_setzero_pd();
1930 fjz1 = _mm_setzero_pd();
1931 fjx2 = _mm_setzero_pd();
1932 fjy2 = _mm_setzero_pd();
1933 fjz2 = _mm_setzero_pd();
1934 fjx3 = _mm_setzero_pd();
1935 fjy3 = _mm_setzero_pd();
1936 fjz3 = _mm_setzero_pd();
1938 /**************************
1939 * CALCULATE INTERACTIONS *
1940 **************************/
1942 if (gmx_mm_any_lt(rsq00,rcutoff2))
1945 r00 = _mm_mul_pd(rsq00,rinv00);
1947 /* LENNARD-JONES DISPERSION/REPULSION */
1949 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1950 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
1951 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
1952 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
1953 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
1955 d = _mm_sub_pd(r00,rswitch);
1956 d = _mm_max_pd(d,_mm_setzero_pd());
1957 d2 = _mm_mul_pd(d,d);
1958 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1960 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1962 /* Evaluate switch function */
1963 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1964 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
1965 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1969 fscal = _mm_and_pd(fscal,cutoff_mask);
1971 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1973 /* Calculate temporary vectorial force */
1974 tx = _mm_mul_pd(fscal,dx00);
1975 ty = _mm_mul_pd(fscal,dy00);
1976 tz = _mm_mul_pd(fscal,dz00);
1978 /* Update vectorial force */
1979 fix0 = _mm_add_pd(fix0,tx);
1980 fiy0 = _mm_add_pd(fiy0,ty);
1981 fiz0 = _mm_add_pd(fiz0,tz);
1983 fjx0 = _mm_add_pd(fjx0,tx);
1984 fjy0 = _mm_add_pd(fjy0,ty);
1985 fjz0 = _mm_add_pd(fjz0,tz);
1989 /**************************
1990 * CALCULATE INTERACTIONS *
1991 **************************/
1993 if (gmx_mm_any_lt(rsq11,rcutoff2))
1996 /* REACTION-FIELD ELECTROSTATICS */
1997 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
1999 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
2003 fscal = _mm_and_pd(fscal,cutoff_mask);
2005 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2007 /* Calculate temporary vectorial force */
2008 tx = _mm_mul_pd(fscal,dx11);
2009 ty = _mm_mul_pd(fscal,dy11);
2010 tz = _mm_mul_pd(fscal,dz11);
2012 /* Update vectorial force */
2013 fix1 = _mm_add_pd(fix1,tx);
2014 fiy1 = _mm_add_pd(fiy1,ty);
2015 fiz1 = _mm_add_pd(fiz1,tz);
2017 fjx1 = _mm_add_pd(fjx1,tx);
2018 fjy1 = _mm_add_pd(fjy1,ty);
2019 fjz1 = _mm_add_pd(fjz1,tz);
2023 /**************************
2024 * CALCULATE INTERACTIONS *
2025 **************************/
2027 if (gmx_mm_any_lt(rsq12,rcutoff2))
2030 /* REACTION-FIELD ELECTROSTATICS */
2031 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
2033 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
2037 fscal = _mm_and_pd(fscal,cutoff_mask);
2039 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2041 /* Calculate temporary vectorial force */
2042 tx = _mm_mul_pd(fscal,dx12);
2043 ty = _mm_mul_pd(fscal,dy12);
2044 tz = _mm_mul_pd(fscal,dz12);
2046 /* Update vectorial force */
2047 fix1 = _mm_add_pd(fix1,tx);
2048 fiy1 = _mm_add_pd(fiy1,ty);
2049 fiz1 = _mm_add_pd(fiz1,tz);
2051 fjx2 = _mm_add_pd(fjx2,tx);
2052 fjy2 = _mm_add_pd(fjy2,ty);
2053 fjz2 = _mm_add_pd(fjz2,tz);
2057 /**************************
2058 * CALCULATE INTERACTIONS *
2059 **************************/
2061 if (gmx_mm_any_lt(rsq13,rcutoff2))
2064 /* REACTION-FIELD ELECTROSTATICS */
2065 felec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_mul_pd(rinv13,rinvsq13),krf2));
2067 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
2071 fscal = _mm_and_pd(fscal,cutoff_mask);
2073 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2075 /* Calculate temporary vectorial force */
2076 tx = _mm_mul_pd(fscal,dx13);
2077 ty = _mm_mul_pd(fscal,dy13);
2078 tz = _mm_mul_pd(fscal,dz13);
2080 /* Update vectorial force */
2081 fix1 = _mm_add_pd(fix1,tx);
2082 fiy1 = _mm_add_pd(fiy1,ty);
2083 fiz1 = _mm_add_pd(fiz1,tz);
2085 fjx3 = _mm_add_pd(fjx3,tx);
2086 fjy3 = _mm_add_pd(fjy3,ty);
2087 fjz3 = _mm_add_pd(fjz3,tz);
2091 /**************************
2092 * CALCULATE INTERACTIONS *
2093 **************************/
2095 if (gmx_mm_any_lt(rsq21,rcutoff2))
2098 /* REACTION-FIELD ELECTROSTATICS */
2099 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
2101 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2105 fscal = _mm_and_pd(fscal,cutoff_mask);
2107 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2109 /* Calculate temporary vectorial force */
2110 tx = _mm_mul_pd(fscal,dx21);
2111 ty = _mm_mul_pd(fscal,dy21);
2112 tz = _mm_mul_pd(fscal,dz21);
2114 /* Update vectorial force */
2115 fix2 = _mm_add_pd(fix2,tx);
2116 fiy2 = _mm_add_pd(fiy2,ty);
2117 fiz2 = _mm_add_pd(fiz2,tz);
2119 fjx1 = _mm_add_pd(fjx1,tx);
2120 fjy1 = _mm_add_pd(fjy1,ty);
2121 fjz1 = _mm_add_pd(fjz1,tz);
2125 /**************************
2126 * CALCULATE INTERACTIONS *
2127 **************************/
2129 if (gmx_mm_any_lt(rsq22,rcutoff2))
2132 /* REACTION-FIELD ELECTROSTATICS */
2133 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
2135 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
2139 fscal = _mm_and_pd(fscal,cutoff_mask);
2141 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2143 /* Calculate temporary vectorial force */
2144 tx = _mm_mul_pd(fscal,dx22);
2145 ty = _mm_mul_pd(fscal,dy22);
2146 tz = _mm_mul_pd(fscal,dz22);
2148 /* Update vectorial force */
2149 fix2 = _mm_add_pd(fix2,tx);
2150 fiy2 = _mm_add_pd(fiy2,ty);
2151 fiz2 = _mm_add_pd(fiz2,tz);
2153 fjx2 = _mm_add_pd(fjx2,tx);
2154 fjy2 = _mm_add_pd(fjy2,ty);
2155 fjz2 = _mm_add_pd(fjz2,tz);
2159 /**************************
2160 * CALCULATE INTERACTIONS *
2161 **************************/
2163 if (gmx_mm_any_lt(rsq23,rcutoff2))
2166 /* REACTION-FIELD ELECTROSTATICS */
2167 felec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_mul_pd(rinv23,rinvsq23),krf2));
2169 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
2173 fscal = _mm_and_pd(fscal,cutoff_mask);
2175 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2177 /* Calculate temporary vectorial force */
2178 tx = _mm_mul_pd(fscal,dx23);
2179 ty = _mm_mul_pd(fscal,dy23);
2180 tz = _mm_mul_pd(fscal,dz23);
2182 /* Update vectorial force */
2183 fix2 = _mm_add_pd(fix2,tx);
2184 fiy2 = _mm_add_pd(fiy2,ty);
2185 fiz2 = _mm_add_pd(fiz2,tz);
2187 fjx3 = _mm_add_pd(fjx3,tx);
2188 fjy3 = _mm_add_pd(fjy3,ty);
2189 fjz3 = _mm_add_pd(fjz3,tz);
2193 /**************************
2194 * CALCULATE INTERACTIONS *
2195 **************************/
2197 if (gmx_mm_any_lt(rsq31,rcutoff2))
2200 /* REACTION-FIELD ELECTROSTATICS */
2201 felec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_mul_pd(rinv31,rinvsq31),krf2));
2203 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
2207 fscal = _mm_and_pd(fscal,cutoff_mask);
2209 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2211 /* Calculate temporary vectorial force */
2212 tx = _mm_mul_pd(fscal,dx31);
2213 ty = _mm_mul_pd(fscal,dy31);
2214 tz = _mm_mul_pd(fscal,dz31);
2216 /* Update vectorial force */
2217 fix3 = _mm_add_pd(fix3,tx);
2218 fiy3 = _mm_add_pd(fiy3,ty);
2219 fiz3 = _mm_add_pd(fiz3,tz);
2221 fjx1 = _mm_add_pd(fjx1,tx);
2222 fjy1 = _mm_add_pd(fjy1,ty);
2223 fjz1 = _mm_add_pd(fjz1,tz);
2227 /**************************
2228 * CALCULATE INTERACTIONS *
2229 **************************/
2231 if (gmx_mm_any_lt(rsq32,rcutoff2))
2234 /* REACTION-FIELD ELECTROSTATICS */
2235 felec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_mul_pd(rinv32,rinvsq32),krf2));
2237 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
2241 fscal = _mm_and_pd(fscal,cutoff_mask);
2243 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2245 /* Calculate temporary vectorial force */
2246 tx = _mm_mul_pd(fscal,dx32);
2247 ty = _mm_mul_pd(fscal,dy32);
2248 tz = _mm_mul_pd(fscal,dz32);
2250 /* Update vectorial force */
2251 fix3 = _mm_add_pd(fix3,tx);
2252 fiy3 = _mm_add_pd(fiy3,ty);
2253 fiz3 = _mm_add_pd(fiz3,tz);
2255 fjx2 = _mm_add_pd(fjx2,tx);
2256 fjy2 = _mm_add_pd(fjy2,ty);
2257 fjz2 = _mm_add_pd(fjz2,tz);
2261 /**************************
2262 * CALCULATE INTERACTIONS *
2263 **************************/
2265 if (gmx_mm_any_lt(rsq33,rcutoff2))
2268 /* REACTION-FIELD ELECTROSTATICS */
2269 felec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_mul_pd(rinv33,rinvsq33),krf2));
2271 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
2275 fscal = _mm_and_pd(fscal,cutoff_mask);
2277 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2279 /* Calculate temporary vectorial force */
2280 tx = _mm_mul_pd(fscal,dx33);
2281 ty = _mm_mul_pd(fscal,dy33);
2282 tz = _mm_mul_pd(fscal,dz33);
2284 /* Update vectorial force */
2285 fix3 = _mm_add_pd(fix3,tx);
2286 fiy3 = _mm_add_pd(fiy3,ty);
2287 fiz3 = _mm_add_pd(fiz3,tz);
2289 fjx3 = _mm_add_pd(fjx3,tx);
2290 fjy3 = _mm_add_pd(fjy3,ty);
2291 fjz3 = _mm_add_pd(fjz3,tz);
2295 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2297 /* Inner loop uses 329 flops */
2300 /* End of innermost loop */
2302 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2303 f+i_coord_offset,fshift+i_shift_offset);
2305 /* Increment number of inner iterations */
2306 inneriter += j_index_end - j_index_start;
2308 /* Outer loop uses 24 flops */
2311 /* Increment number of outer iterations */
2314 /* Update outer/inner flops */
2316 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*329);