2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_double kernel generator.
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
47 #include "gromacs/simd/math_x86_sse2_double.h"
48 #include "kernelutil_x86_sse2_double.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse2_double
52 * Electrostatics interaction: ReactionField
53 * VdW interaction: LennardJones
54 * Geometry: Water4-Water4
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse2_double
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
75 int j_coord_offsetA,j_coord_offsetB;
76 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real *shiftvec,*fshift,*x,*f;
79 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
81 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
83 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
85 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
87 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
88 int vdwjidx0A,vdwjidx0B;
89 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90 int vdwjidx1A,vdwjidx1B;
91 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92 int vdwjidx2A,vdwjidx2B;
93 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94 int vdwjidx3A,vdwjidx3B;
95 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
96 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
97 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
98 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
99 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
100 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
101 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
102 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
103 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
104 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
105 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
106 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
109 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
112 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
113 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
114 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
115 real rswitch_scalar,d_scalar;
116 __m128d dummy_mask,cutoff_mask;
117 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
118 __m128d one = _mm_set1_pd(1.0);
119 __m128d two = _mm_set1_pd(2.0);
125 jindex = nlist->jindex;
127 shiftidx = nlist->shift;
129 shiftvec = fr->shift_vec[0];
130 fshift = fr->fshift[0];
131 facel = _mm_set1_pd(fr->epsfac);
132 charge = mdatoms->chargeA;
133 krf = _mm_set1_pd(fr->ic->k_rf);
134 krf2 = _mm_set1_pd(fr->ic->k_rf*2.0);
135 crf = _mm_set1_pd(fr->ic->c_rf);
136 nvdwtype = fr->ntype;
138 vdwtype = mdatoms->typeA;
140 /* Setup water-specific parameters */
141 inr = nlist->iinr[0];
142 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
143 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
144 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
145 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
147 jq1 = _mm_set1_pd(charge[inr+1]);
148 jq2 = _mm_set1_pd(charge[inr+2]);
149 jq3 = _mm_set1_pd(charge[inr+3]);
150 vdwjidx0A = 2*vdwtype[inr+0];
151 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
152 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
153 qq11 = _mm_mul_pd(iq1,jq1);
154 qq12 = _mm_mul_pd(iq1,jq2);
155 qq13 = _mm_mul_pd(iq1,jq3);
156 qq21 = _mm_mul_pd(iq2,jq1);
157 qq22 = _mm_mul_pd(iq2,jq2);
158 qq23 = _mm_mul_pd(iq2,jq3);
159 qq31 = _mm_mul_pd(iq3,jq1);
160 qq32 = _mm_mul_pd(iq3,jq2);
161 qq33 = _mm_mul_pd(iq3,jq3);
163 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
164 rcutoff_scalar = fr->rcoulomb;
165 rcutoff = _mm_set1_pd(rcutoff_scalar);
166 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
168 rswitch_scalar = fr->rvdw_switch;
169 rswitch = _mm_set1_pd(rswitch_scalar);
170 /* Setup switch parameters */
171 d_scalar = rcutoff_scalar-rswitch_scalar;
172 d = _mm_set1_pd(d_scalar);
173 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
174 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
175 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
176 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
177 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
178 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
180 /* Avoid stupid compiler warnings */
188 /* Start outer loop over neighborlists */
189 for(iidx=0; iidx<nri; iidx++)
191 /* Load shift vector for this list */
192 i_shift_offset = DIM*shiftidx[iidx];
194 /* Load limits for loop over neighbors */
195 j_index_start = jindex[iidx];
196 j_index_end = jindex[iidx+1];
198 /* Get outer coordinate index */
200 i_coord_offset = DIM*inr;
202 /* Load i particle coords and add shift vector */
203 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
204 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
206 fix0 = _mm_setzero_pd();
207 fiy0 = _mm_setzero_pd();
208 fiz0 = _mm_setzero_pd();
209 fix1 = _mm_setzero_pd();
210 fiy1 = _mm_setzero_pd();
211 fiz1 = _mm_setzero_pd();
212 fix2 = _mm_setzero_pd();
213 fiy2 = _mm_setzero_pd();
214 fiz2 = _mm_setzero_pd();
215 fix3 = _mm_setzero_pd();
216 fiy3 = _mm_setzero_pd();
217 fiz3 = _mm_setzero_pd();
219 /* Reset potential sums */
220 velecsum = _mm_setzero_pd();
221 vvdwsum = _mm_setzero_pd();
223 /* Start inner kernel loop */
224 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
227 /* Get j neighbor index, and coordinate index */
230 j_coord_offsetA = DIM*jnrA;
231 j_coord_offsetB = DIM*jnrB;
233 /* load j atom coordinates */
234 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
235 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
236 &jy2,&jz2,&jx3,&jy3,&jz3);
238 /* Calculate displacement vector */
239 dx00 = _mm_sub_pd(ix0,jx0);
240 dy00 = _mm_sub_pd(iy0,jy0);
241 dz00 = _mm_sub_pd(iz0,jz0);
242 dx11 = _mm_sub_pd(ix1,jx1);
243 dy11 = _mm_sub_pd(iy1,jy1);
244 dz11 = _mm_sub_pd(iz1,jz1);
245 dx12 = _mm_sub_pd(ix1,jx2);
246 dy12 = _mm_sub_pd(iy1,jy2);
247 dz12 = _mm_sub_pd(iz1,jz2);
248 dx13 = _mm_sub_pd(ix1,jx3);
249 dy13 = _mm_sub_pd(iy1,jy3);
250 dz13 = _mm_sub_pd(iz1,jz3);
251 dx21 = _mm_sub_pd(ix2,jx1);
252 dy21 = _mm_sub_pd(iy2,jy1);
253 dz21 = _mm_sub_pd(iz2,jz1);
254 dx22 = _mm_sub_pd(ix2,jx2);
255 dy22 = _mm_sub_pd(iy2,jy2);
256 dz22 = _mm_sub_pd(iz2,jz2);
257 dx23 = _mm_sub_pd(ix2,jx3);
258 dy23 = _mm_sub_pd(iy2,jy3);
259 dz23 = _mm_sub_pd(iz2,jz3);
260 dx31 = _mm_sub_pd(ix3,jx1);
261 dy31 = _mm_sub_pd(iy3,jy1);
262 dz31 = _mm_sub_pd(iz3,jz1);
263 dx32 = _mm_sub_pd(ix3,jx2);
264 dy32 = _mm_sub_pd(iy3,jy2);
265 dz32 = _mm_sub_pd(iz3,jz2);
266 dx33 = _mm_sub_pd(ix3,jx3);
267 dy33 = _mm_sub_pd(iy3,jy3);
268 dz33 = _mm_sub_pd(iz3,jz3);
270 /* Calculate squared distance and things based on it */
271 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
272 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
273 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
274 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
275 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
276 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
277 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
278 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
279 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
280 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
282 rinv00 = gmx_mm_invsqrt_pd(rsq00);
283 rinv11 = gmx_mm_invsqrt_pd(rsq11);
284 rinv12 = gmx_mm_invsqrt_pd(rsq12);
285 rinv13 = gmx_mm_invsqrt_pd(rsq13);
286 rinv21 = gmx_mm_invsqrt_pd(rsq21);
287 rinv22 = gmx_mm_invsqrt_pd(rsq22);
288 rinv23 = gmx_mm_invsqrt_pd(rsq23);
289 rinv31 = gmx_mm_invsqrt_pd(rsq31);
290 rinv32 = gmx_mm_invsqrt_pd(rsq32);
291 rinv33 = gmx_mm_invsqrt_pd(rsq33);
293 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
294 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
295 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
296 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
297 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
298 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
299 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
300 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
301 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
302 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
304 fjx0 = _mm_setzero_pd();
305 fjy0 = _mm_setzero_pd();
306 fjz0 = _mm_setzero_pd();
307 fjx1 = _mm_setzero_pd();
308 fjy1 = _mm_setzero_pd();
309 fjz1 = _mm_setzero_pd();
310 fjx2 = _mm_setzero_pd();
311 fjy2 = _mm_setzero_pd();
312 fjz2 = _mm_setzero_pd();
313 fjx3 = _mm_setzero_pd();
314 fjy3 = _mm_setzero_pd();
315 fjz3 = _mm_setzero_pd();
317 /**************************
318 * CALCULATE INTERACTIONS *
319 **************************/
321 if (gmx_mm_any_lt(rsq00,rcutoff2))
324 r00 = _mm_mul_pd(rsq00,rinv00);
326 /* LENNARD-JONES DISPERSION/REPULSION */
328 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
329 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
330 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
331 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
332 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
334 d = _mm_sub_pd(r00,rswitch);
335 d = _mm_max_pd(d,_mm_setzero_pd());
336 d2 = _mm_mul_pd(d,d);
337 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
339 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
341 /* Evaluate switch function */
342 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
343 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
344 vvdw = _mm_mul_pd(vvdw,sw);
345 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
347 /* Update potential sum for this i atom from the interaction with this j atom. */
348 vvdw = _mm_and_pd(vvdw,cutoff_mask);
349 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
353 fscal = _mm_and_pd(fscal,cutoff_mask);
355 /* Calculate temporary vectorial force */
356 tx = _mm_mul_pd(fscal,dx00);
357 ty = _mm_mul_pd(fscal,dy00);
358 tz = _mm_mul_pd(fscal,dz00);
360 /* Update vectorial force */
361 fix0 = _mm_add_pd(fix0,tx);
362 fiy0 = _mm_add_pd(fiy0,ty);
363 fiz0 = _mm_add_pd(fiz0,tz);
365 fjx0 = _mm_add_pd(fjx0,tx);
366 fjy0 = _mm_add_pd(fjy0,ty);
367 fjz0 = _mm_add_pd(fjz0,tz);
371 /**************************
372 * CALCULATE INTERACTIONS *
373 **************************/
375 if (gmx_mm_any_lt(rsq11,rcutoff2))
378 /* REACTION-FIELD ELECTROSTATICS */
379 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_add_pd(rinv11,_mm_mul_pd(krf,rsq11)),crf));
380 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
382 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
384 /* Update potential sum for this i atom from the interaction with this j atom. */
385 velec = _mm_and_pd(velec,cutoff_mask);
386 velecsum = _mm_add_pd(velecsum,velec);
390 fscal = _mm_and_pd(fscal,cutoff_mask);
392 /* Calculate temporary vectorial force */
393 tx = _mm_mul_pd(fscal,dx11);
394 ty = _mm_mul_pd(fscal,dy11);
395 tz = _mm_mul_pd(fscal,dz11);
397 /* Update vectorial force */
398 fix1 = _mm_add_pd(fix1,tx);
399 fiy1 = _mm_add_pd(fiy1,ty);
400 fiz1 = _mm_add_pd(fiz1,tz);
402 fjx1 = _mm_add_pd(fjx1,tx);
403 fjy1 = _mm_add_pd(fjy1,ty);
404 fjz1 = _mm_add_pd(fjz1,tz);
408 /**************************
409 * CALCULATE INTERACTIONS *
410 **************************/
412 if (gmx_mm_any_lt(rsq12,rcutoff2))
415 /* REACTION-FIELD ELECTROSTATICS */
416 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_add_pd(rinv12,_mm_mul_pd(krf,rsq12)),crf));
417 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
419 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
421 /* Update potential sum for this i atom from the interaction with this j atom. */
422 velec = _mm_and_pd(velec,cutoff_mask);
423 velecsum = _mm_add_pd(velecsum,velec);
427 fscal = _mm_and_pd(fscal,cutoff_mask);
429 /* Calculate temporary vectorial force */
430 tx = _mm_mul_pd(fscal,dx12);
431 ty = _mm_mul_pd(fscal,dy12);
432 tz = _mm_mul_pd(fscal,dz12);
434 /* Update vectorial force */
435 fix1 = _mm_add_pd(fix1,tx);
436 fiy1 = _mm_add_pd(fiy1,ty);
437 fiz1 = _mm_add_pd(fiz1,tz);
439 fjx2 = _mm_add_pd(fjx2,tx);
440 fjy2 = _mm_add_pd(fjy2,ty);
441 fjz2 = _mm_add_pd(fjz2,tz);
445 /**************************
446 * CALCULATE INTERACTIONS *
447 **************************/
449 if (gmx_mm_any_lt(rsq13,rcutoff2))
452 /* REACTION-FIELD ELECTROSTATICS */
453 velec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_add_pd(rinv13,_mm_mul_pd(krf,rsq13)),crf));
454 felec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_mul_pd(rinv13,rinvsq13),krf2));
456 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
458 /* Update potential sum for this i atom from the interaction with this j atom. */
459 velec = _mm_and_pd(velec,cutoff_mask);
460 velecsum = _mm_add_pd(velecsum,velec);
464 fscal = _mm_and_pd(fscal,cutoff_mask);
466 /* Calculate temporary vectorial force */
467 tx = _mm_mul_pd(fscal,dx13);
468 ty = _mm_mul_pd(fscal,dy13);
469 tz = _mm_mul_pd(fscal,dz13);
471 /* Update vectorial force */
472 fix1 = _mm_add_pd(fix1,tx);
473 fiy1 = _mm_add_pd(fiy1,ty);
474 fiz1 = _mm_add_pd(fiz1,tz);
476 fjx3 = _mm_add_pd(fjx3,tx);
477 fjy3 = _mm_add_pd(fjy3,ty);
478 fjz3 = _mm_add_pd(fjz3,tz);
482 /**************************
483 * CALCULATE INTERACTIONS *
484 **************************/
486 if (gmx_mm_any_lt(rsq21,rcutoff2))
489 /* REACTION-FIELD ELECTROSTATICS */
490 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_add_pd(rinv21,_mm_mul_pd(krf,rsq21)),crf));
491 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
493 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
495 /* Update potential sum for this i atom from the interaction with this j atom. */
496 velec = _mm_and_pd(velec,cutoff_mask);
497 velecsum = _mm_add_pd(velecsum,velec);
501 fscal = _mm_and_pd(fscal,cutoff_mask);
503 /* Calculate temporary vectorial force */
504 tx = _mm_mul_pd(fscal,dx21);
505 ty = _mm_mul_pd(fscal,dy21);
506 tz = _mm_mul_pd(fscal,dz21);
508 /* Update vectorial force */
509 fix2 = _mm_add_pd(fix2,tx);
510 fiy2 = _mm_add_pd(fiy2,ty);
511 fiz2 = _mm_add_pd(fiz2,tz);
513 fjx1 = _mm_add_pd(fjx1,tx);
514 fjy1 = _mm_add_pd(fjy1,ty);
515 fjz1 = _mm_add_pd(fjz1,tz);
519 /**************************
520 * CALCULATE INTERACTIONS *
521 **************************/
523 if (gmx_mm_any_lt(rsq22,rcutoff2))
526 /* REACTION-FIELD ELECTROSTATICS */
527 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_add_pd(rinv22,_mm_mul_pd(krf,rsq22)),crf));
528 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
530 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
532 /* Update potential sum for this i atom from the interaction with this j atom. */
533 velec = _mm_and_pd(velec,cutoff_mask);
534 velecsum = _mm_add_pd(velecsum,velec);
538 fscal = _mm_and_pd(fscal,cutoff_mask);
540 /* Calculate temporary vectorial force */
541 tx = _mm_mul_pd(fscal,dx22);
542 ty = _mm_mul_pd(fscal,dy22);
543 tz = _mm_mul_pd(fscal,dz22);
545 /* Update vectorial force */
546 fix2 = _mm_add_pd(fix2,tx);
547 fiy2 = _mm_add_pd(fiy2,ty);
548 fiz2 = _mm_add_pd(fiz2,tz);
550 fjx2 = _mm_add_pd(fjx2,tx);
551 fjy2 = _mm_add_pd(fjy2,ty);
552 fjz2 = _mm_add_pd(fjz2,tz);
556 /**************************
557 * CALCULATE INTERACTIONS *
558 **************************/
560 if (gmx_mm_any_lt(rsq23,rcutoff2))
563 /* REACTION-FIELD ELECTROSTATICS */
564 velec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_add_pd(rinv23,_mm_mul_pd(krf,rsq23)),crf));
565 felec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_mul_pd(rinv23,rinvsq23),krf2));
567 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
569 /* Update potential sum for this i atom from the interaction with this j atom. */
570 velec = _mm_and_pd(velec,cutoff_mask);
571 velecsum = _mm_add_pd(velecsum,velec);
575 fscal = _mm_and_pd(fscal,cutoff_mask);
577 /* Calculate temporary vectorial force */
578 tx = _mm_mul_pd(fscal,dx23);
579 ty = _mm_mul_pd(fscal,dy23);
580 tz = _mm_mul_pd(fscal,dz23);
582 /* Update vectorial force */
583 fix2 = _mm_add_pd(fix2,tx);
584 fiy2 = _mm_add_pd(fiy2,ty);
585 fiz2 = _mm_add_pd(fiz2,tz);
587 fjx3 = _mm_add_pd(fjx3,tx);
588 fjy3 = _mm_add_pd(fjy3,ty);
589 fjz3 = _mm_add_pd(fjz3,tz);
593 /**************************
594 * CALCULATE INTERACTIONS *
595 **************************/
597 if (gmx_mm_any_lt(rsq31,rcutoff2))
600 /* REACTION-FIELD ELECTROSTATICS */
601 velec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_add_pd(rinv31,_mm_mul_pd(krf,rsq31)),crf));
602 felec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_mul_pd(rinv31,rinvsq31),krf2));
604 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
606 /* Update potential sum for this i atom from the interaction with this j atom. */
607 velec = _mm_and_pd(velec,cutoff_mask);
608 velecsum = _mm_add_pd(velecsum,velec);
612 fscal = _mm_and_pd(fscal,cutoff_mask);
614 /* Calculate temporary vectorial force */
615 tx = _mm_mul_pd(fscal,dx31);
616 ty = _mm_mul_pd(fscal,dy31);
617 tz = _mm_mul_pd(fscal,dz31);
619 /* Update vectorial force */
620 fix3 = _mm_add_pd(fix3,tx);
621 fiy3 = _mm_add_pd(fiy3,ty);
622 fiz3 = _mm_add_pd(fiz3,tz);
624 fjx1 = _mm_add_pd(fjx1,tx);
625 fjy1 = _mm_add_pd(fjy1,ty);
626 fjz1 = _mm_add_pd(fjz1,tz);
630 /**************************
631 * CALCULATE INTERACTIONS *
632 **************************/
634 if (gmx_mm_any_lt(rsq32,rcutoff2))
637 /* REACTION-FIELD ELECTROSTATICS */
638 velec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_add_pd(rinv32,_mm_mul_pd(krf,rsq32)),crf));
639 felec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_mul_pd(rinv32,rinvsq32),krf2));
641 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
643 /* Update potential sum for this i atom from the interaction with this j atom. */
644 velec = _mm_and_pd(velec,cutoff_mask);
645 velecsum = _mm_add_pd(velecsum,velec);
649 fscal = _mm_and_pd(fscal,cutoff_mask);
651 /* Calculate temporary vectorial force */
652 tx = _mm_mul_pd(fscal,dx32);
653 ty = _mm_mul_pd(fscal,dy32);
654 tz = _mm_mul_pd(fscal,dz32);
656 /* Update vectorial force */
657 fix3 = _mm_add_pd(fix3,tx);
658 fiy3 = _mm_add_pd(fiy3,ty);
659 fiz3 = _mm_add_pd(fiz3,tz);
661 fjx2 = _mm_add_pd(fjx2,tx);
662 fjy2 = _mm_add_pd(fjy2,ty);
663 fjz2 = _mm_add_pd(fjz2,tz);
667 /**************************
668 * CALCULATE INTERACTIONS *
669 **************************/
671 if (gmx_mm_any_lt(rsq33,rcutoff2))
674 /* REACTION-FIELD ELECTROSTATICS */
675 velec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_add_pd(rinv33,_mm_mul_pd(krf,rsq33)),crf));
676 felec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_mul_pd(rinv33,rinvsq33),krf2));
678 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
680 /* Update potential sum for this i atom from the interaction with this j atom. */
681 velec = _mm_and_pd(velec,cutoff_mask);
682 velecsum = _mm_add_pd(velecsum,velec);
686 fscal = _mm_and_pd(fscal,cutoff_mask);
688 /* Calculate temporary vectorial force */
689 tx = _mm_mul_pd(fscal,dx33);
690 ty = _mm_mul_pd(fscal,dy33);
691 tz = _mm_mul_pd(fscal,dz33);
693 /* Update vectorial force */
694 fix3 = _mm_add_pd(fix3,tx);
695 fiy3 = _mm_add_pd(fiy3,ty);
696 fiz3 = _mm_add_pd(fiz3,tz);
698 fjx3 = _mm_add_pd(fjx3,tx);
699 fjy3 = _mm_add_pd(fjy3,ty);
700 fjz3 = _mm_add_pd(fjz3,tz);
704 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
706 /* Inner loop uses 386 flops */
713 j_coord_offsetA = DIM*jnrA;
715 /* load j atom coordinates */
716 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
717 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
718 &jy2,&jz2,&jx3,&jy3,&jz3);
720 /* Calculate displacement vector */
721 dx00 = _mm_sub_pd(ix0,jx0);
722 dy00 = _mm_sub_pd(iy0,jy0);
723 dz00 = _mm_sub_pd(iz0,jz0);
724 dx11 = _mm_sub_pd(ix1,jx1);
725 dy11 = _mm_sub_pd(iy1,jy1);
726 dz11 = _mm_sub_pd(iz1,jz1);
727 dx12 = _mm_sub_pd(ix1,jx2);
728 dy12 = _mm_sub_pd(iy1,jy2);
729 dz12 = _mm_sub_pd(iz1,jz2);
730 dx13 = _mm_sub_pd(ix1,jx3);
731 dy13 = _mm_sub_pd(iy1,jy3);
732 dz13 = _mm_sub_pd(iz1,jz3);
733 dx21 = _mm_sub_pd(ix2,jx1);
734 dy21 = _mm_sub_pd(iy2,jy1);
735 dz21 = _mm_sub_pd(iz2,jz1);
736 dx22 = _mm_sub_pd(ix2,jx2);
737 dy22 = _mm_sub_pd(iy2,jy2);
738 dz22 = _mm_sub_pd(iz2,jz2);
739 dx23 = _mm_sub_pd(ix2,jx3);
740 dy23 = _mm_sub_pd(iy2,jy3);
741 dz23 = _mm_sub_pd(iz2,jz3);
742 dx31 = _mm_sub_pd(ix3,jx1);
743 dy31 = _mm_sub_pd(iy3,jy1);
744 dz31 = _mm_sub_pd(iz3,jz1);
745 dx32 = _mm_sub_pd(ix3,jx2);
746 dy32 = _mm_sub_pd(iy3,jy2);
747 dz32 = _mm_sub_pd(iz3,jz2);
748 dx33 = _mm_sub_pd(ix3,jx3);
749 dy33 = _mm_sub_pd(iy3,jy3);
750 dz33 = _mm_sub_pd(iz3,jz3);
752 /* Calculate squared distance and things based on it */
753 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
754 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
755 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
756 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
757 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
758 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
759 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
760 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
761 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
762 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
764 rinv00 = gmx_mm_invsqrt_pd(rsq00);
765 rinv11 = gmx_mm_invsqrt_pd(rsq11);
766 rinv12 = gmx_mm_invsqrt_pd(rsq12);
767 rinv13 = gmx_mm_invsqrt_pd(rsq13);
768 rinv21 = gmx_mm_invsqrt_pd(rsq21);
769 rinv22 = gmx_mm_invsqrt_pd(rsq22);
770 rinv23 = gmx_mm_invsqrt_pd(rsq23);
771 rinv31 = gmx_mm_invsqrt_pd(rsq31);
772 rinv32 = gmx_mm_invsqrt_pd(rsq32);
773 rinv33 = gmx_mm_invsqrt_pd(rsq33);
775 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
776 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
777 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
778 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
779 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
780 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
781 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
782 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
783 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
784 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
786 fjx0 = _mm_setzero_pd();
787 fjy0 = _mm_setzero_pd();
788 fjz0 = _mm_setzero_pd();
789 fjx1 = _mm_setzero_pd();
790 fjy1 = _mm_setzero_pd();
791 fjz1 = _mm_setzero_pd();
792 fjx2 = _mm_setzero_pd();
793 fjy2 = _mm_setzero_pd();
794 fjz2 = _mm_setzero_pd();
795 fjx3 = _mm_setzero_pd();
796 fjy3 = _mm_setzero_pd();
797 fjz3 = _mm_setzero_pd();
799 /**************************
800 * CALCULATE INTERACTIONS *
801 **************************/
803 if (gmx_mm_any_lt(rsq00,rcutoff2))
806 r00 = _mm_mul_pd(rsq00,rinv00);
808 /* LENNARD-JONES DISPERSION/REPULSION */
810 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
811 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
812 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
813 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
814 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
816 d = _mm_sub_pd(r00,rswitch);
817 d = _mm_max_pd(d,_mm_setzero_pd());
818 d2 = _mm_mul_pd(d,d);
819 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
821 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
823 /* Evaluate switch function */
824 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
825 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
826 vvdw = _mm_mul_pd(vvdw,sw);
827 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
829 /* Update potential sum for this i atom from the interaction with this j atom. */
830 vvdw = _mm_and_pd(vvdw,cutoff_mask);
831 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
832 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
836 fscal = _mm_and_pd(fscal,cutoff_mask);
838 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
840 /* Calculate temporary vectorial force */
841 tx = _mm_mul_pd(fscal,dx00);
842 ty = _mm_mul_pd(fscal,dy00);
843 tz = _mm_mul_pd(fscal,dz00);
845 /* Update vectorial force */
846 fix0 = _mm_add_pd(fix0,tx);
847 fiy0 = _mm_add_pd(fiy0,ty);
848 fiz0 = _mm_add_pd(fiz0,tz);
850 fjx0 = _mm_add_pd(fjx0,tx);
851 fjy0 = _mm_add_pd(fjy0,ty);
852 fjz0 = _mm_add_pd(fjz0,tz);
856 /**************************
857 * CALCULATE INTERACTIONS *
858 **************************/
860 if (gmx_mm_any_lt(rsq11,rcutoff2))
863 /* REACTION-FIELD ELECTROSTATICS */
864 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_add_pd(rinv11,_mm_mul_pd(krf,rsq11)),crf));
865 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
867 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
869 /* Update potential sum for this i atom from the interaction with this j atom. */
870 velec = _mm_and_pd(velec,cutoff_mask);
871 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
872 velecsum = _mm_add_pd(velecsum,velec);
876 fscal = _mm_and_pd(fscal,cutoff_mask);
878 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
880 /* Calculate temporary vectorial force */
881 tx = _mm_mul_pd(fscal,dx11);
882 ty = _mm_mul_pd(fscal,dy11);
883 tz = _mm_mul_pd(fscal,dz11);
885 /* Update vectorial force */
886 fix1 = _mm_add_pd(fix1,tx);
887 fiy1 = _mm_add_pd(fiy1,ty);
888 fiz1 = _mm_add_pd(fiz1,tz);
890 fjx1 = _mm_add_pd(fjx1,tx);
891 fjy1 = _mm_add_pd(fjy1,ty);
892 fjz1 = _mm_add_pd(fjz1,tz);
896 /**************************
897 * CALCULATE INTERACTIONS *
898 **************************/
900 if (gmx_mm_any_lt(rsq12,rcutoff2))
903 /* REACTION-FIELD ELECTROSTATICS */
904 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_add_pd(rinv12,_mm_mul_pd(krf,rsq12)),crf));
905 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
907 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
909 /* Update potential sum for this i atom from the interaction with this j atom. */
910 velec = _mm_and_pd(velec,cutoff_mask);
911 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
912 velecsum = _mm_add_pd(velecsum,velec);
916 fscal = _mm_and_pd(fscal,cutoff_mask);
918 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
920 /* Calculate temporary vectorial force */
921 tx = _mm_mul_pd(fscal,dx12);
922 ty = _mm_mul_pd(fscal,dy12);
923 tz = _mm_mul_pd(fscal,dz12);
925 /* Update vectorial force */
926 fix1 = _mm_add_pd(fix1,tx);
927 fiy1 = _mm_add_pd(fiy1,ty);
928 fiz1 = _mm_add_pd(fiz1,tz);
930 fjx2 = _mm_add_pd(fjx2,tx);
931 fjy2 = _mm_add_pd(fjy2,ty);
932 fjz2 = _mm_add_pd(fjz2,tz);
936 /**************************
937 * CALCULATE INTERACTIONS *
938 **************************/
940 if (gmx_mm_any_lt(rsq13,rcutoff2))
943 /* REACTION-FIELD ELECTROSTATICS */
944 velec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_add_pd(rinv13,_mm_mul_pd(krf,rsq13)),crf));
945 felec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_mul_pd(rinv13,rinvsq13),krf2));
947 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
949 /* Update potential sum for this i atom from the interaction with this j atom. */
950 velec = _mm_and_pd(velec,cutoff_mask);
951 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
952 velecsum = _mm_add_pd(velecsum,velec);
956 fscal = _mm_and_pd(fscal,cutoff_mask);
958 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
960 /* Calculate temporary vectorial force */
961 tx = _mm_mul_pd(fscal,dx13);
962 ty = _mm_mul_pd(fscal,dy13);
963 tz = _mm_mul_pd(fscal,dz13);
965 /* Update vectorial force */
966 fix1 = _mm_add_pd(fix1,tx);
967 fiy1 = _mm_add_pd(fiy1,ty);
968 fiz1 = _mm_add_pd(fiz1,tz);
970 fjx3 = _mm_add_pd(fjx3,tx);
971 fjy3 = _mm_add_pd(fjy3,ty);
972 fjz3 = _mm_add_pd(fjz3,tz);
976 /**************************
977 * CALCULATE INTERACTIONS *
978 **************************/
980 if (gmx_mm_any_lt(rsq21,rcutoff2))
983 /* REACTION-FIELD ELECTROSTATICS */
984 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_add_pd(rinv21,_mm_mul_pd(krf,rsq21)),crf));
985 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
987 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
989 /* Update potential sum for this i atom from the interaction with this j atom. */
990 velec = _mm_and_pd(velec,cutoff_mask);
991 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
992 velecsum = _mm_add_pd(velecsum,velec);
996 fscal = _mm_and_pd(fscal,cutoff_mask);
998 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1000 /* Calculate temporary vectorial force */
1001 tx = _mm_mul_pd(fscal,dx21);
1002 ty = _mm_mul_pd(fscal,dy21);
1003 tz = _mm_mul_pd(fscal,dz21);
1005 /* Update vectorial force */
1006 fix2 = _mm_add_pd(fix2,tx);
1007 fiy2 = _mm_add_pd(fiy2,ty);
1008 fiz2 = _mm_add_pd(fiz2,tz);
1010 fjx1 = _mm_add_pd(fjx1,tx);
1011 fjy1 = _mm_add_pd(fjy1,ty);
1012 fjz1 = _mm_add_pd(fjz1,tz);
1016 /**************************
1017 * CALCULATE INTERACTIONS *
1018 **************************/
1020 if (gmx_mm_any_lt(rsq22,rcutoff2))
1023 /* REACTION-FIELD ELECTROSTATICS */
1024 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_add_pd(rinv22,_mm_mul_pd(krf,rsq22)),crf));
1025 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
1027 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1029 /* Update potential sum for this i atom from the interaction with this j atom. */
1030 velec = _mm_and_pd(velec,cutoff_mask);
1031 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1032 velecsum = _mm_add_pd(velecsum,velec);
1036 fscal = _mm_and_pd(fscal,cutoff_mask);
1038 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1040 /* Calculate temporary vectorial force */
1041 tx = _mm_mul_pd(fscal,dx22);
1042 ty = _mm_mul_pd(fscal,dy22);
1043 tz = _mm_mul_pd(fscal,dz22);
1045 /* Update vectorial force */
1046 fix2 = _mm_add_pd(fix2,tx);
1047 fiy2 = _mm_add_pd(fiy2,ty);
1048 fiz2 = _mm_add_pd(fiz2,tz);
1050 fjx2 = _mm_add_pd(fjx2,tx);
1051 fjy2 = _mm_add_pd(fjy2,ty);
1052 fjz2 = _mm_add_pd(fjz2,tz);
1056 /**************************
1057 * CALCULATE INTERACTIONS *
1058 **************************/
1060 if (gmx_mm_any_lt(rsq23,rcutoff2))
1063 /* REACTION-FIELD ELECTROSTATICS */
1064 velec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_add_pd(rinv23,_mm_mul_pd(krf,rsq23)),crf));
1065 felec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_mul_pd(rinv23,rinvsq23),krf2));
1067 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
1069 /* Update potential sum for this i atom from the interaction with this j atom. */
1070 velec = _mm_and_pd(velec,cutoff_mask);
1071 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1072 velecsum = _mm_add_pd(velecsum,velec);
1076 fscal = _mm_and_pd(fscal,cutoff_mask);
1078 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1080 /* Calculate temporary vectorial force */
1081 tx = _mm_mul_pd(fscal,dx23);
1082 ty = _mm_mul_pd(fscal,dy23);
1083 tz = _mm_mul_pd(fscal,dz23);
1085 /* Update vectorial force */
1086 fix2 = _mm_add_pd(fix2,tx);
1087 fiy2 = _mm_add_pd(fiy2,ty);
1088 fiz2 = _mm_add_pd(fiz2,tz);
1090 fjx3 = _mm_add_pd(fjx3,tx);
1091 fjy3 = _mm_add_pd(fjy3,ty);
1092 fjz3 = _mm_add_pd(fjz3,tz);
1096 /**************************
1097 * CALCULATE INTERACTIONS *
1098 **************************/
1100 if (gmx_mm_any_lt(rsq31,rcutoff2))
1103 /* REACTION-FIELD ELECTROSTATICS */
1104 velec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_add_pd(rinv31,_mm_mul_pd(krf,rsq31)),crf));
1105 felec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_mul_pd(rinv31,rinvsq31),krf2));
1107 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
1109 /* Update potential sum for this i atom from the interaction with this j atom. */
1110 velec = _mm_and_pd(velec,cutoff_mask);
1111 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1112 velecsum = _mm_add_pd(velecsum,velec);
1116 fscal = _mm_and_pd(fscal,cutoff_mask);
1118 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1120 /* Calculate temporary vectorial force */
1121 tx = _mm_mul_pd(fscal,dx31);
1122 ty = _mm_mul_pd(fscal,dy31);
1123 tz = _mm_mul_pd(fscal,dz31);
1125 /* Update vectorial force */
1126 fix3 = _mm_add_pd(fix3,tx);
1127 fiy3 = _mm_add_pd(fiy3,ty);
1128 fiz3 = _mm_add_pd(fiz3,tz);
1130 fjx1 = _mm_add_pd(fjx1,tx);
1131 fjy1 = _mm_add_pd(fjy1,ty);
1132 fjz1 = _mm_add_pd(fjz1,tz);
1136 /**************************
1137 * CALCULATE INTERACTIONS *
1138 **************************/
1140 if (gmx_mm_any_lt(rsq32,rcutoff2))
1143 /* REACTION-FIELD ELECTROSTATICS */
1144 velec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_add_pd(rinv32,_mm_mul_pd(krf,rsq32)),crf));
1145 felec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_mul_pd(rinv32,rinvsq32),krf2));
1147 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
1149 /* Update potential sum for this i atom from the interaction with this j atom. */
1150 velec = _mm_and_pd(velec,cutoff_mask);
1151 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1152 velecsum = _mm_add_pd(velecsum,velec);
1156 fscal = _mm_and_pd(fscal,cutoff_mask);
1158 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1160 /* Calculate temporary vectorial force */
1161 tx = _mm_mul_pd(fscal,dx32);
1162 ty = _mm_mul_pd(fscal,dy32);
1163 tz = _mm_mul_pd(fscal,dz32);
1165 /* Update vectorial force */
1166 fix3 = _mm_add_pd(fix3,tx);
1167 fiy3 = _mm_add_pd(fiy3,ty);
1168 fiz3 = _mm_add_pd(fiz3,tz);
1170 fjx2 = _mm_add_pd(fjx2,tx);
1171 fjy2 = _mm_add_pd(fjy2,ty);
1172 fjz2 = _mm_add_pd(fjz2,tz);
1176 /**************************
1177 * CALCULATE INTERACTIONS *
1178 **************************/
1180 if (gmx_mm_any_lt(rsq33,rcutoff2))
1183 /* REACTION-FIELD ELECTROSTATICS */
1184 velec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_add_pd(rinv33,_mm_mul_pd(krf,rsq33)),crf));
1185 felec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_mul_pd(rinv33,rinvsq33),krf2));
1187 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
1189 /* Update potential sum for this i atom from the interaction with this j atom. */
1190 velec = _mm_and_pd(velec,cutoff_mask);
1191 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1192 velecsum = _mm_add_pd(velecsum,velec);
1196 fscal = _mm_and_pd(fscal,cutoff_mask);
1198 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1200 /* Calculate temporary vectorial force */
1201 tx = _mm_mul_pd(fscal,dx33);
1202 ty = _mm_mul_pd(fscal,dy33);
1203 tz = _mm_mul_pd(fscal,dz33);
1205 /* Update vectorial force */
1206 fix3 = _mm_add_pd(fix3,tx);
1207 fiy3 = _mm_add_pd(fiy3,ty);
1208 fiz3 = _mm_add_pd(fiz3,tz);
1210 fjx3 = _mm_add_pd(fjx3,tx);
1211 fjy3 = _mm_add_pd(fjy3,ty);
1212 fjz3 = _mm_add_pd(fjz3,tz);
1216 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1218 /* Inner loop uses 386 flops */
1221 /* End of innermost loop */
1223 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1224 f+i_coord_offset,fshift+i_shift_offset);
1227 /* Update potential energies */
1228 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1229 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1231 /* Increment number of inner iterations */
1232 inneriter += j_index_end - j_index_start;
1234 /* Outer loop uses 26 flops */
1237 /* Increment number of outer iterations */
1240 /* Update outer/inner flops */
1242 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*386);
1245 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse2_double
1246 * Electrostatics interaction: ReactionField
1247 * VdW interaction: LennardJones
1248 * Geometry: Water4-Water4
1249 * Calculate force/pot: Force
1252 nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse2_double
1253 (t_nblist * gmx_restrict nlist,
1254 rvec * gmx_restrict xx,
1255 rvec * gmx_restrict ff,
1256 t_forcerec * gmx_restrict fr,
1257 t_mdatoms * gmx_restrict mdatoms,
1258 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1259 t_nrnb * gmx_restrict nrnb)
1261 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1262 * just 0 for non-waters.
1263 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1264 * jnr indices corresponding to data put in the four positions in the SIMD register.
1266 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1267 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1269 int j_coord_offsetA,j_coord_offsetB;
1270 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1271 real rcutoff_scalar;
1272 real *shiftvec,*fshift,*x,*f;
1273 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1275 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1277 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1279 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1281 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1282 int vdwjidx0A,vdwjidx0B;
1283 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1284 int vdwjidx1A,vdwjidx1B;
1285 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1286 int vdwjidx2A,vdwjidx2B;
1287 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1288 int vdwjidx3A,vdwjidx3B;
1289 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1290 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1291 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1292 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1293 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1294 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1295 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1296 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1297 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1298 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1299 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1300 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1303 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1306 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1307 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1308 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1309 real rswitch_scalar,d_scalar;
1310 __m128d dummy_mask,cutoff_mask;
1311 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1312 __m128d one = _mm_set1_pd(1.0);
1313 __m128d two = _mm_set1_pd(2.0);
1319 jindex = nlist->jindex;
1321 shiftidx = nlist->shift;
1323 shiftvec = fr->shift_vec[0];
1324 fshift = fr->fshift[0];
1325 facel = _mm_set1_pd(fr->epsfac);
1326 charge = mdatoms->chargeA;
1327 krf = _mm_set1_pd(fr->ic->k_rf);
1328 krf2 = _mm_set1_pd(fr->ic->k_rf*2.0);
1329 crf = _mm_set1_pd(fr->ic->c_rf);
1330 nvdwtype = fr->ntype;
1331 vdwparam = fr->nbfp;
1332 vdwtype = mdatoms->typeA;
1334 /* Setup water-specific parameters */
1335 inr = nlist->iinr[0];
1336 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1337 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1338 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1339 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1341 jq1 = _mm_set1_pd(charge[inr+1]);
1342 jq2 = _mm_set1_pd(charge[inr+2]);
1343 jq3 = _mm_set1_pd(charge[inr+3]);
1344 vdwjidx0A = 2*vdwtype[inr+0];
1345 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1346 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1347 qq11 = _mm_mul_pd(iq1,jq1);
1348 qq12 = _mm_mul_pd(iq1,jq2);
1349 qq13 = _mm_mul_pd(iq1,jq3);
1350 qq21 = _mm_mul_pd(iq2,jq1);
1351 qq22 = _mm_mul_pd(iq2,jq2);
1352 qq23 = _mm_mul_pd(iq2,jq3);
1353 qq31 = _mm_mul_pd(iq3,jq1);
1354 qq32 = _mm_mul_pd(iq3,jq2);
1355 qq33 = _mm_mul_pd(iq3,jq3);
1357 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1358 rcutoff_scalar = fr->rcoulomb;
1359 rcutoff = _mm_set1_pd(rcutoff_scalar);
1360 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
1362 rswitch_scalar = fr->rvdw_switch;
1363 rswitch = _mm_set1_pd(rswitch_scalar);
1364 /* Setup switch parameters */
1365 d_scalar = rcutoff_scalar-rswitch_scalar;
1366 d = _mm_set1_pd(d_scalar);
1367 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1368 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1369 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1370 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1371 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1372 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1374 /* Avoid stupid compiler warnings */
1376 j_coord_offsetA = 0;
1377 j_coord_offsetB = 0;
1382 /* Start outer loop over neighborlists */
1383 for(iidx=0; iidx<nri; iidx++)
1385 /* Load shift vector for this list */
1386 i_shift_offset = DIM*shiftidx[iidx];
1388 /* Load limits for loop over neighbors */
1389 j_index_start = jindex[iidx];
1390 j_index_end = jindex[iidx+1];
1392 /* Get outer coordinate index */
1394 i_coord_offset = DIM*inr;
1396 /* Load i particle coords and add shift vector */
1397 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1398 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1400 fix0 = _mm_setzero_pd();
1401 fiy0 = _mm_setzero_pd();
1402 fiz0 = _mm_setzero_pd();
1403 fix1 = _mm_setzero_pd();
1404 fiy1 = _mm_setzero_pd();
1405 fiz1 = _mm_setzero_pd();
1406 fix2 = _mm_setzero_pd();
1407 fiy2 = _mm_setzero_pd();
1408 fiz2 = _mm_setzero_pd();
1409 fix3 = _mm_setzero_pd();
1410 fiy3 = _mm_setzero_pd();
1411 fiz3 = _mm_setzero_pd();
1413 /* Start inner kernel loop */
1414 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1417 /* Get j neighbor index, and coordinate index */
1419 jnrB = jjnr[jidx+1];
1420 j_coord_offsetA = DIM*jnrA;
1421 j_coord_offsetB = DIM*jnrB;
1423 /* load j atom coordinates */
1424 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1425 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1426 &jy2,&jz2,&jx3,&jy3,&jz3);
1428 /* Calculate displacement vector */
1429 dx00 = _mm_sub_pd(ix0,jx0);
1430 dy00 = _mm_sub_pd(iy0,jy0);
1431 dz00 = _mm_sub_pd(iz0,jz0);
1432 dx11 = _mm_sub_pd(ix1,jx1);
1433 dy11 = _mm_sub_pd(iy1,jy1);
1434 dz11 = _mm_sub_pd(iz1,jz1);
1435 dx12 = _mm_sub_pd(ix1,jx2);
1436 dy12 = _mm_sub_pd(iy1,jy2);
1437 dz12 = _mm_sub_pd(iz1,jz2);
1438 dx13 = _mm_sub_pd(ix1,jx3);
1439 dy13 = _mm_sub_pd(iy1,jy3);
1440 dz13 = _mm_sub_pd(iz1,jz3);
1441 dx21 = _mm_sub_pd(ix2,jx1);
1442 dy21 = _mm_sub_pd(iy2,jy1);
1443 dz21 = _mm_sub_pd(iz2,jz1);
1444 dx22 = _mm_sub_pd(ix2,jx2);
1445 dy22 = _mm_sub_pd(iy2,jy2);
1446 dz22 = _mm_sub_pd(iz2,jz2);
1447 dx23 = _mm_sub_pd(ix2,jx3);
1448 dy23 = _mm_sub_pd(iy2,jy3);
1449 dz23 = _mm_sub_pd(iz2,jz3);
1450 dx31 = _mm_sub_pd(ix3,jx1);
1451 dy31 = _mm_sub_pd(iy3,jy1);
1452 dz31 = _mm_sub_pd(iz3,jz1);
1453 dx32 = _mm_sub_pd(ix3,jx2);
1454 dy32 = _mm_sub_pd(iy3,jy2);
1455 dz32 = _mm_sub_pd(iz3,jz2);
1456 dx33 = _mm_sub_pd(ix3,jx3);
1457 dy33 = _mm_sub_pd(iy3,jy3);
1458 dz33 = _mm_sub_pd(iz3,jz3);
1460 /* Calculate squared distance and things based on it */
1461 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1462 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1463 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1464 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1465 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1466 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1467 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1468 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1469 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1470 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1472 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1473 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1474 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1475 rinv13 = gmx_mm_invsqrt_pd(rsq13);
1476 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1477 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1478 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1479 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1480 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1481 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1483 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1484 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1485 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1486 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
1487 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1488 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1489 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
1490 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
1491 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
1492 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
1494 fjx0 = _mm_setzero_pd();
1495 fjy0 = _mm_setzero_pd();
1496 fjz0 = _mm_setzero_pd();
1497 fjx1 = _mm_setzero_pd();
1498 fjy1 = _mm_setzero_pd();
1499 fjz1 = _mm_setzero_pd();
1500 fjx2 = _mm_setzero_pd();
1501 fjy2 = _mm_setzero_pd();
1502 fjz2 = _mm_setzero_pd();
1503 fjx3 = _mm_setzero_pd();
1504 fjy3 = _mm_setzero_pd();
1505 fjz3 = _mm_setzero_pd();
1507 /**************************
1508 * CALCULATE INTERACTIONS *
1509 **************************/
1511 if (gmx_mm_any_lt(rsq00,rcutoff2))
1514 r00 = _mm_mul_pd(rsq00,rinv00);
1516 /* LENNARD-JONES DISPERSION/REPULSION */
1518 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1519 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
1520 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
1521 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
1522 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
1524 d = _mm_sub_pd(r00,rswitch);
1525 d = _mm_max_pd(d,_mm_setzero_pd());
1526 d2 = _mm_mul_pd(d,d);
1527 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1529 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1531 /* Evaluate switch function */
1532 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1533 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
1534 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1538 fscal = _mm_and_pd(fscal,cutoff_mask);
1540 /* Calculate temporary vectorial force */
1541 tx = _mm_mul_pd(fscal,dx00);
1542 ty = _mm_mul_pd(fscal,dy00);
1543 tz = _mm_mul_pd(fscal,dz00);
1545 /* Update vectorial force */
1546 fix0 = _mm_add_pd(fix0,tx);
1547 fiy0 = _mm_add_pd(fiy0,ty);
1548 fiz0 = _mm_add_pd(fiz0,tz);
1550 fjx0 = _mm_add_pd(fjx0,tx);
1551 fjy0 = _mm_add_pd(fjy0,ty);
1552 fjz0 = _mm_add_pd(fjz0,tz);
1556 /**************************
1557 * CALCULATE INTERACTIONS *
1558 **************************/
1560 if (gmx_mm_any_lt(rsq11,rcutoff2))
1563 /* REACTION-FIELD ELECTROSTATICS */
1564 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
1566 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1570 fscal = _mm_and_pd(fscal,cutoff_mask);
1572 /* Calculate temporary vectorial force */
1573 tx = _mm_mul_pd(fscal,dx11);
1574 ty = _mm_mul_pd(fscal,dy11);
1575 tz = _mm_mul_pd(fscal,dz11);
1577 /* Update vectorial force */
1578 fix1 = _mm_add_pd(fix1,tx);
1579 fiy1 = _mm_add_pd(fiy1,ty);
1580 fiz1 = _mm_add_pd(fiz1,tz);
1582 fjx1 = _mm_add_pd(fjx1,tx);
1583 fjy1 = _mm_add_pd(fjy1,ty);
1584 fjz1 = _mm_add_pd(fjz1,tz);
1588 /**************************
1589 * CALCULATE INTERACTIONS *
1590 **************************/
1592 if (gmx_mm_any_lt(rsq12,rcutoff2))
1595 /* REACTION-FIELD ELECTROSTATICS */
1596 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
1598 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1602 fscal = _mm_and_pd(fscal,cutoff_mask);
1604 /* Calculate temporary vectorial force */
1605 tx = _mm_mul_pd(fscal,dx12);
1606 ty = _mm_mul_pd(fscal,dy12);
1607 tz = _mm_mul_pd(fscal,dz12);
1609 /* Update vectorial force */
1610 fix1 = _mm_add_pd(fix1,tx);
1611 fiy1 = _mm_add_pd(fiy1,ty);
1612 fiz1 = _mm_add_pd(fiz1,tz);
1614 fjx2 = _mm_add_pd(fjx2,tx);
1615 fjy2 = _mm_add_pd(fjy2,ty);
1616 fjz2 = _mm_add_pd(fjz2,tz);
1620 /**************************
1621 * CALCULATE INTERACTIONS *
1622 **************************/
1624 if (gmx_mm_any_lt(rsq13,rcutoff2))
1627 /* REACTION-FIELD ELECTROSTATICS */
1628 felec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_mul_pd(rinv13,rinvsq13),krf2));
1630 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
1634 fscal = _mm_and_pd(fscal,cutoff_mask);
1636 /* Calculate temporary vectorial force */
1637 tx = _mm_mul_pd(fscal,dx13);
1638 ty = _mm_mul_pd(fscal,dy13);
1639 tz = _mm_mul_pd(fscal,dz13);
1641 /* Update vectorial force */
1642 fix1 = _mm_add_pd(fix1,tx);
1643 fiy1 = _mm_add_pd(fiy1,ty);
1644 fiz1 = _mm_add_pd(fiz1,tz);
1646 fjx3 = _mm_add_pd(fjx3,tx);
1647 fjy3 = _mm_add_pd(fjy3,ty);
1648 fjz3 = _mm_add_pd(fjz3,tz);
1652 /**************************
1653 * CALCULATE INTERACTIONS *
1654 **************************/
1656 if (gmx_mm_any_lt(rsq21,rcutoff2))
1659 /* REACTION-FIELD ELECTROSTATICS */
1660 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
1662 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1666 fscal = _mm_and_pd(fscal,cutoff_mask);
1668 /* Calculate temporary vectorial force */
1669 tx = _mm_mul_pd(fscal,dx21);
1670 ty = _mm_mul_pd(fscal,dy21);
1671 tz = _mm_mul_pd(fscal,dz21);
1673 /* Update vectorial force */
1674 fix2 = _mm_add_pd(fix2,tx);
1675 fiy2 = _mm_add_pd(fiy2,ty);
1676 fiz2 = _mm_add_pd(fiz2,tz);
1678 fjx1 = _mm_add_pd(fjx1,tx);
1679 fjy1 = _mm_add_pd(fjy1,ty);
1680 fjz1 = _mm_add_pd(fjz1,tz);
1684 /**************************
1685 * CALCULATE INTERACTIONS *
1686 **************************/
1688 if (gmx_mm_any_lt(rsq22,rcutoff2))
1691 /* REACTION-FIELD ELECTROSTATICS */
1692 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
1694 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1698 fscal = _mm_and_pd(fscal,cutoff_mask);
1700 /* Calculate temporary vectorial force */
1701 tx = _mm_mul_pd(fscal,dx22);
1702 ty = _mm_mul_pd(fscal,dy22);
1703 tz = _mm_mul_pd(fscal,dz22);
1705 /* Update vectorial force */
1706 fix2 = _mm_add_pd(fix2,tx);
1707 fiy2 = _mm_add_pd(fiy2,ty);
1708 fiz2 = _mm_add_pd(fiz2,tz);
1710 fjx2 = _mm_add_pd(fjx2,tx);
1711 fjy2 = _mm_add_pd(fjy2,ty);
1712 fjz2 = _mm_add_pd(fjz2,tz);
1716 /**************************
1717 * CALCULATE INTERACTIONS *
1718 **************************/
1720 if (gmx_mm_any_lt(rsq23,rcutoff2))
1723 /* REACTION-FIELD ELECTROSTATICS */
1724 felec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_mul_pd(rinv23,rinvsq23),krf2));
1726 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
1730 fscal = _mm_and_pd(fscal,cutoff_mask);
1732 /* Calculate temporary vectorial force */
1733 tx = _mm_mul_pd(fscal,dx23);
1734 ty = _mm_mul_pd(fscal,dy23);
1735 tz = _mm_mul_pd(fscal,dz23);
1737 /* Update vectorial force */
1738 fix2 = _mm_add_pd(fix2,tx);
1739 fiy2 = _mm_add_pd(fiy2,ty);
1740 fiz2 = _mm_add_pd(fiz2,tz);
1742 fjx3 = _mm_add_pd(fjx3,tx);
1743 fjy3 = _mm_add_pd(fjy3,ty);
1744 fjz3 = _mm_add_pd(fjz3,tz);
1748 /**************************
1749 * CALCULATE INTERACTIONS *
1750 **************************/
1752 if (gmx_mm_any_lt(rsq31,rcutoff2))
1755 /* REACTION-FIELD ELECTROSTATICS */
1756 felec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_mul_pd(rinv31,rinvsq31),krf2));
1758 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
1762 fscal = _mm_and_pd(fscal,cutoff_mask);
1764 /* Calculate temporary vectorial force */
1765 tx = _mm_mul_pd(fscal,dx31);
1766 ty = _mm_mul_pd(fscal,dy31);
1767 tz = _mm_mul_pd(fscal,dz31);
1769 /* Update vectorial force */
1770 fix3 = _mm_add_pd(fix3,tx);
1771 fiy3 = _mm_add_pd(fiy3,ty);
1772 fiz3 = _mm_add_pd(fiz3,tz);
1774 fjx1 = _mm_add_pd(fjx1,tx);
1775 fjy1 = _mm_add_pd(fjy1,ty);
1776 fjz1 = _mm_add_pd(fjz1,tz);
1780 /**************************
1781 * CALCULATE INTERACTIONS *
1782 **************************/
1784 if (gmx_mm_any_lt(rsq32,rcutoff2))
1787 /* REACTION-FIELD ELECTROSTATICS */
1788 felec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_mul_pd(rinv32,rinvsq32),krf2));
1790 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
1794 fscal = _mm_and_pd(fscal,cutoff_mask);
1796 /* Calculate temporary vectorial force */
1797 tx = _mm_mul_pd(fscal,dx32);
1798 ty = _mm_mul_pd(fscal,dy32);
1799 tz = _mm_mul_pd(fscal,dz32);
1801 /* Update vectorial force */
1802 fix3 = _mm_add_pd(fix3,tx);
1803 fiy3 = _mm_add_pd(fiy3,ty);
1804 fiz3 = _mm_add_pd(fiz3,tz);
1806 fjx2 = _mm_add_pd(fjx2,tx);
1807 fjy2 = _mm_add_pd(fjy2,ty);
1808 fjz2 = _mm_add_pd(fjz2,tz);
1812 /**************************
1813 * CALCULATE INTERACTIONS *
1814 **************************/
1816 if (gmx_mm_any_lt(rsq33,rcutoff2))
1819 /* REACTION-FIELD ELECTROSTATICS */
1820 felec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_mul_pd(rinv33,rinvsq33),krf2));
1822 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
1826 fscal = _mm_and_pd(fscal,cutoff_mask);
1828 /* Calculate temporary vectorial force */
1829 tx = _mm_mul_pd(fscal,dx33);
1830 ty = _mm_mul_pd(fscal,dy33);
1831 tz = _mm_mul_pd(fscal,dz33);
1833 /* Update vectorial force */
1834 fix3 = _mm_add_pd(fix3,tx);
1835 fiy3 = _mm_add_pd(fiy3,ty);
1836 fiz3 = _mm_add_pd(fiz3,tz);
1838 fjx3 = _mm_add_pd(fjx3,tx);
1839 fjy3 = _mm_add_pd(fjy3,ty);
1840 fjz3 = _mm_add_pd(fjz3,tz);
1844 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1846 /* Inner loop uses 329 flops */
1849 if(jidx<j_index_end)
1853 j_coord_offsetA = DIM*jnrA;
1855 /* load j atom coordinates */
1856 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1857 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1858 &jy2,&jz2,&jx3,&jy3,&jz3);
1860 /* Calculate displacement vector */
1861 dx00 = _mm_sub_pd(ix0,jx0);
1862 dy00 = _mm_sub_pd(iy0,jy0);
1863 dz00 = _mm_sub_pd(iz0,jz0);
1864 dx11 = _mm_sub_pd(ix1,jx1);
1865 dy11 = _mm_sub_pd(iy1,jy1);
1866 dz11 = _mm_sub_pd(iz1,jz1);
1867 dx12 = _mm_sub_pd(ix1,jx2);
1868 dy12 = _mm_sub_pd(iy1,jy2);
1869 dz12 = _mm_sub_pd(iz1,jz2);
1870 dx13 = _mm_sub_pd(ix1,jx3);
1871 dy13 = _mm_sub_pd(iy1,jy3);
1872 dz13 = _mm_sub_pd(iz1,jz3);
1873 dx21 = _mm_sub_pd(ix2,jx1);
1874 dy21 = _mm_sub_pd(iy2,jy1);
1875 dz21 = _mm_sub_pd(iz2,jz1);
1876 dx22 = _mm_sub_pd(ix2,jx2);
1877 dy22 = _mm_sub_pd(iy2,jy2);
1878 dz22 = _mm_sub_pd(iz2,jz2);
1879 dx23 = _mm_sub_pd(ix2,jx3);
1880 dy23 = _mm_sub_pd(iy2,jy3);
1881 dz23 = _mm_sub_pd(iz2,jz3);
1882 dx31 = _mm_sub_pd(ix3,jx1);
1883 dy31 = _mm_sub_pd(iy3,jy1);
1884 dz31 = _mm_sub_pd(iz3,jz1);
1885 dx32 = _mm_sub_pd(ix3,jx2);
1886 dy32 = _mm_sub_pd(iy3,jy2);
1887 dz32 = _mm_sub_pd(iz3,jz2);
1888 dx33 = _mm_sub_pd(ix3,jx3);
1889 dy33 = _mm_sub_pd(iy3,jy3);
1890 dz33 = _mm_sub_pd(iz3,jz3);
1892 /* Calculate squared distance and things based on it */
1893 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1894 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1895 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1896 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1897 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1898 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1899 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1900 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1901 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1902 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1904 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1905 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1906 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1907 rinv13 = gmx_mm_invsqrt_pd(rsq13);
1908 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1909 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1910 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1911 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1912 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1913 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1915 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1916 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1917 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1918 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
1919 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1920 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1921 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
1922 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
1923 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
1924 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
1926 fjx0 = _mm_setzero_pd();
1927 fjy0 = _mm_setzero_pd();
1928 fjz0 = _mm_setzero_pd();
1929 fjx1 = _mm_setzero_pd();
1930 fjy1 = _mm_setzero_pd();
1931 fjz1 = _mm_setzero_pd();
1932 fjx2 = _mm_setzero_pd();
1933 fjy2 = _mm_setzero_pd();
1934 fjz2 = _mm_setzero_pd();
1935 fjx3 = _mm_setzero_pd();
1936 fjy3 = _mm_setzero_pd();
1937 fjz3 = _mm_setzero_pd();
1939 /**************************
1940 * CALCULATE INTERACTIONS *
1941 **************************/
1943 if (gmx_mm_any_lt(rsq00,rcutoff2))
1946 r00 = _mm_mul_pd(rsq00,rinv00);
1948 /* LENNARD-JONES DISPERSION/REPULSION */
1950 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1951 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
1952 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
1953 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
1954 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
1956 d = _mm_sub_pd(r00,rswitch);
1957 d = _mm_max_pd(d,_mm_setzero_pd());
1958 d2 = _mm_mul_pd(d,d);
1959 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1961 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1963 /* Evaluate switch function */
1964 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1965 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
1966 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1970 fscal = _mm_and_pd(fscal,cutoff_mask);
1972 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1974 /* Calculate temporary vectorial force */
1975 tx = _mm_mul_pd(fscal,dx00);
1976 ty = _mm_mul_pd(fscal,dy00);
1977 tz = _mm_mul_pd(fscal,dz00);
1979 /* Update vectorial force */
1980 fix0 = _mm_add_pd(fix0,tx);
1981 fiy0 = _mm_add_pd(fiy0,ty);
1982 fiz0 = _mm_add_pd(fiz0,tz);
1984 fjx0 = _mm_add_pd(fjx0,tx);
1985 fjy0 = _mm_add_pd(fjy0,ty);
1986 fjz0 = _mm_add_pd(fjz0,tz);
1990 /**************************
1991 * CALCULATE INTERACTIONS *
1992 **************************/
1994 if (gmx_mm_any_lt(rsq11,rcutoff2))
1997 /* REACTION-FIELD ELECTROSTATICS */
1998 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
2000 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
2004 fscal = _mm_and_pd(fscal,cutoff_mask);
2006 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2008 /* Calculate temporary vectorial force */
2009 tx = _mm_mul_pd(fscal,dx11);
2010 ty = _mm_mul_pd(fscal,dy11);
2011 tz = _mm_mul_pd(fscal,dz11);
2013 /* Update vectorial force */
2014 fix1 = _mm_add_pd(fix1,tx);
2015 fiy1 = _mm_add_pd(fiy1,ty);
2016 fiz1 = _mm_add_pd(fiz1,tz);
2018 fjx1 = _mm_add_pd(fjx1,tx);
2019 fjy1 = _mm_add_pd(fjy1,ty);
2020 fjz1 = _mm_add_pd(fjz1,tz);
2024 /**************************
2025 * CALCULATE INTERACTIONS *
2026 **************************/
2028 if (gmx_mm_any_lt(rsq12,rcutoff2))
2031 /* REACTION-FIELD ELECTROSTATICS */
2032 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
2034 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
2038 fscal = _mm_and_pd(fscal,cutoff_mask);
2040 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2042 /* Calculate temporary vectorial force */
2043 tx = _mm_mul_pd(fscal,dx12);
2044 ty = _mm_mul_pd(fscal,dy12);
2045 tz = _mm_mul_pd(fscal,dz12);
2047 /* Update vectorial force */
2048 fix1 = _mm_add_pd(fix1,tx);
2049 fiy1 = _mm_add_pd(fiy1,ty);
2050 fiz1 = _mm_add_pd(fiz1,tz);
2052 fjx2 = _mm_add_pd(fjx2,tx);
2053 fjy2 = _mm_add_pd(fjy2,ty);
2054 fjz2 = _mm_add_pd(fjz2,tz);
2058 /**************************
2059 * CALCULATE INTERACTIONS *
2060 **************************/
2062 if (gmx_mm_any_lt(rsq13,rcutoff2))
2065 /* REACTION-FIELD ELECTROSTATICS */
2066 felec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_mul_pd(rinv13,rinvsq13),krf2));
2068 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
2072 fscal = _mm_and_pd(fscal,cutoff_mask);
2074 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2076 /* Calculate temporary vectorial force */
2077 tx = _mm_mul_pd(fscal,dx13);
2078 ty = _mm_mul_pd(fscal,dy13);
2079 tz = _mm_mul_pd(fscal,dz13);
2081 /* Update vectorial force */
2082 fix1 = _mm_add_pd(fix1,tx);
2083 fiy1 = _mm_add_pd(fiy1,ty);
2084 fiz1 = _mm_add_pd(fiz1,tz);
2086 fjx3 = _mm_add_pd(fjx3,tx);
2087 fjy3 = _mm_add_pd(fjy3,ty);
2088 fjz3 = _mm_add_pd(fjz3,tz);
2092 /**************************
2093 * CALCULATE INTERACTIONS *
2094 **************************/
2096 if (gmx_mm_any_lt(rsq21,rcutoff2))
2099 /* REACTION-FIELD ELECTROSTATICS */
2100 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
2102 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2106 fscal = _mm_and_pd(fscal,cutoff_mask);
2108 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2110 /* Calculate temporary vectorial force */
2111 tx = _mm_mul_pd(fscal,dx21);
2112 ty = _mm_mul_pd(fscal,dy21);
2113 tz = _mm_mul_pd(fscal,dz21);
2115 /* Update vectorial force */
2116 fix2 = _mm_add_pd(fix2,tx);
2117 fiy2 = _mm_add_pd(fiy2,ty);
2118 fiz2 = _mm_add_pd(fiz2,tz);
2120 fjx1 = _mm_add_pd(fjx1,tx);
2121 fjy1 = _mm_add_pd(fjy1,ty);
2122 fjz1 = _mm_add_pd(fjz1,tz);
2126 /**************************
2127 * CALCULATE INTERACTIONS *
2128 **************************/
2130 if (gmx_mm_any_lt(rsq22,rcutoff2))
2133 /* REACTION-FIELD ELECTROSTATICS */
2134 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
2136 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
2140 fscal = _mm_and_pd(fscal,cutoff_mask);
2142 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2144 /* Calculate temporary vectorial force */
2145 tx = _mm_mul_pd(fscal,dx22);
2146 ty = _mm_mul_pd(fscal,dy22);
2147 tz = _mm_mul_pd(fscal,dz22);
2149 /* Update vectorial force */
2150 fix2 = _mm_add_pd(fix2,tx);
2151 fiy2 = _mm_add_pd(fiy2,ty);
2152 fiz2 = _mm_add_pd(fiz2,tz);
2154 fjx2 = _mm_add_pd(fjx2,tx);
2155 fjy2 = _mm_add_pd(fjy2,ty);
2156 fjz2 = _mm_add_pd(fjz2,tz);
2160 /**************************
2161 * CALCULATE INTERACTIONS *
2162 **************************/
2164 if (gmx_mm_any_lt(rsq23,rcutoff2))
2167 /* REACTION-FIELD ELECTROSTATICS */
2168 felec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_mul_pd(rinv23,rinvsq23),krf2));
2170 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
2174 fscal = _mm_and_pd(fscal,cutoff_mask);
2176 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2178 /* Calculate temporary vectorial force */
2179 tx = _mm_mul_pd(fscal,dx23);
2180 ty = _mm_mul_pd(fscal,dy23);
2181 tz = _mm_mul_pd(fscal,dz23);
2183 /* Update vectorial force */
2184 fix2 = _mm_add_pd(fix2,tx);
2185 fiy2 = _mm_add_pd(fiy2,ty);
2186 fiz2 = _mm_add_pd(fiz2,tz);
2188 fjx3 = _mm_add_pd(fjx3,tx);
2189 fjy3 = _mm_add_pd(fjy3,ty);
2190 fjz3 = _mm_add_pd(fjz3,tz);
2194 /**************************
2195 * CALCULATE INTERACTIONS *
2196 **************************/
2198 if (gmx_mm_any_lt(rsq31,rcutoff2))
2201 /* REACTION-FIELD ELECTROSTATICS */
2202 felec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_mul_pd(rinv31,rinvsq31),krf2));
2204 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
2208 fscal = _mm_and_pd(fscal,cutoff_mask);
2210 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2212 /* Calculate temporary vectorial force */
2213 tx = _mm_mul_pd(fscal,dx31);
2214 ty = _mm_mul_pd(fscal,dy31);
2215 tz = _mm_mul_pd(fscal,dz31);
2217 /* Update vectorial force */
2218 fix3 = _mm_add_pd(fix3,tx);
2219 fiy3 = _mm_add_pd(fiy3,ty);
2220 fiz3 = _mm_add_pd(fiz3,tz);
2222 fjx1 = _mm_add_pd(fjx1,tx);
2223 fjy1 = _mm_add_pd(fjy1,ty);
2224 fjz1 = _mm_add_pd(fjz1,tz);
2228 /**************************
2229 * CALCULATE INTERACTIONS *
2230 **************************/
2232 if (gmx_mm_any_lt(rsq32,rcutoff2))
2235 /* REACTION-FIELD ELECTROSTATICS */
2236 felec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_mul_pd(rinv32,rinvsq32),krf2));
2238 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
2242 fscal = _mm_and_pd(fscal,cutoff_mask);
2244 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2246 /* Calculate temporary vectorial force */
2247 tx = _mm_mul_pd(fscal,dx32);
2248 ty = _mm_mul_pd(fscal,dy32);
2249 tz = _mm_mul_pd(fscal,dz32);
2251 /* Update vectorial force */
2252 fix3 = _mm_add_pd(fix3,tx);
2253 fiy3 = _mm_add_pd(fiy3,ty);
2254 fiz3 = _mm_add_pd(fiz3,tz);
2256 fjx2 = _mm_add_pd(fjx2,tx);
2257 fjy2 = _mm_add_pd(fjy2,ty);
2258 fjz2 = _mm_add_pd(fjz2,tz);
2262 /**************************
2263 * CALCULATE INTERACTIONS *
2264 **************************/
2266 if (gmx_mm_any_lt(rsq33,rcutoff2))
2269 /* REACTION-FIELD ELECTROSTATICS */
2270 felec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_mul_pd(rinv33,rinvsq33),krf2));
2272 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
2276 fscal = _mm_and_pd(fscal,cutoff_mask);
2278 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2280 /* Calculate temporary vectorial force */
2281 tx = _mm_mul_pd(fscal,dx33);
2282 ty = _mm_mul_pd(fscal,dy33);
2283 tz = _mm_mul_pd(fscal,dz33);
2285 /* Update vectorial force */
2286 fix3 = _mm_add_pd(fix3,tx);
2287 fiy3 = _mm_add_pd(fiy3,ty);
2288 fiz3 = _mm_add_pd(fiz3,tz);
2290 fjx3 = _mm_add_pd(fjx3,tx);
2291 fjy3 = _mm_add_pd(fjy3,ty);
2292 fjz3 = _mm_add_pd(fjz3,tz);
2296 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2298 /* Inner loop uses 329 flops */
2301 /* End of innermost loop */
2303 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2304 f+i_coord_offset,fshift+i_shift_offset);
2306 /* Increment number of inner iterations */
2307 inneriter += j_index_end - j_index_start;
2309 /* Outer loop uses 24 flops */
2312 /* Increment number of outer iterations */
2315 /* Update outer/inner flops */
2317 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*329);