2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
49 #include "gromacs/simd/math_x86_sse2_double.h"
50 #include "kernelutil_x86_sse2_double.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sse2_double
54 * Electrostatics interaction: ReactionField
55 * VdW interaction: CubicSplineTable
56 * Geometry: Water3-Water3
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sse2_double
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
77 int j_coord_offsetA,j_coord_offsetB;
78 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80 real *shiftvec,*fshift,*x,*f;
81 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88 int vdwjidx0A,vdwjidx0B;
89 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90 int vdwjidx1A,vdwjidx1B;
91 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92 int vdwjidx2A,vdwjidx2B;
93 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
95 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
96 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
97 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
98 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
101 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
106 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
109 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
110 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
112 __m128i ifour = _mm_set1_epi32(4);
113 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
115 __m128d dummy_mask,cutoff_mask;
116 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
117 __m128d one = _mm_set1_pd(1.0);
118 __m128d two = _mm_set1_pd(2.0);
124 jindex = nlist->jindex;
126 shiftidx = nlist->shift;
128 shiftvec = fr->shift_vec[0];
129 fshift = fr->fshift[0];
130 facel = _mm_set1_pd(fr->epsfac);
131 charge = mdatoms->chargeA;
132 krf = _mm_set1_pd(fr->ic->k_rf);
133 krf2 = _mm_set1_pd(fr->ic->k_rf*2.0);
134 crf = _mm_set1_pd(fr->ic->c_rf);
135 nvdwtype = fr->ntype;
137 vdwtype = mdatoms->typeA;
139 vftab = kernel_data->table_vdw->data;
140 vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale);
142 /* Setup water-specific parameters */
143 inr = nlist->iinr[0];
144 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
145 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
146 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
147 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
149 jq0 = _mm_set1_pd(charge[inr+0]);
150 jq1 = _mm_set1_pd(charge[inr+1]);
151 jq2 = _mm_set1_pd(charge[inr+2]);
152 vdwjidx0A = 2*vdwtype[inr+0];
153 qq00 = _mm_mul_pd(iq0,jq0);
154 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
155 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
156 qq01 = _mm_mul_pd(iq0,jq1);
157 qq02 = _mm_mul_pd(iq0,jq2);
158 qq10 = _mm_mul_pd(iq1,jq0);
159 qq11 = _mm_mul_pd(iq1,jq1);
160 qq12 = _mm_mul_pd(iq1,jq2);
161 qq20 = _mm_mul_pd(iq2,jq0);
162 qq21 = _mm_mul_pd(iq2,jq1);
163 qq22 = _mm_mul_pd(iq2,jq2);
165 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
166 rcutoff_scalar = fr->rcoulomb;
167 rcutoff = _mm_set1_pd(rcutoff_scalar);
168 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
170 /* Avoid stupid compiler warnings */
178 /* Start outer loop over neighborlists */
179 for(iidx=0; iidx<nri; iidx++)
181 /* Load shift vector for this list */
182 i_shift_offset = DIM*shiftidx[iidx];
184 /* Load limits for loop over neighbors */
185 j_index_start = jindex[iidx];
186 j_index_end = jindex[iidx+1];
188 /* Get outer coordinate index */
190 i_coord_offset = DIM*inr;
192 /* Load i particle coords and add shift vector */
193 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
194 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
196 fix0 = _mm_setzero_pd();
197 fiy0 = _mm_setzero_pd();
198 fiz0 = _mm_setzero_pd();
199 fix1 = _mm_setzero_pd();
200 fiy1 = _mm_setzero_pd();
201 fiz1 = _mm_setzero_pd();
202 fix2 = _mm_setzero_pd();
203 fiy2 = _mm_setzero_pd();
204 fiz2 = _mm_setzero_pd();
206 /* Reset potential sums */
207 velecsum = _mm_setzero_pd();
208 vvdwsum = _mm_setzero_pd();
210 /* Start inner kernel loop */
211 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
214 /* Get j neighbor index, and coordinate index */
217 j_coord_offsetA = DIM*jnrA;
218 j_coord_offsetB = DIM*jnrB;
220 /* load j atom coordinates */
221 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
222 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
224 /* Calculate displacement vector */
225 dx00 = _mm_sub_pd(ix0,jx0);
226 dy00 = _mm_sub_pd(iy0,jy0);
227 dz00 = _mm_sub_pd(iz0,jz0);
228 dx01 = _mm_sub_pd(ix0,jx1);
229 dy01 = _mm_sub_pd(iy0,jy1);
230 dz01 = _mm_sub_pd(iz0,jz1);
231 dx02 = _mm_sub_pd(ix0,jx2);
232 dy02 = _mm_sub_pd(iy0,jy2);
233 dz02 = _mm_sub_pd(iz0,jz2);
234 dx10 = _mm_sub_pd(ix1,jx0);
235 dy10 = _mm_sub_pd(iy1,jy0);
236 dz10 = _mm_sub_pd(iz1,jz0);
237 dx11 = _mm_sub_pd(ix1,jx1);
238 dy11 = _mm_sub_pd(iy1,jy1);
239 dz11 = _mm_sub_pd(iz1,jz1);
240 dx12 = _mm_sub_pd(ix1,jx2);
241 dy12 = _mm_sub_pd(iy1,jy2);
242 dz12 = _mm_sub_pd(iz1,jz2);
243 dx20 = _mm_sub_pd(ix2,jx0);
244 dy20 = _mm_sub_pd(iy2,jy0);
245 dz20 = _mm_sub_pd(iz2,jz0);
246 dx21 = _mm_sub_pd(ix2,jx1);
247 dy21 = _mm_sub_pd(iy2,jy1);
248 dz21 = _mm_sub_pd(iz2,jz1);
249 dx22 = _mm_sub_pd(ix2,jx2);
250 dy22 = _mm_sub_pd(iy2,jy2);
251 dz22 = _mm_sub_pd(iz2,jz2);
253 /* Calculate squared distance and things based on it */
254 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
255 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
256 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
257 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
258 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
259 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
260 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
261 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
262 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
264 rinv00 = gmx_mm_invsqrt_pd(rsq00);
265 rinv01 = gmx_mm_invsqrt_pd(rsq01);
266 rinv02 = gmx_mm_invsqrt_pd(rsq02);
267 rinv10 = gmx_mm_invsqrt_pd(rsq10);
268 rinv11 = gmx_mm_invsqrt_pd(rsq11);
269 rinv12 = gmx_mm_invsqrt_pd(rsq12);
270 rinv20 = gmx_mm_invsqrt_pd(rsq20);
271 rinv21 = gmx_mm_invsqrt_pd(rsq21);
272 rinv22 = gmx_mm_invsqrt_pd(rsq22);
274 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
275 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
276 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
277 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
278 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
279 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
280 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
281 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
282 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
284 fjx0 = _mm_setzero_pd();
285 fjy0 = _mm_setzero_pd();
286 fjz0 = _mm_setzero_pd();
287 fjx1 = _mm_setzero_pd();
288 fjy1 = _mm_setzero_pd();
289 fjz1 = _mm_setzero_pd();
290 fjx2 = _mm_setzero_pd();
291 fjy2 = _mm_setzero_pd();
292 fjz2 = _mm_setzero_pd();
294 /**************************
295 * CALCULATE INTERACTIONS *
296 **************************/
298 if (gmx_mm_any_lt(rsq00,rcutoff2))
301 r00 = _mm_mul_pd(rsq00,rinv00);
303 /* Calculate table index by multiplying r with table scale and truncate to integer */
304 rt = _mm_mul_pd(r00,vftabscale);
305 vfitab = _mm_cvttpd_epi32(rt);
306 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
307 vfitab = _mm_slli_epi32(vfitab,3);
309 /* REACTION-FIELD ELECTROSTATICS */
310 velec = _mm_mul_pd(qq00,_mm_sub_pd(_mm_add_pd(rinv00,_mm_mul_pd(krf,rsq00)),crf));
311 felec = _mm_mul_pd(qq00,_mm_sub_pd(_mm_mul_pd(rinv00,rinvsq00),krf2));
313 /* CUBIC SPLINE TABLE DISPERSION */
314 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
315 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
316 GMX_MM_TRANSPOSE2_PD(Y,F);
317 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
318 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
319 GMX_MM_TRANSPOSE2_PD(G,H);
320 Heps = _mm_mul_pd(vfeps,H);
321 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
322 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
323 vvdw6 = _mm_mul_pd(c6_00,VV);
324 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
325 fvdw6 = _mm_mul_pd(c6_00,FF);
327 /* CUBIC SPLINE TABLE REPULSION */
328 vfitab = _mm_add_epi32(vfitab,ifour);
329 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
330 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
331 GMX_MM_TRANSPOSE2_PD(Y,F);
332 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
333 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
334 GMX_MM_TRANSPOSE2_PD(G,H);
335 Heps = _mm_mul_pd(vfeps,H);
336 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
337 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
338 vvdw12 = _mm_mul_pd(c12_00,VV);
339 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
340 fvdw12 = _mm_mul_pd(c12_00,FF);
341 vvdw = _mm_add_pd(vvdw12,vvdw6);
342 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
344 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
346 /* Update potential sum for this i atom from the interaction with this j atom. */
347 velec = _mm_and_pd(velec,cutoff_mask);
348 velecsum = _mm_add_pd(velecsum,velec);
349 vvdw = _mm_and_pd(vvdw,cutoff_mask);
350 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
352 fscal = _mm_add_pd(felec,fvdw);
354 fscal = _mm_and_pd(fscal,cutoff_mask);
356 /* Calculate temporary vectorial force */
357 tx = _mm_mul_pd(fscal,dx00);
358 ty = _mm_mul_pd(fscal,dy00);
359 tz = _mm_mul_pd(fscal,dz00);
361 /* Update vectorial force */
362 fix0 = _mm_add_pd(fix0,tx);
363 fiy0 = _mm_add_pd(fiy0,ty);
364 fiz0 = _mm_add_pd(fiz0,tz);
366 fjx0 = _mm_add_pd(fjx0,tx);
367 fjy0 = _mm_add_pd(fjy0,ty);
368 fjz0 = _mm_add_pd(fjz0,tz);
372 /**************************
373 * CALCULATE INTERACTIONS *
374 **************************/
376 if (gmx_mm_any_lt(rsq01,rcutoff2))
379 /* REACTION-FIELD ELECTROSTATICS */
380 velec = _mm_mul_pd(qq01,_mm_sub_pd(_mm_add_pd(rinv01,_mm_mul_pd(krf,rsq01)),crf));
381 felec = _mm_mul_pd(qq01,_mm_sub_pd(_mm_mul_pd(rinv01,rinvsq01),krf2));
383 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
385 /* Update potential sum for this i atom from the interaction with this j atom. */
386 velec = _mm_and_pd(velec,cutoff_mask);
387 velecsum = _mm_add_pd(velecsum,velec);
391 fscal = _mm_and_pd(fscal,cutoff_mask);
393 /* Calculate temporary vectorial force */
394 tx = _mm_mul_pd(fscal,dx01);
395 ty = _mm_mul_pd(fscal,dy01);
396 tz = _mm_mul_pd(fscal,dz01);
398 /* Update vectorial force */
399 fix0 = _mm_add_pd(fix0,tx);
400 fiy0 = _mm_add_pd(fiy0,ty);
401 fiz0 = _mm_add_pd(fiz0,tz);
403 fjx1 = _mm_add_pd(fjx1,tx);
404 fjy1 = _mm_add_pd(fjy1,ty);
405 fjz1 = _mm_add_pd(fjz1,tz);
409 /**************************
410 * CALCULATE INTERACTIONS *
411 **************************/
413 if (gmx_mm_any_lt(rsq02,rcutoff2))
416 /* REACTION-FIELD ELECTROSTATICS */
417 velec = _mm_mul_pd(qq02,_mm_sub_pd(_mm_add_pd(rinv02,_mm_mul_pd(krf,rsq02)),crf));
418 felec = _mm_mul_pd(qq02,_mm_sub_pd(_mm_mul_pd(rinv02,rinvsq02),krf2));
420 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
422 /* Update potential sum for this i atom from the interaction with this j atom. */
423 velec = _mm_and_pd(velec,cutoff_mask);
424 velecsum = _mm_add_pd(velecsum,velec);
428 fscal = _mm_and_pd(fscal,cutoff_mask);
430 /* Calculate temporary vectorial force */
431 tx = _mm_mul_pd(fscal,dx02);
432 ty = _mm_mul_pd(fscal,dy02);
433 tz = _mm_mul_pd(fscal,dz02);
435 /* Update vectorial force */
436 fix0 = _mm_add_pd(fix0,tx);
437 fiy0 = _mm_add_pd(fiy0,ty);
438 fiz0 = _mm_add_pd(fiz0,tz);
440 fjx2 = _mm_add_pd(fjx2,tx);
441 fjy2 = _mm_add_pd(fjy2,ty);
442 fjz2 = _mm_add_pd(fjz2,tz);
446 /**************************
447 * CALCULATE INTERACTIONS *
448 **************************/
450 if (gmx_mm_any_lt(rsq10,rcutoff2))
453 /* REACTION-FIELD ELECTROSTATICS */
454 velec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_add_pd(rinv10,_mm_mul_pd(krf,rsq10)),crf));
455 felec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_mul_pd(rinv10,rinvsq10),krf2));
457 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
459 /* Update potential sum for this i atom from the interaction with this j atom. */
460 velec = _mm_and_pd(velec,cutoff_mask);
461 velecsum = _mm_add_pd(velecsum,velec);
465 fscal = _mm_and_pd(fscal,cutoff_mask);
467 /* Calculate temporary vectorial force */
468 tx = _mm_mul_pd(fscal,dx10);
469 ty = _mm_mul_pd(fscal,dy10);
470 tz = _mm_mul_pd(fscal,dz10);
472 /* Update vectorial force */
473 fix1 = _mm_add_pd(fix1,tx);
474 fiy1 = _mm_add_pd(fiy1,ty);
475 fiz1 = _mm_add_pd(fiz1,tz);
477 fjx0 = _mm_add_pd(fjx0,tx);
478 fjy0 = _mm_add_pd(fjy0,ty);
479 fjz0 = _mm_add_pd(fjz0,tz);
483 /**************************
484 * CALCULATE INTERACTIONS *
485 **************************/
487 if (gmx_mm_any_lt(rsq11,rcutoff2))
490 /* REACTION-FIELD ELECTROSTATICS */
491 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_add_pd(rinv11,_mm_mul_pd(krf,rsq11)),crf));
492 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
494 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
496 /* Update potential sum for this i atom from the interaction with this j atom. */
497 velec = _mm_and_pd(velec,cutoff_mask);
498 velecsum = _mm_add_pd(velecsum,velec);
502 fscal = _mm_and_pd(fscal,cutoff_mask);
504 /* Calculate temporary vectorial force */
505 tx = _mm_mul_pd(fscal,dx11);
506 ty = _mm_mul_pd(fscal,dy11);
507 tz = _mm_mul_pd(fscal,dz11);
509 /* Update vectorial force */
510 fix1 = _mm_add_pd(fix1,tx);
511 fiy1 = _mm_add_pd(fiy1,ty);
512 fiz1 = _mm_add_pd(fiz1,tz);
514 fjx1 = _mm_add_pd(fjx1,tx);
515 fjy1 = _mm_add_pd(fjy1,ty);
516 fjz1 = _mm_add_pd(fjz1,tz);
520 /**************************
521 * CALCULATE INTERACTIONS *
522 **************************/
524 if (gmx_mm_any_lt(rsq12,rcutoff2))
527 /* REACTION-FIELD ELECTROSTATICS */
528 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_add_pd(rinv12,_mm_mul_pd(krf,rsq12)),crf));
529 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
531 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
533 /* Update potential sum for this i atom from the interaction with this j atom. */
534 velec = _mm_and_pd(velec,cutoff_mask);
535 velecsum = _mm_add_pd(velecsum,velec);
539 fscal = _mm_and_pd(fscal,cutoff_mask);
541 /* Calculate temporary vectorial force */
542 tx = _mm_mul_pd(fscal,dx12);
543 ty = _mm_mul_pd(fscal,dy12);
544 tz = _mm_mul_pd(fscal,dz12);
546 /* Update vectorial force */
547 fix1 = _mm_add_pd(fix1,tx);
548 fiy1 = _mm_add_pd(fiy1,ty);
549 fiz1 = _mm_add_pd(fiz1,tz);
551 fjx2 = _mm_add_pd(fjx2,tx);
552 fjy2 = _mm_add_pd(fjy2,ty);
553 fjz2 = _mm_add_pd(fjz2,tz);
557 /**************************
558 * CALCULATE INTERACTIONS *
559 **************************/
561 if (gmx_mm_any_lt(rsq20,rcutoff2))
564 /* REACTION-FIELD ELECTROSTATICS */
565 velec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_add_pd(rinv20,_mm_mul_pd(krf,rsq20)),crf));
566 felec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_mul_pd(rinv20,rinvsq20),krf2));
568 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
570 /* Update potential sum for this i atom from the interaction with this j atom. */
571 velec = _mm_and_pd(velec,cutoff_mask);
572 velecsum = _mm_add_pd(velecsum,velec);
576 fscal = _mm_and_pd(fscal,cutoff_mask);
578 /* Calculate temporary vectorial force */
579 tx = _mm_mul_pd(fscal,dx20);
580 ty = _mm_mul_pd(fscal,dy20);
581 tz = _mm_mul_pd(fscal,dz20);
583 /* Update vectorial force */
584 fix2 = _mm_add_pd(fix2,tx);
585 fiy2 = _mm_add_pd(fiy2,ty);
586 fiz2 = _mm_add_pd(fiz2,tz);
588 fjx0 = _mm_add_pd(fjx0,tx);
589 fjy0 = _mm_add_pd(fjy0,ty);
590 fjz0 = _mm_add_pd(fjz0,tz);
594 /**************************
595 * CALCULATE INTERACTIONS *
596 **************************/
598 if (gmx_mm_any_lt(rsq21,rcutoff2))
601 /* REACTION-FIELD ELECTROSTATICS */
602 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_add_pd(rinv21,_mm_mul_pd(krf,rsq21)),crf));
603 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
605 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
607 /* Update potential sum for this i atom from the interaction with this j atom. */
608 velec = _mm_and_pd(velec,cutoff_mask);
609 velecsum = _mm_add_pd(velecsum,velec);
613 fscal = _mm_and_pd(fscal,cutoff_mask);
615 /* Calculate temporary vectorial force */
616 tx = _mm_mul_pd(fscal,dx21);
617 ty = _mm_mul_pd(fscal,dy21);
618 tz = _mm_mul_pd(fscal,dz21);
620 /* Update vectorial force */
621 fix2 = _mm_add_pd(fix2,tx);
622 fiy2 = _mm_add_pd(fiy2,ty);
623 fiz2 = _mm_add_pd(fiz2,tz);
625 fjx1 = _mm_add_pd(fjx1,tx);
626 fjy1 = _mm_add_pd(fjy1,ty);
627 fjz1 = _mm_add_pd(fjz1,tz);
631 /**************************
632 * CALCULATE INTERACTIONS *
633 **************************/
635 if (gmx_mm_any_lt(rsq22,rcutoff2))
638 /* REACTION-FIELD ELECTROSTATICS */
639 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_add_pd(rinv22,_mm_mul_pd(krf,rsq22)),crf));
640 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
642 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
644 /* Update potential sum for this i atom from the interaction with this j atom. */
645 velec = _mm_and_pd(velec,cutoff_mask);
646 velecsum = _mm_add_pd(velecsum,velec);
650 fscal = _mm_and_pd(fscal,cutoff_mask);
652 /* Calculate temporary vectorial force */
653 tx = _mm_mul_pd(fscal,dx22);
654 ty = _mm_mul_pd(fscal,dy22);
655 tz = _mm_mul_pd(fscal,dz22);
657 /* Update vectorial force */
658 fix2 = _mm_add_pd(fix2,tx);
659 fiy2 = _mm_add_pd(fiy2,ty);
660 fiz2 = _mm_add_pd(fiz2,tz);
662 fjx2 = _mm_add_pd(fjx2,tx);
663 fjy2 = _mm_add_pd(fjy2,ty);
664 fjz2 = _mm_add_pd(fjz2,tz);
668 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
670 /* Inner loop uses 360 flops */
677 j_coord_offsetA = DIM*jnrA;
679 /* load j atom coordinates */
680 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
681 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
683 /* Calculate displacement vector */
684 dx00 = _mm_sub_pd(ix0,jx0);
685 dy00 = _mm_sub_pd(iy0,jy0);
686 dz00 = _mm_sub_pd(iz0,jz0);
687 dx01 = _mm_sub_pd(ix0,jx1);
688 dy01 = _mm_sub_pd(iy0,jy1);
689 dz01 = _mm_sub_pd(iz0,jz1);
690 dx02 = _mm_sub_pd(ix0,jx2);
691 dy02 = _mm_sub_pd(iy0,jy2);
692 dz02 = _mm_sub_pd(iz0,jz2);
693 dx10 = _mm_sub_pd(ix1,jx0);
694 dy10 = _mm_sub_pd(iy1,jy0);
695 dz10 = _mm_sub_pd(iz1,jz0);
696 dx11 = _mm_sub_pd(ix1,jx1);
697 dy11 = _mm_sub_pd(iy1,jy1);
698 dz11 = _mm_sub_pd(iz1,jz1);
699 dx12 = _mm_sub_pd(ix1,jx2);
700 dy12 = _mm_sub_pd(iy1,jy2);
701 dz12 = _mm_sub_pd(iz1,jz2);
702 dx20 = _mm_sub_pd(ix2,jx0);
703 dy20 = _mm_sub_pd(iy2,jy0);
704 dz20 = _mm_sub_pd(iz2,jz0);
705 dx21 = _mm_sub_pd(ix2,jx1);
706 dy21 = _mm_sub_pd(iy2,jy1);
707 dz21 = _mm_sub_pd(iz2,jz1);
708 dx22 = _mm_sub_pd(ix2,jx2);
709 dy22 = _mm_sub_pd(iy2,jy2);
710 dz22 = _mm_sub_pd(iz2,jz2);
712 /* Calculate squared distance and things based on it */
713 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
714 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
715 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
716 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
717 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
718 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
719 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
720 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
721 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
723 rinv00 = gmx_mm_invsqrt_pd(rsq00);
724 rinv01 = gmx_mm_invsqrt_pd(rsq01);
725 rinv02 = gmx_mm_invsqrt_pd(rsq02);
726 rinv10 = gmx_mm_invsqrt_pd(rsq10);
727 rinv11 = gmx_mm_invsqrt_pd(rsq11);
728 rinv12 = gmx_mm_invsqrt_pd(rsq12);
729 rinv20 = gmx_mm_invsqrt_pd(rsq20);
730 rinv21 = gmx_mm_invsqrt_pd(rsq21);
731 rinv22 = gmx_mm_invsqrt_pd(rsq22);
733 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
734 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
735 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
736 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
737 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
738 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
739 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
740 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
741 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
743 fjx0 = _mm_setzero_pd();
744 fjy0 = _mm_setzero_pd();
745 fjz0 = _mm_setzero_pd();
746 fjx1 = _mm_setzero_pd();
747 fjy1 = _mm_setzero_pd();
748 fjz1 = _mm_setzero_pd();
749 fjx2 = _mm_setzero_pd();
750 fjy2 = _mm_setzero_pd();
751 fjz2 = _mm_setzero_pd();
753 /**************************
754 * CALCULATE INTERACTIONS *
755 **************************/
757 if (gmx_mm_any_lt(rsq00,rcutoff2))
760 r00 = _mm_mul_pd(rsq00,rinv00);
762 /* Calculate table index by multiplying r with table scale and truncate to integer */
763 rt = _mm_mul_pd(r00,vftabscale);
764 vfitab = _mm_cvttpd_epi32(rt);
765 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
766 vfitab = _mm_slli_epi32(vfitab,3);
768 /* REACTION-FIELD ELECTROSTATICS */
769 velec = _mm_mul_pd(qq00,_mm_sub_pd(_mm_add_pd(rinv00,_mm_mul_pd(krf,rsq00)),crf));
770 felec = _mm_mul_pd(qq00,_mm_sub_pd(_mm_mul_pd(rinv00,rinvsq00),krf2));
772 /* CUBIC SPLINE TABLE DISPERSION */
773 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
774 F = _mm_setzero_pd();
775 GMX_MM_TRANSPOSE2_PD(Y,F);
776 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
777 H = _mm_setzero_pd();
778 GMX_MM_TRANSPOSE2_PD(G,H);
779 Heps = _mm_mul_pd(vfeps,H);
780 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
781 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
782 vvdw6 = _mm_mul_pd(c6_00,VV);
783 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
784 fvdw6 = _mm_mul_pd(c6_00,FF);
786 /* CUBIC SPLINE TABLE REPULSION */
787 vfitab = _mm_add_epi32(vfitab,ifour);
788 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
789 F = _mm_setzero_pd();
790 GMX_MM_TRANSPOSE2_PD(Y,F);
791 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
792 H = _mm_setzero_pd();
793 GMX_MM_TRANSPOSE2_PD(G,H);
794 Heps = _mm_mul_pd(vfeps,H);
795 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
796 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
797 vvdw12 = _mm_mul_pd(c12_00,VV);
798 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
799 fvdw12 = _mm_mul_pd(c12_00,FF);
800 vvdw = _mm_add_pd(vvdw12,vvdw6);
801 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
803 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
805 /* Update potential sum for this i atom from the interaction with this j atom. */
806 velec = _mm_and_pd(velec,cutoff_mask);
807 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
808 velecsum = _mm_add_pd(velecsum,velec);
809 vvdw = _mm_and_pd(vvdw,cutoff_mask);
810 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
811 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
813 fscal = _mm_add_pd(felec,fvdw);
815 fscal = _mm_and_pd(fscal,cutoff_mask);
817 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
819 /* Calculate temporary vectorial force */
820 tx = _mm_mul_pd(fscal,dx00);
821 ty = _mm_mul_pd(fscal,dy00);
822 tz = _mm_mul_pd(fscal,dz00);
824 /* Update vectorial force */
825 fix0 = _mm_add_pd(fix0,tx);
826 fiy0 = _mm_add_pd(fiy0,ty);
827 fiz0 = _mm_add_pd(fiz0,tz);
829 fjx0 = _mm_add_pd(fjx0,tx);
830 fjy0 = _mm_add_pd(fjy0,ty);
831 fjz0 = _mm_add_pd(fjz0,tz);
835 /**************************
836 * CALCULATE INTERACTIONS *
837 **************************/
839 if (gmx_mm_any_lt(rsq01,rcutoff2))
842 /* REACTION-FIELD ELECTROSTATICS */
843 velec = _mm_mul_pd(qq01,_mm_sub_pd(_mm_add_pd(rinv01,_mm_mul_pd(krf,rsq01)),crf));
844 felec = _mm_mul_pd(qq01,_mm_sub_pd(_mm_mul_pd(rinv01,rinvsq01),krf2));
846 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
848 /* Update potential sum for this i atom from the interaction with this j atom. */
849 velec = _mm_and_pd(velec,cutoff_mask);
850 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
851 velecsum = _mm_add_pd(velecsum,velec);
855 fscal = _mm_and_pd(fscal,cutoff_mask);
857 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
859 /* Calculate temporary vectorial force */
860 tx = _mm_mul_pd(fscal,dx01);
861 ty = _mm_mul_pd(fscal,dy01);
862 tz = _mm_mul_pd(fscal,dz01);
864 /* Update vectorial force */
865 fix0 = _mm_add_pd(fix0,tx);
866 fiy0 = _mm_add_pd(fiy0,ty);
867 fiz0 = _mm_add_pd(fiz0,tz);
869 fjx1 = _mm_add_pd(fjx1,tx);
870 fjy1 = _mm_add_pd(fjy1,ty);
871 fjz1 = _mm_add_pd(fjz1,tz);
875 /**************************
876 * CALCULATE INTERACTIONS *
877 **************************/
879 if (gmx_mm_any_lt(rsq02,rcutoff2))
882 /* REACTION-FIELD ELECTROSTATICS */
883 velec = _mm_mul_pd(qq02,_mm_sub_pd(_mm_add_pd(rinv02,_mm_mul_pd(krf,rsq02)),crf));
884 felec = _mm_mul_pd(qq02,_mm_sub_pd(_mm_mul_pd(rinv02,rinvsq02),krf2));
886 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
888 /* Update potential sum for this i atom from the interaction with this j atom. */
889 velec = _mm_and_pd(velec,cutoff_mask);
890 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
891 velecsum = _mm_add_pd(velecsum,velec);
895 fscal = _mm_and_pd(fscal,cutoff_mask);
897 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
899 /* Calculate temporary vectorial force */
900 tx = _mm_mul_pd(fscal,dx02);
901 ty = _mm_mul_pd(fscal,dy02);
902 tz = _mm_mul_pd(fscal,dz02);
904 /* Update vectorial force */
905 fix0 = _mm_add_pd(fix0,tx);
906 fiy0 = _mm_add_pd(fiy0,ty);
907 fiz0 = _mm_add_pd(fiz0,tz);
909 fjx2 = _mm_add_pd(fjx2,tx);
910 fjy2 = _mm_add_pd(fjy2,ty);
911 fjz2 = _mm_add_pd(fjz2,tz);
915 /**************************
916 * CALCULATE INTERACTIONS *
917 **************************/
919 if (gmx_mm_any_lt(rsq10,rcutoff2))
922 /* REACTION-FIELD ELECTROSTATICS */
923 velec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_add_pd(rinv10,_mm_mul_pd(krf,rsq10)),crf));
924 felec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_mul_pd(rinv10,rinvsq10),krf2));
926 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
928 /* Update potential sum for this i atom from the interaction with this j atom. */
929 velec = _mm_and_pd(velec,cutoff_mask);
930 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
931 velecsum = _mm_add_pd(velecsum,velec);
935 fscal = _mm_and_pd(fscal,cutoff_mask);
937 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
939 /* Calculate temporary vectorial force */
940 tx = _mm_mul_pd(fscal,dx10);
941 ty = _mm_mul_pd(fscal,dy10);
942 tz = _mm_mul_pd(fscal,dz10);
944 /* Update vectorial force */
945 fix1 = _mm_add_pd(fix1,tx);
946 fiy1 = _mm_add_pd(fiy1,ty);
947 fiz1 = _mm_add_pd(fiz1,tz);
949 fjx0 = _mm_add_pd(fjx0,tx);
950 fjy0 = _mm_add_pd(fjy0,ty);
951 fjz0 = _mm_add_pd(fjz0,tz);
955 /**************************
956 * CALCULATE INTERACTIONS *
957 **************************/
959 if (gmx_mm_any_lt(rsq11,rcutoff2))
962 /* REACTION-FIELD ELECTROSTATICS */
963 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_add_pd(rinv11,_mm_mul_pd(krf,rsq11)),crf));
964 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
966 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
968 /* Update potential sum for this i atom from the interaction with this j atom. */
969 velec = _mm_and_pd(velec,cutoff_mask);
970 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
971 velecsum = _mm_add_pd(velecsum,velec);
975 fscal = _mm_and_pd(fscal,cutoff_mask);
977 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
979 /* Calculate temporary vectorial force */
980 tx = _mm_mul_pd(fscal,dx11);
981 ty = _mm_mul_pd(fscal,dy11);
982 tz = _mm_mul_pd(fscal,dz11);
984 /* Update vectorial force */
985 fix1 = _mm_add_pd(fix1,tx);
986 fiy1 = _mm_add_pd(fiy1,ty);
987 fiz1 = _mm_add_pd(fiz1,tz);
989 fjx1 = _mm_add_pd(fjx1,tx);
990 fjy1 = _mm_add_pd(fjy1,ty);
991 fjz1 = _mm_add_pd(fjz1,tz);
995 /**************************
996 * CALCULATE INTERACTIONS *
997 **************************/
999 if (gmx_mm_any_lt(rsq12,rcutoff2))
1002 /* REACTION-FIELD ELECTROSTATICS */
1003 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_add_pd(rinv12,_mm_mul_pd(krf,rsq12)),crf));
1004 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
1006 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1008 /* Update potential sum for this i atom from the interaction with this j atom. */
1009 velec = _mm_and_pd(velec,cutoff_mask);
1010 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1011 velecsum = _mm_add_pd(velecsum,velec);
1015 fscal = _mm_and_pd(fscal,cutoff_mask);
1017 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1019 /* Calculate temporary vectorial force */
1020 tx = _mm_mul_pd(fscal,dx12);
1021 ty = _mm_mul_pd(fscal,dy12);
1022 tz = _mm_mul_pd(fscal,dz12);
1024 /* Update vectorial force */
1025 fix1 = _mm_add_pd(fix1,tx);
1026 fiy1 = _mm_add_pd(fiy1,ty);
1027 fiz1 = _mm_add_pd(fiz1,tz);
1029 fjx2 = _mm_add_pd(fjx2,tx);
1030 fjy2 = _mm_add_pd(fjy2,ty);
1031 fjz2 = _mm_add_pd(fjz2,tz);
1035 /**************************
1036 * CALCULATE INTERACTIONS *
1037 **************************/
1039 if (gmx_mm_any_lt(rsq20,rcutoff2))
1042 /* REACTION-FIELD ELECTROSTATICS */
1043 velec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_add_pd(rinv20,_mm_mul_pd(krf,rsq20)),crf));
1044 felec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_mul_pd(rinv20,rinvsq20),krf2));
1046 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
1048 /* Update potential sum for this i atom from the interaction with this j atom. */
1049 velec = _mm_and_pd(velec,cutoff_mask);
1050 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1051 velecsum = _mm_add_pd(velecsum,velec);
1055 fscal = _mm_and_pd(fscal,cutoff_mask);
1057 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1059 /* Calculate temporary vectorial force */
1060 tx = _mm_mul_pd(fscal,dx20);
1061 ty = _mm_mul_pd(fscal,dy20);
1062 tz = _mm_mul_pd(fscal,dz20);
1064 /* Update vectorial force */
1065 fix2 = _mm_add_pd(fix2,tx);
1066 fiy2 = _mm_add_pd(fiy2,ty);
1067 fiz2 = _mm_add_pd(fiz2,tz);
1069 fjx0 = _mm_add_pd(fjx0,tx);
1070 fjy0 = _mm_add_pd(fjy0,ty);
1071 fjz0 = _mm_add_pd(fjz0,tz);
1075 /**************************
1076 * CALCULATE INTERACTIONS *
1077 **************************/
1079 if (gmx_mm_any_lt(rsq21,rcutoff2))
1082 /* REACTION-FIELD ELECTROSTATICS */
1083 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_add_pd(rinv21,_mm_mul_pd(krf,rsq21)),crf));
1084 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
1086 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1088 /* Update potential sum for this i atom from the interaction with this j atom. */
1089 velec = _mm_and_pd(velec,cutoff_mask);
1090 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1091 velecsum = _mm_add_pd(velecsum,velec);
1095 fscal = _mm_and_pd(fscal,cutoff_mask);
1097 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1099 /* Calculate temporary vectorial force */
1100 tx = _mm_mul_pd(fscal,dx21);
1101 ty = _mm_mul_pd(fscal,dy21);
1102 tz = _mm_mul_pd(fscal,dz21);
1104 /* Update vectorial force */
1105 fix2 = _mm_add_pd(fix2,tx);
1106 fiy2 = _mm_add_pd(fiy2,ty);
1107 fiz2 = _mm_add_pd(fiz2,tz);
1109 fjx1 = _mm_add_pd(fjx1,tx);
1110 fjy1 = _mm_add_pd(fjy1,ty);
1111 fjz1 = _mm_add_pd(fjz1,tz);
1115 /**************************
1116 * CALCULATE INTERACTIONS *
1117 **************************/
1119 if (gmx_mm_any_lt(rsq22,rcutoff2))
1122 /* REACTION-FIELD ELECTROSTATICS */
1123 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_add_pd(rinv22,_mm_mul_pd(krf,rsq22)),crf));
1124 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
1126 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1128 /* Update potential sum for this i atom from the interaction with this j atom. */
1129 velec = _mm_and_pd(velec,cutoff_mask);
1130 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1131 velecsum = _mm_add_pd(velecsum,velec);
1135 fscal = _mm_and_pd(fscal,cutoff_mask);
1137 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1139 /* Calculate temporary vectorial force */
1140 tx = _mm_mul_pd(fscal,dx22);
1141 ty = _mm_mul_pd(fscal,dy22);
1142 tz = _mm_mul_pd(fscal,dz22);
1144 /* Update vectorial force */
1145 fix2 = _mm_add_pd(fix2,tx);
1146 fiy2 = _mm_add_pd(fiy2,ty);
1147 fiz2 = _mm_add_pd(fiz2,tz);
1149 fjx2 = _mm_add_pd(fjx2,tx);
1150 fjy2 = _mm_add_pd(fjy2,ty);
1151 fjz2 = _mm_add_pd(fjz2,tz);
1155 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1157 /* Inner loop uses 360 flops */
1160 /* End of innermost loop */
1162 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1163 f+i_coord_offset,fshift+i_shift_offset);
1166 /* Update potential energies */
1167 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1168 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1170 /* Increment number of inner iterations */
1171 inneriter += j_index_end - j_index_start;
1173 /* Outer loop uses 20 flops */
1176 /* Increment number of outer iterations */
1179 /* Update outer/inner flops */
1181 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*360);
1184 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sse2_double
1185 * Electrostatics interaction: ReactionField
1186 * VdW interaction: CubicSplineTable
1187 * Geometry: Water3-Water3
1188 * Calculate force/pot: Force
1191 nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sse2_double
1192 (t_nblist * gmx_restrict nlist,
1193 rvec * gmx_restrict xx,
1194 rvec * gmx_restrict ff,
1195 t_forcerec * gmx_restrict fr,
1196 t_mdatoms * gmx_restrict mdatoms,
1197 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1198 t_nrnb * gmx_restrict nrnb)
1200 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1201 * just 0 for non-waters.
1202 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1203 * jnr indices corresponding to data put in the four positions in the SIMD register.
1205 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1206 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1208 int j_coord_offsetA,j_coord_offsetB;
1209 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1210 real rcutoff_scalar;
1211 real *shiftvec,*fshift,*x,*f;
1212 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1214 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1216 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1218 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1219 int vdwjidx0A,vdwjidx0B;
1220 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1221 int vdwjidx1A,vdwjidx1B;
1222 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1223 int vdwjidx2A,vdwjidx2B;
1224 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1225 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1226 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1227 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1228 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1229 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1230 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1231 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1232 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1233 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1234 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1237 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1240 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1241 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1243 __m128i ifour = _mm_set1_epi32(4);
1244 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1246 __m128d dummy_mask,cutoff_mask;
1247 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1248 __m128d one = _mm_set1_pd(1.0);
1249 __m128d two = _mm_set1_pd(2.0);
1255 jindex = nlist->jindex;
1257 shiftidx = nlist->shift;
1259 shiftvec = fr->shift_vec[0];
1260 fshift = fr->fshift[0];
1261 facel = _mm_set1_pd(fr->epsfac);
1262 charge = mdatoms->chargeA;
1263 krf = _mm_set1_pd(fr->ic->k_rf);
1264 krf2 = _mm_set1_pd(fr->ic->k_rf*2.0);
1265 crf = _mm_set1_pd(fr->ic->c_rf);
1266 nvdwtype = fr->ntype;
1267 vdwparam = fr->nbfp;
1268 vdwtype = mdatoms->typeA;
1270 vftab = kernel_data->table_vdw->data;
1271 vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale);
1273 /* Setup water-specific parameters */
1274 inr = nlist->iinr[0];
1275 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1276 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1277 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1278 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1280 jq0 = _mm_set1_pd(charge[inr+0]);
1281 jq1 = _mm_set1_pd(charge[inr+1]);
1282 jq2 = _mm_set1_pd(charge[inr+2]);
1283 vdwjidx0A = 2*vdwtype[inr+0];
1284 qq00 = _mm_mul_pd(iq0,jq0);
1285 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1286 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1287 qq01 = _mm_mul_pd(iq0,jq1);
1288 qq02 = _mm_mul_pd(iq0,jq2);
1289 qq10 = _mm_mul_pd(iq1,jq0);
1290 qq11 = _mm_mul_pd(iq1,jq1);
1291 qq12 = _mm_mul_pd(iq1,jq2);
1292 qq20 = _mm_mul_pd(iq2,jq0);
1293 qq21 = _mm_mul_pd(iq2,jq1);
1294 qq22 = _mm_mul_pd(iq2,jq2);
1296 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1297 rcutoff_scalar = fr->rcoulomb;
1298 rcutoff = _mm_set1_pd(rcutoff_scalar);
1299 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
1301 /* Avoid stupid compiler warnings */
1303 j_coord_offsetA = 0;
1304 j_coord_offsetB = 0;
1309 /* Start outer loop over neighborlists */
1310 for(iidx=0; iidx<nri; iidx++)
1312 /* Load shift vector for this list */
1313 i_shift_offset = DIM*shiftidx[iidx];
1315 /* Load limits for loop over neighbors */
1316 j_index_start = jindex[iidx];
1317 j_index_end = jindex[iidx+1];
1319 /* Get outer coordinate index */
1321 i_coord_offset = DIM*inr;
1323 /* Load i particle coords and add shift vector */
1324 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1325 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1327 fix0 = _mm_setzero_pd();
1328 fiy0 = _mm_setzero_pd();
1329 fiz0 = _mm_setzero_pd();
1330 fix1 = _mm_setzero_pd();
1331 fiy1 = _mm_setzero_pd();
1332 fiz1 = _mm_setzero_pd();
1333 fix2 = _mm_setzero_pd();
1334 fiy2 = _mm_setzero_pd();
1335 fiz2 = _mm_setzero_pd();
1337 /* Start inner kernel loop */
1338 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1341 /* Get j neighbor index, and coordinate index */
1343 jnrB = jjnr[jidx+1];
1344 j_coord_offsetA = DIM*jnrA;
1345 j_coord_offsetB = DIM*jnrB;
1347 /* load j atom coordinates */
1348 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1349 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1351 /* Calculate displacement vector */
1352 dx00 = _mm_sub_pd(ix0,jx0);
1353 dy00 = _mm_sub_pd(iy0,jy0);
1354 dz00 = _mm_sub_pd(iz0,jz0);
1355 dx01 = _mm_sub_pd(ix0,jx1);
1356 dy01 = _mm_sub_pd(iy0,jy1);
1357 dz01 = _mm_sub_pd(iz0,jz1);
1358 dx02 = _mm_sub_pd(ix0,jx2);
1359 dy02 = _mm_sub_pd(iy0,jy2);
1360 dz02 = _mm_sub_pd(iz0,jz2);
1361 dx10 = _mm_sub_pd(ix1,jx0);
1362 dy10 = _mm_sub_pd(iy1,jy0);
1363 dz10 = _mm_sub_pd(iz1,jz0);
1364 dx11 = _mm_sub_pd(ix1,jx1);
1365 dy11 = _mm_sub_pd(iy1,jy1);
1366 dz11 = _mm_sub_pd(iz1,jz1);
1367 dx12 = _mm_sub_pd(ix1,jx2);
1368 dy12 = _mm_sub_pd(iy1,jy2);
1369 dz12 = _mm_sub_pd(iz1,jz2);
1370 dx20 = _mm_sub_pd(ix2,jx0);
1371 dy20 = _mm_sub_pd(iy2,jy0);
1372 dz20 = _mm_sub_pd(iz2,jz0);
1373 dx21 = _mm_sub_pd(ix2,jx1);
1374 dy21 = _mm_sub_pd(iy2,jy1);
1375 dz21 = _mm_sub_pd(iz2,jz1);
1376 dx22 = _mm_sub_pd(ix2,jx2);
1377 dy22 = _mm_sub_pd(iy2,jy2);
1378 dz22 = _mm_sub_pd(iz2,jz2);
1380 /* Calculate squared distance and things based on it */
1381 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1382 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1383 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1384 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1385 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1386 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1387 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1388 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1389 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1391 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1392 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1393 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1394 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1395 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1396 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1397 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1398 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1399 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1401 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1402 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
1403 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
1404 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1405 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1406 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1407 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1408 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1409 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1411 fjx0 = _mm_setzero_pd();
1412 fjy0 = _mm_setzero_pd();
1413 fjz0 = _mm_setzero_pd();
1414 fjx1 = _mm_setzero_pd();
1415 fjy1 = _mm_setzero_pd();
1416 fjz1 = _mm_setzero_pd();
1417 fjx2 = _mm_setzero_pd();
1418 fjy2 = _mm_setzero_pd();
1419 fjz2 = _mm_setzero_pd();
1421 /**************************
1422 * CALCULATE INTERACTIONS *
1423 **************************/
1425 if (gmx_mm_any_lt(rsq00,rcutoff2))
1428 r00 = _mm_mul_pd(rsq00,rinv00);
1430 /* Calculate table index by multiplying r with table scale and truncate to integer */
1431 rt = _mm_mul_pd(r00,vftabscale);
1432 vfitab = _mm_cvttpd_epi32(rt);
1433 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1434 vfitab = _mm_slli_epi32(vfitab,3);
1436 /* REACTION-FIELD ELECTROSTATICS */
1437 felec = _mm_mul_pd(qq00,_mm_sub_pd(_mm_mul_pd(rinv00,rinvsq00),krf2));
1439 /* CUBIC SPLINE TABLE DISPERSION */
1440 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1441 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1442 GMX_MM_TRANSPOSE2_PD(Y,F);
1443 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1444 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1445 GMX_MM_TRANSPOSE2_PD(G,H);
1446 Heps = _mm_mul_pd(vfeps,H);
1447 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1448 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1449 fvdw6 = _mm_mul_pd(c6_00,FF);
1451 /* CUBIC SPLINE TABLE REPULSION */
1452 vfitab = _mm_add_epi32(vfitab,ifour);
1453 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1454 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1455 GMX_MM_TRANSPOSE2_PD(Y,F);
1456 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1457 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1458 GMX_MM_TRANSPOSE2_PD(G,H);
1459 Heps = _mm_mul_pd(vfeps,H);
1460 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1461 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1462 fvdw12 = _mm_mul_pd(c12_00,FF);
1463 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1465 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1467 fscal = _mm_add_pd(felec,fvdw);
1469 fscal = _mm_and_pd(fscal,cutoff_mask);
1471 /* Calculate temporary vectorial force */
1472 tx = _mm_mul_pd(fscal,dx00);
1473 ty = _mm_mul_pd(fscal,dy00);
1474 tz = _mm_mul_pd(fscal,dz00);
1476 /* Update vectorial force */
1477 fix0 = _mm_add_pd(fix0,tx);
1478 fiy0 = _mm_add_pd(fiy0,ty);
1479 fiz0 = _mm_add_pd(fiz0,tz);
1481 fjx0 = _mm_add_pd(fjx0,tx);
1482 fjy0 = _mm_add_pd(fjy0,ty);
1483 fjz0 = _mm_add_pd(fjz0,tz);
1487 /**************************
1488 * CALCULATE INTERACTIONS *
1489 **************************/
1491 if (gmx_mm_any_lt(rsq01,rcutoff2))
1494 /* REACTION-FIELD ELECTROSTATICS */
1495 felec = _mm_mul_pd(qq01,_mm_sub_pd(_mm_mul_pd(rinv01,rinvsq01),krf2));
1497 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
1501 fscal = _mm_and_pd(fscal,cutoff_mask);
1503 /* Calculate temporary vectorial force */
1504 tx = _mm_mul_pd(fscal,dx01);
1505 ty = _mm_mul_pd(fscal,dy01);
1506 tz = _mm_mul_pd(fscal,dz01);
1508 /* Update vectorial force */
1509 fix0 = _mm_add_pd(fix0,tx);
1510 fiy0 = _mm_add_pd(fiy0,ty);
1511 fiz0 = _mm_add_pd(fiz0,tz);
1513 fjx1 = _mm_add_pd(fjx1,tx);
1514 fjy1 = _mm_add_pd(fjy1,ty);
1515 fjz1 = _mm_add_pd(fjz1,tz);
1519 /**************************
1520 * CALCULATE INTERACTIONS *
1521 **************************/
1523 if (gmx_mm_any_lt(rsq02,rcutoff2))
1526 /* REACTION-FIELD ELECTROSTATICS */
1527 felec = _mm_mul_pd(qq02,_mm_sub_pd(_mm_mul_pd(rinv02,rinvsq02),krf2));
1529 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
1533 fscal = _mm_and_pd(fscal,cutoff_mask);
1535 /* Calculate temporary vectorial force */
1536 tx = _mm_mul_pd(fscal,dx02);
1537 ty = _mm_mul_pd(fscal,dy02);
1538 tz = _mm_mul_pd(fscal,dz02);
1540 /* Update vectorial force */
1541 fix0 = _mm_add_pd(fix0,tx);
1542 fiy0 = _mm_add_pd(fiy0,ty);
1543 fiz0 = _mm_add_pd(fiz0,tz);
1545 fjx2 = _mm_add_pd(fjx2,tx);
1546 fjy2 = _mm_add_pd(fjy2,ty);
1547 fjz2 = _mm_add_pd(fjz2,tz);
1551 /**************************
1552 * CALCULATE INTERACTIONS *
1553 **************************/
1555 if (gmx_mm_any_lt(rsq10,rcutoff2))
1558 /* REACTION-FIELD ELECTROSTATICS */
1559 felec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_mul_pd(rinv10,rinvsq10),krf2));
1561 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
1565 fscal = _mm_and_pd(fscal,cutoff_mask);
1567 /* Calculate temporary vectorial force */
1568 tx = _mm_mul_pd(fscal,dx10);
1569 ty = _mm_mul_pd(fscal,dy10);
1570 tz = _mm_mul_pd(fscal,dz10);
1572 /* Update vectorial force */
1573 fix1 = _mm_add_pd(fix1,tx);
1574 fiy1 = _mm_add_pd(fiy1,ty);
1575 fiz1 = _mm_add_pd(fiz1,tz);
1577 fjx0 = _mm_add_pd(fjx0,tx);
1578 fjy0 = _mm_add_pd(fjy0,ty);
1579 fjz0 = _mm_add_pd(fjz0,tz);
1583 /**************************
1584 * CALCULATE INTERACTIONS *
1585 **************************/
1587 if (gmx_mm_any_lt(rsq11,rcutoff2))
1590 /* REACTION-FIELD ELECTROSTATICS */
1591 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
1593 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1597 fscal = _mm_and_pd(fscal,cutoff_mask);
1599 /* Calculate temporary vectorial force */
1600 tx = _mm_mul_pd(fscal,dx11);
1601 ty = _mm_mul_pd(fscal,dy11);
1602 tz = _mm_mul_pd(fscal,dz11);
1604 /* Update vectorial force */
1605 fix1 = _mm_add_pd(fix1,tx);
1606 fiy1 = _mm_add_pd(fiy1,ty);
1607 fiz1 = _mm_add_pd(fiz1,tz);
1609 fjx1 = _mm_add_pd(fjx1,tx);
1610 fjy1 = _mm_add_pd(fjy1,ty);
1611 fjz1 = _mm_add_pd(fjz1,tz);
1615 /**************************
1616 * CALCULATE INTERACTIONS *
1617 **************************/
1619 if (gmx_mm_any_lt(rsq12,rcutoff2))
1622 /* REACTION-FIELD ELECTROSTATICS */
1623 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
1625 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1629 fscal = _mm_and_pd(fscal,cutoff_mask);
1631 /* Calculate temporary vectorial force */
1632 tx = _mm_mul_pd(fscal,dx12);
1633 ty = _mm_mul_pd(fscal,dy12);
1634 tz = _mm_mul_pd(fscal,dz12);
1636 /* Update vectorial force */
1637 fix1 = _mm_add_pd(fix1,tx);
1638 fiy1 = _mm_add_pd(fiy1,ty);
1639 fiz1 = _mm_add_pd(fiz1,tz);
1641 fjx2 = _mm_add_pd(fjx2,tx);
1642 fjy2 = _mm_add_pd(fjy2,ty);
1643 fjz2 = _mm_add_pd(fjz2,tz);
1647 /**************************
1648 * CALCULATE INTERACTIONS *
1649 **************************/
1651 if (gmx_mm_any_lt(rsq20,rcutoff2))
1654 /* REACTION-FIELD ELECTROSTATICS */
1655 felec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_mul_pd(rinv20,rinvsq20),krf2));
1657 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
1661 fscal = _mm_and_pd(fscal,cutoff_mask);
1663 /* Calculate temporary vectorial force */
1664 tx = _mm_mul_pd(fscal,dx20);
1665 ty = _mm_mul_pd(fscal,dy20);
1666 tz = _mm_mul_pd(fscal,dz20);
1668 /* Update vectorial force */
1669 fix2 = _mm_add_pd(fix2,tx);
1670 fiy2 = _mm_add_pd(fiy2,ty);
1671 fiz2 = _mm_add_pd(fiz2,tz);
1673 fjx0 = _mm_add_pd(fjx0,tx);
1674 fjy0 = _mm_add_pd(fjy0,ty);
1675 fjz0 = _mm_add_pd(fjz0,tz);
1679 /**************************
1680 * CALCULATE INTERACTIONS *
1681 **************************/
1683 if (gmx_mm_any_lt(rsq21,rcutoff2))
1686 /* REACTION-FIELD ELECTROSTATICS */
1687 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
1689 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1693 fscal = _mm_and_pd(fscal,cutoff_mask);
1695 /* Calculate temporary vectorial force */
1696 tx = _mm_mul_pd(fscal,dx21);
1697 ty = _mm_mul_pd(fscal,dy21);
1698 tz = _mm_mul_pd(fscal,dz21);
1700 /* Update vectorial force */
1701 fix2 = _mm_add_pd(fix2,tx);
1702 fiy2 = _mm_add_pd(fiy2,ty);
1703 fiz2 = _mm_add_pd(fiz2,tz);
1705 fjx1 = _mm_add_pd(fjx1,tx);
1706 fjy1 = _mm_add_pd(fjy1,ty);
1707 fjz1 = _mm_add_pd(fjz1,tz);
1711 /**************************
1712 * CALCULATE INTERACTIONS *
1713 **************************/
1715 if (gmx_mm_any_lt(rsq22,rcutoff2))
1718 /* REACTION-FIELD ELECTROSTATICS */
1719 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
1721 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1725 fscal = _mm_and_pd(fscal,cutoff_mask);
1727 /* Calculate temporary vectorial force */
1728 tx = _mm_mul_pd(fscal,dx22);
1729 ty = _mm_mul_pd(fscal,dy22);
1730 tz = _mm_mul_pd(fscal,dz22);
1732 /* Update vectorial force */
1733 fix2 = _mm_add_pd(fix2,tx);
1734 fiy2 = _mm_add_pd(fiy2,ty);
1735 fiz2 = _mm_add_pd(fiz2,tz);
1737 fjx2 = _mm_add_pd(fjx2,tx);
1738 fjy2 = _mm_add_pd(fjy2,ty);
1739 fjz2 = _mm_add_pd(fjz2,tz);
1743 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1745 /* Inner loop uses 297 flops */
1748 if(jidx<j_index_end)
1752 j_coord_offsetA = DIM*jnrA;
1754 /* load j atom coordinates */
1755 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1756 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1758 /* Calculate displacement vector */
1759 dx00 = _mm_sub_pd(ix0,jx0);
1760 dy00 = _mm_sub_pd(iy0,jy0);
1761 dz00 = _mm_sub_pd(iz0,jz0);
1762 dx01 = _mm_sub_pd(ix0,jx1);
1763 dy01 = _mm_sub_pd(iy0,jy1);
1764 dz01 = _mm_sub_pd(iz0,jz1);
1765 dx02 = _mm_sub_pd(ix0,jx2);
1766 dy02 = _mm_sub_pd(iy0,jy2);
1767 dz02 = _mm_sub_pd(iz0,jz2);
1768 dx10 = _mm_sub_pd(ix1,jx0);
1769 dy10 = _mm_sub_pd(iy1,jy0);
1770 dz10 = _mm_sub_pd(iz1,jz0);
1771 dx11 = _mm_sub_pd(ix1,jx1);
1772 dy11 = _mm_sub_pd(iy1,jy1);
1773 dz11 = _mm_sub_pd(iz1,jz1);
1774 dx12 = _mm_sub_pd(ix1,jx2);
1775 dy12 = _mm_sub_pd(iy1,jy2);
1776 dz12 = _mm_sub_pd(iz1,jz2);
1777 dx20 = _mm_sub_pd(ix2,jx0);
1778 dy20 = _mm_sub_pd(iy2,jy0);
1779 dz20 = _mm_sub_pd(iz2,jz0);
1780 dx21 = _mm_sub_pd(ix2,jx1);
1781 dy21 = _mm_sub_pd(iy2,jy1);
1782 dz21 = _mm_sub_pd(iz2,jz1);
1783 dx22 = _mm_sub_pd(ix2,jx2);
1784 dy22 = _mm_sub_pd(iy2,jy2);
1785 dz22 = _mm_sub_pd(iz2,jz2);
1787 /* Calculate squared distance and things based on it */
1788 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1789 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1790 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1791 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1792 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1793 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1794 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1795 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1796 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1798 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1799 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1800 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1801 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1802 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1803 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1804 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1805 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1806 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1808 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1809 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
1810 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
1811 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1812 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1813 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1814 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1815 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1816 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1818 fjx0 = _mm_setzero_pd();
1819 fjy0 = _mm_setzero_pd();
1820 fjz0 = _mm_setzero_pd();
1821 fjx1 = _mm_setzero_pd();
1822 fjy1 = _mm_setzero_pd();
1823 fjz1 = _mm_setzero_pd();
1824 fjx2 = _mm_setzero_pd();
1825 fjy2 = _mm_setzero_pd();
1826 fjz2 = _mm_setzero_pd();
1828 /**************************
1829 * CALCULATE INTERACTIONS *
1830 **************************/
1832 if (gmx_mm_any_lt(rsq00,rcutoff2))
1835 r00 = _mm_mul_pd(rsq00,rinv00);
1837 /* Calculate table index by multiplying r with table scale and truncate to integer */
1838 rt = _mm_mul_pd(r00,vftabscale);
1839 vfitab = _mm_cvttpd_epi32(rt);
1840 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1841 vfitab = _mm_slli_epi32(vfitab,3);
1843 /* REACTION-FIELD ELECTROSTATICS */
1844 felec = _mm_mul_pd(qq00,_mm_sub_pd(_mm_mul_pd(rinv00,rinvsq00),krf2));
1846 /* CUBIC SPLINE TABLE DISPERSION */
1847 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1848 F = _mm_setzero_pd();
1849 GMX_MM_TRANSPOSE2_PD(Y,F);
1850 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1851 H = _mm_setzero_pd();
1852 GMX_MM_TRANSPOSE2_PD(G,H);
1853 Heps = _mm_mul_pd(vfeps,H);
1854 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1855 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1856 fvdw6 = _mm_mul_pd(c6_00,FF);
1858 /* CUBIC SPLINE TABLE REPULSION */
1859 vfitab = _mm_add_epi32(vfitab,ifour);
1860 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1861 F = _mm_setzero_pd();
1862 GMX_MM_TRANSPOSE2_PD(Y,F);
1863 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1864 H = _mm_setzero_pd();
1865 GMX_MM_TRANSPOSE2_PD(G,H);
1866 Heps = _mm_mul_pd(vfeps,H);
1867 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1868 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1869 fvdw12 = _mm_mul_pd(c12_00,FF);
1870 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1872 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1874 fscal = _mm_add_pd(felec,fvdw);
1876 fscal = _mm_and_pd(fscal,cutoff_mask);
1878 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1880 /* Calculate temporary vectorial force */
1881 tx = _mm_mul_pd(fscal,dx00);
1882 ty = _mm_mul_pd(fscal,dy00);
1883 tz = _mm_mul_pd(fscal,dz00);
1885 /* Update vectorial force */
1886 fix0 = _mm_add_pd(fix0,tx);
1887 fiy0 = _mm_add_pd(fiy0,ty);
1888 fiz0 = _mm_add_pd(fiz0,tz);
1890 fjx0 = _mm_add_pd(fjx0,tx);
1891 fjy0 = _mm_add_pd(fjy0,ty);
1892 fjz0 = _mm_add_pd(fjz0,tz);
1896 /**************************
1897 * CALCULATE INTERACTIONS *
1898 **************************/
1900 if (gmx_mm_any_lt(rsq01,rcutoff2))
1903 /* REACTION-FIELD ELECTROSTATICS */
1904 felec = _mm_mul_pd(qq01,_mm_sub_pd(_mm_mul_pd(rinv01,rinvsq01),krf2));
1906 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
1910 fscal = _mm_and_pd(fscal,cutoff_mask);
1912 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1914 /* Calculate temporary vectorial force */
1915 tx = _mm_mul_pd(fscal,dx01);
1916 ty = _mm_mul_pd(fscal,dy01);
1917 tz = _mm_mul_pd(fscal,dz01);
1919 /* Update vectorial force */
1920 fix0 = _mm_add_pd(fix0,tx);
1921 fiy0 = _mm_add_pd(fiy0,ty);
1922 fiz0 = _mm_add_pd(fiz0,tz);
1924 fjx1 = _mm_add_pd(fjx1,tx);
1925 fjy1 = _mm_add_pd(fjy1,ty);
1926 fjz1 = _mm_add_pd(fjz1,tz);
1930 /**************************
1931 * CALCULATE INTERACTIONS *
1932 **************************/
1934 if (gmx_mm_any_lt(rsq02,rcutoff2))
1937 /* REACTION-FIELD ELECTROSTATICS */
1938 felec = _mm_mul_pd(qq02,_mm_sub_pd(_mm_mul_pd(rinv02,rinvsq02),krf2));
1940 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
1944 fscal = _mm_and_pd(fscal,cutoff_mask);
1946 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1948 /* Calculate temporary vectorial force */
1949 tx = _mm_mul_pd(fscal,dx02);
1950 ty = _mm_mul_pd(fscal,dy02);
1951 tz = _mm_mul_pd(fscal,dz02);
1953 /* Update vectorial force */
1954 fix0 = _mm_add_pd(fix0,tx);
1955 fiy0 = _mm_add_pd(fiy0,ty);
1956 fiz0 = _mm_add_pd(fiz0,tz);
1958 fjx2 = _mm_add_pd(fjx2,tx);
1959 fjy2 = _mm_add_pd(fjy2,ty);
1960 fjz2 = _mm_add_pd(fjz2,tz);
1964 /**************************
1965 * CALCULATE INTERACTIONS *
1966 **************************/
1968 if (gmx_mm_any_lt(rsq10,rcutoff2))
1971 /* REACTION-FIELD ELECTROSTATICS */
1972 felec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_mul_pd(rinv10,rinvsq10),krf2));
1974 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
1978 fscal = _mm_and_pd(fscal,cutoff_mask);
1980 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1982 /* Calculate temporary vectorial force */
1983 tx = _mm_mul_pd(fscal,dx10);
1984 ty = _mm_mul_pd(fscal,dy10);
1985 tz = _mm_mul_pd(fscal,dz10);
1987 /* Update vectorial force */
1988 fix1 = _mm_add_pd(fix1,tx);
1989 fiy1 = _mm_add_pd(fiy1,ty);
1990 fiz1 = _mm_add_pd(fiz1,tz);
1992 fjx0 = _mm_add_pd(fjx0,tx);
1993 fjy0 = _mm_add_pd(fjy0,ty);
1994 fjz0 = _mm_add_pd(fjz0,tz);
1998 /**************************
1999 * CALCULATE INTERACTIONS *
2000 **************************/
2002 if (gmx_mm_any_lt(rsq11,rcutoff2))
2005 /* REACTION-FIELD ELECTROSTATICS */
2006 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
2008 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
2012 fscal = _mm_and_pd(fscal,cutoff_mask);
2014 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2016 /* Calculate temporary vectorial force */
2017 tx = _mm_mul_pd(fscal,dx11);
2018 ty = _mm_mul_pd(fscal,dy11);
2019 tz = _mm_mul_pd(fscal,dz11);
2021 /* Update vectorial force */
2022 fix1 = _mm_add_pd(fix1,tx);
2023 fiy1 = _mm_add_pd(fiy1,ty);
2024 fiz1 = _mm_add_pd(fiz1,tz);
2026 fjx1 = _mm_add_pd(fjx1,tx);
2027 fjy1 = _mm_add_pd(fjy1,ty);
2028 fjz1 = _mm_add_pd(fjz1,tz);
2032 /**************************
2033 * CALCULATE INTERACTIONS *
2034 **************************/
2036 if (gmx_mm_any_lt(rsq12,rcutoff2))
2039 /* REACTION-FIELD ELECTROSTATICS */
2040 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
2042 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
2046 fscal = _mm_and_pd(fscal,cutoff_mask);
2048 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2050 /* Calculate temporary vectorial force */
2051 tx = _mm_mul_pd(fscal,dx12);
2052 ty = _mm_mul_pd(fscal,dy12);
2053 tz = _mm_mul_pd(fscal,dz12);
2055 /* Update vectorial force */
2056 fix1 = _mm_add_pd(fix1,tx);
2057 fiy1 = _mm_add_pd(fiy1,ty);
2058 fiz1 = _mm_add_pd(fiz1,tz);
2060 fjx2 = _mm_add_pd(fjx2,tx);
2061 fjy2 = _mm_add_pd(fjy2,ty);
2062 fjz2 = _mm_add_pd(fjz2,tz);
2066 /**************************
2067 * CALCULATE INTERACTIONS *
2068 **************************/
2070 if (gmx_mm_any_lt(rsq20,rcutoff2))
2073 /* REACTION-FIELD ELECTROSTATICS */
2074 felec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_mul_pd(rinv20,rinvsq20),krf2));
2076 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
2080 fscal = _mm_and_pd(fscal,cutoff_mask);
2082 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2084 /* Calculate temporary vectorial force */
2085 tx = _mm_mul_pd(fscal,dx20);
2086 ty = _mm_mul_pd(fscal,dy20);
2087 tz = _mm_mul_pd(fscal,dz20);
2089 /* Update vectorial force */
2090 fix2 = _mm_add_pd(fix2,tx);
2091 fiy2 = _mm_add_pd(fiy2,ty);
2092 fiz2 = _mm_add_pd(fiz2,tz);
2094 fjx0 = _mm_add_pd(fjx0,tx);
2095 fjy0 = _mm_add_pd(fjy0,ty);
2096 fjz0 = _mm_add_pd(fjz0,tz);
2100 /**************************
2101 * CALCULATE INTERACTIONS *
2102 **************************/
2104 if (gmx_mm_any_lt(rsq21,rcutoff2))
2107 /* REACTION-FIELD ELECTROSTATICS */
2108 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
2110 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2114 fscal = _mm_and_pd(fscal,cutoff_mask);
2116 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2118 /* Calculate temporary vectorial force */
2119 tx = _mm_mul_pd(fscal,dx21);
2120 ty = _mm_mul_pd(fscal,dy21);
2121 tz = _mm_mul_pd(fscal,dz21);
2123 /* Update vectorial force */
2124 fix2 = _mm_add_pd(fix2,tx);
2125 fiy2 = _mm_add_pd(fiy2,ty);
2126 fiz2 = _mm_add_pd(fiz2,tz);
2128 fjx1 = _mm_add_pd(fjx1,tx);
2129 fjy1 = _mm_add_pd(fjy1,ty);
2130 fjz1 = _mm_add_pd(fjz1,tz);
2134 /**************************
2135 * CALCULATE INTERACTIONS *
2136 **************************/
2138 if (gmx_mm_any_lt(rsq22,rcutoff2))
2141 /* REACTION-FIELD ELECTROSTATICS */
2142 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
2144 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
2148 fscal = _mm_and_pd(fscal,cutoff_mask);
2150 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2152 /* Calculate temporary vectorial force */
2153 tx = _mm_mul_pd(fscal,dx22);
2154 ty = _mm_mul_pd(fscal,dy22);
2155 tz = _mm_mul_pd(fscal,dz22);
2157 /* Update vectorial force */
2158 fix2 = _mm_add_pd(fix2,tx);
2159 fiy2 = _mm_add_pd(fiy2,ty);
2160 fiz2 = _mm_add_pd(fiz2,tz);
2162 fjx2 = _mm_add_pd(fjx2,tx);
2163 fjy2 = _mm_add_pd(fjy2,ty);
2164 fjz2 = _mm_add_pd(fjz2,tz);
2168 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2170 /* Inner loop uses 297 flops */
2173 /* End of innermost loop */
2175 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2176 f+i_coord_offset,fshift+i_shift_offset);
2178 /* Increment number of inner iterations */
2179 inneriter += j_index_end - j_index_start;
2181 /* Outer loop uses 18 flops */
2184 /* Increment number of outer iterations */
2187 /* Update outer/inner flops */
2189 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*297);