2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
47 #include "gromacs/simd/math_x86_sse4_1_double.h"
48 #include "kernelutil_x86_sse4_1_double.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sse4_1_double
52 * Electrostatics interaction: ReactionField
53 * VdW interaction: CubicSplineTable
54 * Geometry: Water3-Water3
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sse4_1_double
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
75 int j_coord_offsetA,j_coord_offsetB;
76 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real *shiftvec,*fshift,*x,*f;
79 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
81 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
83 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
85 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
86 int vdwjidx0A,vdwjidx0B;
87 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
88 int vdwjidx1A,vdwjidx1B;
89 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
90 int vdwjidx2A,vdwjidx2B;
91 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
92 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
93 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
94 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
95 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
96 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
97 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
98 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
99 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
100 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
101 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
104 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
107 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
108 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
110 __m128i ifour = _mm_set1_epi32(4);
111 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
113 __m128d dummy_mask,cutoff_mask;
114 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
115 __m128d one = _mm_set1_pd(1.0);
116 __m128d two = _mm_set1_pd(2.0);
122 jindex = nlist->jindex;
124 shiftidx = nlist->shift;
126 shiftvec = fr->shift_vec[0];
127 fshift = fr->fshift[0];
128 facel = _mm_set1_pd(fr->epsfac);
129 charge = mdatoms->chargeA;
130 krf = _mm_set1_pd(fr->ic->k_rf);
131 krf2 = _mm_set1_pd(fr->ic->k_rf*2.0);
132 crf = _mm_set1_pd(fr->ic->c_rf);
133 nvdwtype = fr->ntype;
135 vdwtype = mdatoms->typeA;
137 vftab = kernel_data->table_vdw->data;
138 vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale);
140 /* Setup water-specific parameters */
141 inr = nlist->iinr[0];
142 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
143 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
144 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
145 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
147 jq0 = _mm_set1_pd(charge[inr+0]);
148 jq1 = _mm_set1_pd(charge[inr+1]);
149 jq2 = _mm_set1_pd(charge[inr+2]);
150 vdwjidx0A = 2*vdwtype[inr+0];
151 qq00 = _mm_mul_pd(iq0,jq0);
152 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
153 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
154 qq01 = _mm_mul_pd(iq0,jq1);
155 qq02 = _mm_mul_pd(iq0,jq2);
156 qq10 = _mm_mul_pd(iq1,jq0);
157 qq11 = _mm_mul_pd(iq1,jq1);
158 qq12 = _mm_mul_pd(iq1,jq2);
159 qq20 = _mm_mul_pd(iq2,jq0);
160 qq21 = _mm_mul_pd(iq2,jq1);
161 qq22 = _mm_mul_pd(iq2,jq2);
163 /* Avoid stupid compiler warnings */
171 /* Start outer loop over neighborlists */
172 for(iidx=0; iidx<nri; iidx++)
174 /* Load shift vector for this list */
175 i_shift_offset = DIM*shiftidx[iidx];
177 /* Load limits for loop over neighbors */
178 j_index_start = jindex[iidx];
179 j_index_end = jindex[iidx+1];
181 /* Get outer coordinate index */
183 i_coord_offset = DIM*inr;
185 /* Load i particle coords and add shift vector */
186 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
187 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
189 fix0 = _mm_setzero_pd();
190 fiy0 = _mm_setzero_pd();
191 fiz0 = _mm_setzero_pd();
192 fix1 = _mm_setzero_pd();
193 fiy1 = _mm_setzero_pd();
194 fiz1 = _mm_setzero_pd();
195 fix2 = _mm_setzero_pd();
196 fiy2 = _mm_setzero_pd();
197 fiz2 = _mm_setzero_pd();
199 /* Reset potential sums */
200 velecsum = _mm_setzero_pd();
201 vvdwsum = _mm_setzero_pd();
203 /* Start inner kernel loop */
204 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
207 /* Get j neighbor index, and coordinate index */
210 j_coord_offsetA = DIM*jnrA;
211 j_coord_offsetB = DIM*jnrB;
213 /* load j atom coordinates */
214 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
215 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
217 /* Calculate displacement vector */
218 dx00 = _mm_sub_pd(ix0,jx0);
219 dy00 = _mm_sub_pd(iy0,jy0);
220 dz00 = _mm_sub_pd(iz0,jz0);
221 dx01 = _mm_sub_pd(ix0,jx1);
222 dy01 = _mm_sub_pd(iy0,jy1);
223 dz01 = _mm_sub_pd(iz0,jz1);
224 dx02 = _mm_sub_pd(ix0,jx2);
225 dy02 = _mm_sub_pd(iy0,jy2);
226 dz02 = _mm_sub_pd(iz0,jz2);
227 dx10 = _mm_sub_pd(ix1,jx0);
228 dy10 = _mm_sub_pd(iy1,jy0);
229 dz10 = _mm_sub_pd(iz1,jz0);
230 dx11 = _mm_sub_pd(ix1,jx1);
231 dy11 = _mm_sub_pd(iy1,jy1);
232 dz11 = _mm_sub_pd(iz1,jz1);
233 dx12 = _mm_sub_pd(ix1,jx2);
234 dy12 = _mm_sub_pd(iy1,jy2);
235 dz12 = _mm_sub_pd(iz1,jz2);
236 dx20 = _mm_sub_pd(ix2,jx0);
237 dy20 = _mm_sub_pd(iy2,jy0);
238 dz20 = _mm_sub_pd(iz2,jz0);
239 dx21 = _mm_sub_pd(ix2,jx1);
240 dy21 = _mm_sub_pd(iy2,jy1);
241 dz21 = _mm_sub_pd(iz2,jz1);
242 dx22 = _mm_sub_pd(ix2,jx2);
243 dy22 = _mm_sub_pd(iy2,jy2);
244 dz22 = _mm_sub_pd(iz2,jz2);
246 /* Calculate squared distance and things based on it */
247 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
248 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
249 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
250 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
251 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
252 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
253 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
254 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
255 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
257 rinv00 = gmx_mm_invsqrt_pd(rsq00);
258 rinv01 = gmx_mm_invsqrt_pd(rsq01);
259 rinv02 = gmx_mm_invsqrt_pd(rsq02);
260 rinv10 = gmx_mm_invsqrt_pd(rsq10);
261 rinv11 = gmx_mm_invsqrt_pd(rsq11);
262 rinv12 = gmx_mm_invsqrt_pd(rsq12);
263 rinv20 = gmx_mm_invsqrt_pd(rsq20);
264 rinv21 = gmx_mm_invsqrt_pd(rsq21);
265 rinv22 = gmx_mm_invsqrt_pd(rsq22);
267 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
268 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
269 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
270 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
271 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
272 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
273 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
274 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
275 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
277 fjx0 = _mm_setzero_pd();
278 fjy0 = _mm_setzero_pd();
279 fjz0 = _mm_setzero_pd();
280 fjx1 = _mm_setzero_pd();
281 fjy1 = _mm_setzero_pd();
282 fjz1 = _mm_setzero_pd();
283 fjx2 = _mm_setzero_pd();
284 fjy2 = _mm_setzero_pd();
285 fjz2 = _mm_setzero_pd();
287 /**************************
288 * CALCULATE INTERACTIONS *
289 **************************/
291 r00 = _mm_mul_pd(rsq00,rinv00);
293 /* Calculate table index by multiplying r with table scale and truncate to integer */
294 rt = _mm_mul_pd(r00,vftabscale);
295 vfitab = _mm_cvttpd_epi32(rt);
296 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
297 vfitab = _mm_slli_epi32(vfitab,3);
299 /* REACTION-FIELD ELECTROSTATICS */
300 velec = _mm_mul_pd(qq00,_mm_sub_pd(_mm_add_pd(rinv00,_mm_mul_pd(krf,rsq00)),crf));
301 felec = _mm_mul_pd(qq00,_mm_sub_pd(_mm_mul_pd(rinv00,rinvsq00),krf2));
303 /* CUBIC SPLINE TABLE DISPERSION */
304 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
305 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
306 GMX_MM_TRANSPOSE2_PD(Y,F);
307 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
308 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
309 GMX_MM_TRANSPOSE2_PD(G,H);
310 Heps = _mm_mul_pd(vfeps,H);
311 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
312 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
313 vvdw6 = _mm_mul_pd(c6_00,VV);
314 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
315 fvdw6 = _mm_mul_pd(c6_00,FF);
317 /* CUBIC SPLINE TABLE REPULSION */
318 vfitab = _mm_add_epi32(vfitab,ifour);
319 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
320 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
321 GMX_MM_TRANSPOSE2_PD(Y,F);
322 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
323 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
324 GMX_MM_TRANSPOSE2_PD(G,H);
325 Heps = _mm_mul_pd(vfeps,H);
326 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
327 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
328 vvdw12 = _mm_mul_pd(c12_00,VV);
329 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
330 fvdw12 = _mm_mul_pd(c12_00,FF);
331 vvdw = _mm_add_pd(vvdw12,vvdw6);
332 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
334 /* Update potential sum for this i atom from the interaction with this j atom. */
335 velecsum = _mm_add_pd(velecsum,velec);
336 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
338 fscal = _mm_add_pd(felec,fvdw);
340 /* Calculate temporary vectorial force */
341 tx = _mm_mul_pd(fscal,dx00);
342 ty = _mm_mul_pd(fscal,dy00);
343 tz = _mm_mul_pd(fscal,dz00);
345 /* Update vectorial force */
346 fix0 = _mm_add_pd(fix0,tx);
347 fiy0 = _mm_add_pd(fiy0,ty);
348 fiz0 = _mm_add_pd(fiz0,tz);
350 fjx0 = _mm_add_pd(fjx0,tx);
351 fjy0 = _mm_add_pd(fjy0,ty);
352 fjz0 = _mm_add_pd(fjz0,tz);
354 /**************************
355 * CALCULATE INTERACTIONS *
356 **************************/
358 /* REACTION-FIELD ELECTROSTATICS */
359 velec = _mm_mul_pd(qq01,_mm_sub_pd(_mm_add_pd(rinv01,_mm_mul_pd(krf,rsq01)),crf));
360 felec = _mm_mul_pd(qq01,_mm_sub_pd(_mm_mul_pd(rinv01,rinvsq01),krf2));
362 /* Update potential sum for this i atom from the interaction with this j atom. */
363 velecsum = _mm_add_pd(velecsum,velec);
367 /* Calculate temporary vectorial force */
368 tx = _mm_mul_pd(fscal,dx01);
369 ty = _mm_mul_pd(fscal,dy01);
370 tz = _mm_mul_pd(fscal,dz01);
372 /* Update vectorial force */
373 fix0 = _mm_add_pd(fix0,tx);
374 fiy0 = _mm_add_pd(fiy0,ty);
375 fiz0 = _mm_add_pd(fiz0,tz);
377 fjx1 = _mm_add_pd(fjx1,tx);
378 fjy1 = _mm_add_pd(fjy1,ty);
379 fjz1 = _mm_add_pd(fjz1,tz);
381 /**************************
382 * CALCULATE INTERACTIONS *
383 **************************/
385 /* REACTION-FIELD ELECTROSTATICS */
386 velec = _mm_mul_pd(qq02,_mm_sub_pd(_mm_add_pd(rinv02,_mm_mul_pd(krf,rsq02)),crf));
387 felec = _mm_mul_pd(qq02,_mm_sub_pd(_mm_mul_pd(rinv02,rinvsq02),krf2));
389 /* Update potential sum for this i atom from the interaction with this j atom. */
390 velecsum = _mm_add_pd(velecsum,velec);
394 /* Calculate temporary vectorial force */
395 tx = _mm_mul_pd(fscal,dx02);
396 ty = _mm_mul_pd(fscal,dy02);
397 tz = _mm_mul_pd(fscal,dz02);
399 /* Update vectorial force */
400 fix0 = _mm_add_pd(fix0,tx);
401 fiy0 = _mm_add_pd(fiy0,ty);
402 fiz0 = _mm_add_pd(fiz0,tz);
404 fjx2 = _mm_add_pd(fjx2,tx);
405 fjy2 = _mm_add_pd(fjy2,ty);
406 fjz2 = _mm_add_pd(fjz2,tz);
408 /**************************
409 * CALCULATE INTERACTIONS *
410 **************************/
412 /* REACTION-FIELD ELECTROSTATICS */
413 velec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_add_pd(rinv10,_mm_mul_pd(krf,rsq10)),crf));
414 felec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_mul_pd(rinv10,rinvsq10),krf2));
416 /* Update potential sum for this i atom from the interaction with this j atom. */
417 velecsum = _mm_add_pd(velecsum,velec);
421 /* Calculate temporary vectorial force */
422 tx = _mm_mul_pd(fscal,dx10);
423 ty = _mm_mul_pd(fscal,dy10);
424 tz = _mm_mul_pd(fscal,dz10);
426 /* Update vectorial force */
427 fix1 = _mm_add_pd(fix1,tx);
428 fiy1 = _mm_add_pd(fiy1,ty);
429 fiz1 = _mm_add_pd(fiz1,tz);
431 fjx0 = _mm_add_pd(fjx0,tx);
432 fjy0 = _mm_add_pd(fjy0,ty);
433 fjz0 = _mm_add_pd(fjz0,tz);
435 /**************************
436 * CALCULATE INTERACTIONS *
437 **************************/
439 /* REACTION-FIELD ELECTROSTATICS */
440 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_add_pd(rinv11,_mm_mul_pd(krf,rsq11)),crf));
441 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
443 /* Update potential sum for this i atom from the interaction with this j atom. */
444 velecsum = _mm_add_pd(velecsum,velec);
448 /* Calculate temporary vectorial force */
449 tx = _mm_mul_pd(fscal,dx11);
450 ty = _mm_mul_pd(fscal,dy11);
451 tz = _mm_mul_pd(fscal,dz11);
453 /* Update vectorial force */
454 fix1 = _mm_add_pd(fix1,tx);
455 fiy1 = _mm_add_pd(fiy1,ty);
456 fiz1 = _mm_add_pd(fiz1,tz);
458 fjx1 = _mm_add_pd(fjx1,tx);
459 fjy1 = _mm_add_pd(fjy1,ty);
460 fjz1 = _mm_add_pd(fjz1,tz);
462 /**************************
463 * CALCULATE INTERACTIONS *
464 **************************/
466 /* REACTION-FIELD ELECTROSTATICS */
467 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_add_pd(rinv12,_mm_mul_pd(krf,rsq12)),crf));
468 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
470 /* Update potential sum for this i atom from the interaction with this j atom. */
471 velecsum = _mm_add_pd(velecsum,velec);
475 /* Calculate temporary vectorial force */
476 tx = _mm_mul_pd(fscal,dx12);
477 ty = _mm_mul_pd(fscal,dy12);
478 tz = _mm_mul_pd(fscal,dz12);
480 /* Update vectorial force */
481 fix1 = _mm_add_pd(fix1,tx);
482 fiy1 = _mm_add_pd(fiy1,ty);
483 fiz1 = _mm_add_pd(fiz1,tz);
485 fjx2 = _mm_add_pd(fjx2,tx);
486 fjy2 = _mm_add_pd(fjy2,ty);
487 fjz2 = _mm_add_pd(fjz2,tz);
489 /**************************
490 * CALCULATE INTERACTIONS *
491 **************************/
493 /* REACTION-FIELD ELECTROSTATICS */
494 velec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_add_pd(rinv20,_mm_mul_pd(krf,rsq20)),crf));
495 felec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_mul_pd(rinv20,rinvsq20),krf2));
497 /* Update potential sum for this i atom from the interaction with this j atom. */
498 velecsum = _mm_add_pd(velecsum,velec);
502 /* Calculate temporary vectorial force */
503 tx = _mm_mul_pd(fscal,dx20);
504 ty = _mm_mul_pd(fscal,dy20);
505 tz = _mm_mul_pd(fscal,dz20);
507 /* Update vectorial force */
508 fix2 = _mm_add_pd(fix2,tx);
509 fiy2 = _mm_add_pd(fiy2,ty);
510 fiz2 = _mm_add_pd(fiz2,tz);
512 fjx0 = _mm_add_pd(fjx0,tx);
513 fjy0 = _mm_add_pd(fjy0,ty);
514 fjz0 = _mm_add_pd(fjz0,tz);
516 /**************************
517 * CALCULATE INTERACTIONS *
518 **************************/
520 /* REACTION-FIELD ELECTROSTATICS */
521 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_add_pd(rinv21,_mm_mul_pd(krf,rsq21)),crf));
522 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
524 /* Update potential sum for this i atom from the interaction with this j atom. */
525 velecsum = _mm_add_pd(velecsum,velec);
529 /* Calculate temporary vectorial force */
530 tx = _mm_mul_pd(fscal,dx21);
531 ty = _mm_mul_pd(fscal,dy21);
532 tz = _mm_mul_pd(fscal,dz21);
534 /* Update vectorial force */
535 fix2 = _mm_add_pd(fix2,tx);
536 fiy2 = _mm_add_pd(fiy2,ty);
537 fiz2 = _mm_add_pd(fiz2,tz);
539 fjx1 = _mm_add_pd(fjx1,tx);
540 fjy1 = _mm_add_pd(fjy1,ty);
541 fjz1 = _mm_add_pd(fjz1,tz);
543 /**************************
544 * CALCULATE INTERACTIONS *
545 **************************/
547 /* REACTION-FIELD ELECTROSTATICS */
548 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_add_pd(rinv22,_mm_mul_pd(krf,rsq22)),crf));
549 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
551 /* Update potential sum for this i atom from the interaction with this j atom. */
552 velecsum = _mm_add_pd(velecsum,velec);
556 /* Calculate temporary vectorial force */
557 tx = _mm_mul_pd(fscal,dx22);
558 ty = _mm_mul_pd(fscal,dy22);
559 tz = _mm_mul_pd(fscal,dz22);
561 /* Update vectorial force */
562 fix2 = _mm_add_pd(fix2,tx);
563 fiy2 = _mm_add_pd(fiy2,ty);
564 fiz2 = _mm_add_pd(fiz2,tz);
566 fjx2 = _mm_add_pd(fjx2,tx);
567 fjy2 = _mm_add_pd(fjy2,ty);
568 fjz2 = _mm_add_pd(fjz2,tz);
570 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
572 /* Inner loop uses 323 flops */
579 j_coord_offsetA = DIM*jnrA;
581 /* load j atom coordinates */
582 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
583 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
585 /* Calculate displacement vector */
586 dx00 = _mm_sub_pd(ix0,jx0);
587 dy00 = _mm_sub_pd(iy0,jy0);
588 dz00 = _mm_sub_pd(iz0,jz0);
589 dx01 = _mm_sub_pd(ix0,jx1);
590 dy01 = _mm_sub_pd(iy0,jy1);
591 dz01 = _mm_sub_pd(iz0,jz1);
592 dx02 = _mm_sub_pd(ix0,jx2);
593 dy02 = _mm_sub_pd(iy0,jy2);
594 dz02 = _mm_sub_pd(iz0,jz2);
595 dx10 = _mm_sub_pd(ix1,jx0);
596 dy10 = _mm_sub_pd(iy1,jy0);
597 dz10 = _mm_sub_pd(iz1,jz0);
598 dx11 = _mm_sub_pd(ix1,jx1);
599 dy11 = _mm_sub_pd(iy1,jy1);
600 dz11 = _mm_sub_pd(iz1,jz1);
601 dx12 = _mm_sub_pd(ix1,jx2);
602 dy12 = _mm_sub_pd(iy1,jy2);
603 dz12 = _mm_sub_pd(iz1,jz2);
604 dx20 = _mm_sub_pd(ix2,jx0);
605 dy20 = _mm_sub_pd(iy2,jy0);
606 dz20 = _mm_sub_pd(iz2,jz0);
607 dx21 = _mm_sub_pd(ix2,jx1);
608 dy21 = _mm_sub_pd(iy2,jy1);
609 dz21 = _mm_sub_pd(iz2,jz1);
610 dx22 = _mm_sub_pd(ix2,jx2);
611 dy22 = _mm_sub_pd(iy2,jy2);
612 dz22 = _mm_sub_pd(iz2,jz2);
614 /* Calculate squared distance and things based on it */
615 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
616 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
617 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
618 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
619 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
620 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
621 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
622 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
623 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
625 rinv00 = gmx_mm_invsqrt_pd(rsq00);
626 rinv01 = gmx_mm_invsqrt_pd(rsq01);
627 rinv02 = gmx_mm_invsqrt_pd(rsq02);
628 rinv10 = gmx_mm_invsqrt_pd(rsq10);
629 rinv11 = gmx_mm_invsqrt_pd(rsq11);
630 rinv12 = gmx_mm_invsqrt_pd(rsq12);
631 rinv20 = gmx_mm_invsqrt_pd(rsq20);
632 rinv21 = gmx_mm_invsqrt_pd(rsq21);
633 rinv22 = gmx_mm_invsqrt_pd(rsq22);
635 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
636 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
637 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
638 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
639 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
640 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
641 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
642 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
643 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
645 fjx0 = _mm_setzero_pd();
646 fjy0 = _mm_setzero_pd();
647 fjz0 = _mm_setzero_pd();
648 fjx1 = _mm_setzero_pd();
649 fjy1 = _mm_setzero_pd();
650 fjz1 = _mm_setzero_pd();
651 fjx2 = _mm_setzero_pd();
652 fjy2 = _mm_setzero_pd();
653 fjz2 = _mm_setzero_pd();
655 /**************************
656 * CALCULATE INTERACTIONS *
657 **************************/
659 r00 = _mm_mul_pd(rsq00,rinv00);
661 /* Calculate table index by multiplying r with table scale and truncate to integer */
662 rt = _mm_mul_pd(r00,vftabscale);
663 vfitab = _mm_cvttpd_epi32(rt);
664 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
665 vfitab = _mm_slli_epi32(vfitab,3);
667 /* REACTION-FIELD ELECTROSTATICS */
668 velec = _mm_mul_pd(qq00,_mm_sub_pd(_mm_add_pd(rinv00,_mm_mul_pd(krf,rsq00)),crf));
669 felec = _mm_mul_pd(qq00,_mm_sub_pd(_mm_mul_pd(rinv00,rinvsq00),krf2));
671 /* CUBIC SPLINE TABLE DISPERSION */
672 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
673 F = _mm_setzero_pd();
674 GMX_MM_TRANSPOSE2_PD(Y,F);
675 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
676 H = _mm_setzero_pd();
677 GMX_MM_TRANSPOSE2_PD(G,H);
678 Heps = _mm_mul_pd(vfeps,H);
679 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
680 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
681 vvdw6 = _mm_mul_pd(c6_00,VV);
682 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
683 fvdw6 = _mm_mul_pd(c6_00,FF);
685 /* CUBIC SPLINE TABLE REPULSION */
686 vfitab = _mm_add_epi32(vfitab,ifour);
687 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
688 F = _mm_setzero_pd();
689 GMX_MM_TRANSPOSE2_PD(Y,F);
690 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
691 H = _mm_setzero_pd();
692 GMX_MM_TRANSPOSE2_PD(G,H);
693 Heps = _mm_mul_pd(vfeps,H);
694 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
695 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
696 vvdw12 = _mm_mul_pd(c12_00,VV);
697 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
698 fvdw12 = _mm_mul_pd(c12_00,FF);
699 vvdw = _mm_add_pd(vvdw12,vvdw6);
700 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
702 /* Update potential sum for this i atom from the interaction with this j atom. */
703 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
704 velecsum = _mm_add_pd(velecsum,velec);
705 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
706 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
708 fscal = _mm_add_pd(felec,fvdw);
710 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
712 /* Calculate temporary vectorial force */
713 tx = _mm_mul_pd(fscal,dx00);
714 ty = _mm_mul_pd(fscal,dy00);
715 tz = _mm_mul_pd(fscal,dz00);
717 /* Update vectorial force */
718 fix0 = _mm_add_pd(fix0,tx);
719 fiy0 = _mm_add_pd(fiy0,ty);
720 fiz0 = _mm_add_pd(fiz0,tz);
722 fjx0 = _mm_add_pd(fjx0,tx);
723 fjy0 = _mm_add_pd(fjy0,ty);
724 fjz0 = _mm_add_pd(fjz0,tz);
726 /**************************
727 * CALCULATE INTERACTIONS *
728 **************************/
730 /* REACTION-FIELD ELECTROSTATICS */
731 velec = _mm_mul_pd(qq01,_mm_sub_pd(_mm_add_pd(rinv01,_mm_mul_pd(krf,rsq01)),crf));
732 felec = _mm_mul_pd(qq01,_mm_sub_pd(_mm_mul_pd(rinv01,rinvsq01),krf2));
734 /* Update potential sum for this i atom from the interaction with this j atom. */
735 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
736 velecsum = _mm_add_pd(velecsum,velec);
740 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
742 /* Calculate temporary vectorial force */
743 tx = _mm_mul_pd(fscal,dx01);
744 ty = _mm_mul_pd(fscal,dy01);
745 tz = _mm_mul_pd(fscal,dz01);
747 /* Update vectorial force */
748 fix0 = _mm_add_pd(fix0,tx);
749 fiy0 = _mm_add_pd(fiy0,ty);
750 fiz0 = _mm_add_pd(fiz0,tz);
752 fjx1 = _mm_add_pd(fjx1,tx);
753 fjy1 = _mm_add_pd(fjy1,ty);
754 fjz1 = _mm_add_pd(fjz1,tz);
756 /**************************
757 * CALCULATE INTERACTIONS *
758 **************************/
760 /* REACTION-FIELD ELECTROSTATICS */
761 velec = _mm_mul_pd(qq02,_mm_sub_pd(_mm_add_pd(rinv02,_mm_mul_pd(krf,rsq02)),crf));
762 felec = _mm_mul_pd(qq02,_mm_sub_pd(_mm_mul_pd(rinv02,rinvsq02),krf2));
764 /* Update potential sum for this i atom from the interaction with this j atom. */
765 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
766 velecsum = _mm_add_pd(velecsum,velec);
770 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
772 /* Calculate temporary vectorial force */
773 tx = _mm_mul_pd(fscal,dx02);
774 ty = _mm_mul_pd(fscal,dy02);
775 tz = _mm_mul_pd(fscal,dz02);
777 /* Update vectorial force */
778 fix0 = _mm_add_pd(fix0,tx);
779 fiy0 = _mm_add_pd(fiy0,ty);
780 fiz0 = _mm_add_pd(fiz0,tz);
782 fjx2 = _mm_add_pd(fjx2,tx);
783 fjy2 = _mm_add_pd(fjy2,ty);
784 fjz2 = _mm_add_pd(fjz2,tz);
786 /**************************
787 * CALCULATE INTERACTIONS *
788 **************************/
790 /* REACTION-FIELD ELECTROSTATICS */
791 velec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_add_pd(rinv10,_mm_mul_pd(krf,rsq10)),crf));
792 felec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_mul_pd(rinv10,rinvsq10),krf2));
794 /* Update potential sum for this i atom from the interaction with this j atom. */
795 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
796 velecsum = _mm_add_pd(velecsum,velec);
800 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
802 /* Calculate temporary vectorial force */
803 tx = _mm_mul_pd(fscal,dx10);
804 ty = _mm_mul_pd(fscal,dy10);
805 tz = _mm_mul_pd(fscal,dz10);
807 /* Update vectorial force */
808 fix1 = _mm_add_pd(fix1,tx);
809 fiy1 = _mm_add_pd(fiy1,ty);
810 fiz1 = _mm_add_pd(fiz1,tz);
812 fjx0 = _mm_add_pd(fjx0,tx);
813 fjy0 = _mm_add_pd(fjy0,ty);
814 fjz0 = _mm_add_pd(fjz0,tz);
816 /**************************
817 * CALCULATE INTERACTIONS *
818 **************************/
820 /* REACTION-FIELD ELECTROSTATICS */
821 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_add_pd(rinv11,_mm_mul_pd(krf,rsq11)),crf));
822 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
824 /* Update potential sum for this i atom from the interaction with this j atom. */
825 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
826 velecsum = _mm_add_pd(velecsum,velec);
830 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
832 /* Calculate temporary vectorial force */
833 tx = _mm_mul_pd(fscal,dx11);
834 ty = _mm_mul_pd(fscal,dy11);
835 tz = _mm_mul_pd(fscal,dz11);
837 /* Update vectorial force */
838 fix1 = _mm_add_pd(fix1,tx);
839 fiy1 = _mm_add_pd(fiy1,ty);
840 fiz1 = _mm_add_pd(fiz1,tz);
842 fjx1 = _mm_add_pd(fjx1,tx);
843 fjy1 = _mm_add_pd(fjy1,ty);
844 fjz1 = _mm_add_pd(fjz1,tz);
846 /**************************
847 * CALCULATE INTERACTIONS *
848 **************************/
850 /* REACTION-FIELD ELECTROSTATICS */
851 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_add_pd(rinv12,_mm_mul_pd(krf,rsq12)),crf));
852 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
854 /* Update potential sum for this i atom from the interaction with this j atom. */
855 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
856 velecsum = _mm_add_pd(velecsum,velec);
860 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
862 /* Calculate temporary vectorial force */
863 tx = _mm_mul_pd(fscal,dx12);
864 ty = _mm_mul_pd(fscal,dy12);
865 tz = _mm_mul_pd(fscal,dz12);
867 /* Update vectorial force */
868 fix1 = _mm_add_pd(fix1,tx);
869 fiy1 = _mm_add_pd(fiy1,ty);
870 fiz1 = _mm_add_pd(fiz1,tz);
872 fjx2 = _mm_add_pd(fjx2,tx);
873 fjy2 = _mm_add_pd(fjy2,ty);
874 fjz2 = _mm_add_pd(fjz2,tz);
876 /**************************
877 * CALCULATE INTERACTIONS *
878 **************************/
880 /* REACTION-FIELD ELECTROSTATICS */
881 velec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_add_pd(rinv20,_mm_mul_pd(krf,rsq20)),crf));
882 felec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_mul_pd(rinv20,rinvsq20),krf2));
884 /* Update potential sum for this i atom from the interaction with this j atom. */
885 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
886 velecsum = _mm_add_pd(velecsum,velec);
890 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
892 /* Calculate temporary vectorial force */
893 tx = _mm_mul_pd(fscal,dx20);
894 ty = _mm_mul_pd(fscal,dy20);
895 tz = _mm_mul_pd(fscal,dz20);
897 /* Update vectorial force */
898 fix2 = _mm_add_pd(fix2,tx);
899 fiy2 = _mm_add_pd(fiy2,ty);
900 fiz2 = _mm_add_pd(fiz2,tz);
902 fjx0 = _mm_add_pd(fjx0,tx);
903 fjy0 = _mm_add_pd(fjy0,ty);
904 fjz0 = _mm_add_pd(fjz0,tz);
906 /**************************
907 * CALCULATE INTERACTIONS *
908 **************************/
910 /* REACTION-FIELD ELECTROSTATICS */
911 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_add_pd(rinv21,_mm_mul_pd(krf,rsq21)),crf));
912 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
914 /* Update potential sum for this i atom from the interaction with this j atom. */
915 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
916 velecsum = _mm_add_pd(velecsum,velec);
920 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
922 /* Calculate temporary vectorial force */
923 tx = _mm_mul_pd(fscal,dx21);
924 ty = _mm_mul_pd(fscal,dy21);
925 tz = _mm_mul_pd(fscal,dz21);
927 /* Update vectorial force */
928 fix2 = _mm_add_pd(fix2,tx);
929 fiy2 = _mm_add_pd(fiy2,ty);
930 fiz2 = _mm_add_pd(fiz2,tz);
932 fjx1 = _mm_add_pd(fjx1,tx);
933 fjy1 = _mm_add_pd(fjy1,ty);
934 fjz1 = _mm_add_pd(fjz1,tz);
936 /**************************
937 * CALCULATE INTERACTIONS *
938 **************************/
940 /* REACTION-FIELD ELECTROSTATICS */
941 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_add_pd(rinv22,_mm_mul_pd(krf,rsq22)),crf));
942 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
944 /* Update potential sum for this i atom from the interaction with this j atom. */
945 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
946 velecsum = _mm_add_pd(velecsum,velec);
950 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
952 /* Calculate temporary vectorial force */
953 tx = _mm_mul_pd(fscal,dx22);
954 ty = _mm_mul_pd(fscal,dy22);
955 tz = _mm_mul_pd(fscal,dz22);
957 /* Update vectorial force */
958 fix2 = _mm_add_pd(fix2,tx);
959 fiy2 = _mm_add_pd(fiy2,ty);
960 fiz2 = _mm_add_pd(fiz2,tz);
962 fjx2 = _mm_add_pd(fjx2,tx);
963 fjy2 = _mm_add_pd(fjy2,ty);
964 fjz2 = _mm_add_pd(fjz2,tz);
966 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
968 /* Inner loop uses 323 flops */
971 /* End of innermost loop */
973 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
974 f+i_coord_offset,fshift+i_shift_offset);
977 /* Update potential energies */
978 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
979 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
981 /* Increment number of inner iterations */
982 inneriter += j_index_end - j_index_start;
984 /* Outer loop uses 20 flops */
987 /* Increment number of outer iterations */
990 /* Update outer/inner flops */
992 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*323);
995 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sse4_1_double
996 * Electrostatics interaction: ReactionField
997 * VdW interaction: CubicSplineTable
998 * Geometry: Water3-Water3
999 * Calculate force/pot: Force
1002 nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sse4_1_double
1003 (t_nblist * gmx_restrict nlist,
1004 rvec * gmx_restrict xx,
1005 rvec * gmx_restrict ff,
1006 t_forcerec * gmx_restrict fr,
1007 t_mdatoms * gmx_restrict mdatoms,
1008 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1009 t_nrnb * gmx_restrict nrnb)
1011 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1012 * just 0 for non-waters.
1013 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1014 * jnr indices corresponding to data put in the four positions in the SIMD register.
1016 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1017 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1019 int j_coord_offsetA,j_coord_offsetB;
1020 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1021 real rcutoff_scalar;
1022 real *shiftvec,*fshift,*x,*f;
1023 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1025 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1027 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1029 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1030 int vdwjidx0A,vdwjidx0B;
1031 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1032 int vdwjidx1A,vdwjidx1B;
1033 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1034 int vdwjidx2A,vdwjidx2B;
1035 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1036 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1037 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1038 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1039 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1040 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1041 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1042 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1043 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1044 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1045 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1048 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1051 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1052 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1054 __m128i ifour = _mm_set1_epi32(4);
1055 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1057 __m128d dummy_mask,cutoff_mask;
1058 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1059 __m128d one = _mm_set1_pd(1.0);
1060 __m128d two = _mm_set1_pd(2.0);
1066 jindex = nlist->jindex;
1068 shiftidx = nlist->shift;
1070 shiftvec = fr->shift_vec[0];
1071 fshift = fr->fshift[0];
1072 facel = _mm_set1_pd(fr->epsfac);
1073 charge = mdatoms->chargeA;
1074 krf = _mm_set1_pd(fr->ic->k_rf);
1075 krf2 = _mm_set1_pd(fr->ic->k_rf*2.0);
1076 crf = _mm_set1_pd(fr->ic->c_rf);
1077 nvdwtype = fr->ntype;
1078 vdwparam = fr->nbfp;
1079 vdwtype = mdatoms->typeA;
1081 vftab = kernel_data->table_vdw->data;
1082 vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale);
1084 /* Setup water-specific parameters */
1085 inr = nlist->iinr[0];
1086 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1087 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1088 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1089 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1091 jq0 = _mm_set1_pd(charge[inr+0]);
1092 jq1 = _mm_set1_pd(charge[inr+1]);
1093 jq2 = _mm_set1_pd(charge[inr+2]);
1094 vdwjidx0A = 2*vdwtype[inr+0];
1095 qq00 = _mm_mul_pd(iq0,jq0);
1096 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1097 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1098 qq01 = _mm_mul_pd(iq0,jq1);
1099 qq02 = _mm_mul_pd(iq0,jq2);
1100 qq10 = _mm_mul_pd(iq1,jq0);
1101 qq11 = _mm_mul_pd(iq1,jq1);
1102 qq12 = _mm_mul_pd(iq1,jq2);
1103 qq20 = _mm_mul_pd(iq2,jq0);
1104 qq21 = _mm_mul_pd(iq2,jq1);
1105 qq22 = _mm_mul_pd(iq2,jq2);
1107 /* Avoid stupid compiler warnings */
1109 j_coord_offsetA = 0;
1110 j_coord_offsetB = 0;
1115 /* Start outer loop over neighborlists */
1116 for(iidx=0; iidx<nri; iidx++)
1118 /* Load shift vector for this list */
1119 i_shift_offset = DIM*shiftidx[iidx];
1121 /* Load limits for loop over neighbors */
1122 j_index_start = jindex[iidx];
1123 j_index_end = jindex[iidx+1];
1125 /* Get outer coordinate index */
1127 i_coord_offset = DIM*inr;
1129 /* Load i particle coords and add shift vector */
1130 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1131 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1133 fix0 = _mm_setzero_pd();
1134 fiy0 = _mm_setzero_pd();
1135 fiz0 = _mm_setzero_pd();
1136 fix1 = _mm_setzero_pd();
1137 fiy1 = _mm_setzero_pd();
1138 fiz1 = _mm_setzero_pd();
1139 fix2 = _mm_setzero_pd();
1140 fiy2 = _mm_setzero_pd();
1141 fiz2 = _mm_setzero_pd();
1143 /* Start inner kernel loop */
1144 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1147 /* Get j neighbor index, and coordinate index */
1149 jnrB = jjnr[jidx+1];
1150 j_coord_offsetA = DIM*jnrA;
1151 j_coord_offsetB = DIM*jnrB;
1153 /* load j atom coordinates */
1154 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1155 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1157 /* Calculate displacement vector */
1158 dx00 = _mm_sub_pd(ix0,jx0);
1159 dy00 = _mm_sub_pd(iy0,jy0);
1160 dz00 = _mm_sub_pd(iz0,jz0);
1161 dx01 = _mm_sub_pd(ix0,jx1);
1162 dy01 = _mm_sub_pd(iy0,jy1);
1163 dz01 = _mm_sub_pd(iz0,jz1);
1164 dx02 = _mm_sub_pd(ix0,jx2);
1165 dy02 = _mm_sub_pd(iy0,jy2);
1166 dz02 = _mm_sub_pd(iz0,jz2);
1167 dx10 = _mm_sub_pd(ix1,jx0);
1168 dy10 = _mm_sub_pd(iy1,jy0);
1169 dz10 = _mm_sub_pd(iz1,jz0);
1170 dx11 = _mm_sub_pd(ix1,jx1);
1171 dy11 = _mm_sub_pd(iy1,jy1);
1172 dz11 = _mm_sub_pd(iz1,jz1);
1173 dx12 = _mm_sub_pd(ix1,jx2);
1174 dy12 = _mm_sub_pd(iy1,jy2);
1175 dz12 = _mm_sub_pd(iz1,jz2);
1176 dx20 = _mm_sub_pd(ix2,jx0);
1177 dy20 = _mm_sub_pd(iy2,jy0);
1178 dz20 = _mm_sub_pd(iz2,jz0);
1179 dx21 = _mm_sub_pd(ix2,jx1);
1180 dy21 = _mm_sub_pd(iy2,jy1);
1181 dz21 = _mm_sub_pd(iz2,jz1);
1182 dx22 = _mm_sub_pd(ix2,jx2);
1183 dy22 = _mm_sub_pd(iy2,jy2);
1184 dz22 = _mm_sub_pd(iz2,jz2);
1186 /* Calculate squared distance and things based on it */
1187 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1188 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1189 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1190 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1191 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1192 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1193 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1194 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1195 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1197 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1198 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1199 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1200 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1201 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1202 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1203 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1204 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1205 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1207 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1208 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
1209 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
1210 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1211 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1212 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1213 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1214 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1215 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1217 fjx0 = _mm_setzero_pd();
1218 fjy0 = _mm_setzero_pd();
1219 fjz0 = _mm_setzero_pd();
1220 fjx1 = _mm_setzero_pd();
1221 fjy1 = _mm_setzero_pd();
1222 fjz1 = _mm_setzero_pd();
1223 fjx2 = _mm_setzero_pd();
1224 fjy2 = _mm_setzero_pd();
1225 fjz2 = _mm_setzero_pd();
1227 /**************************
1228 * CALCULATE INTERACTIONS *
1229 **************************/
1231 r00 = _mm_mul_pd(rsq00,rinv00);
1233 /* Calculate table index by multiplying r with table scale and truncate to integer */
1234 rt = _mm_mul_pd(r00,vftabscale);
1235 vfitab = _mm_cvttpd_epi32(rt);
1236 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1237 vfitab = _mm_slli_epi32(vfitab,3);
1239 /* REACTION-FIELD ELECTROSTATICS */
1240 felec = _mm_mul_pd(qq00,_mm_sub_pd(_mm_mul_pd(rinv00,rinvsq00),krf2));
1242 /* CUBIC SPLINE TABLE DISPERSION */
1243 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1244 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1245 GMX_MM_TRANSPOSE2_PD(Y,F);
1246 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1247 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1248 GMX_MM_TRANSPOSE2_PD(G,H);
1249 Heps = _mm_mul_pd(vfeps,H);
1250 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1251 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1252 fvdw6 = _mm_mul_pd(c6_00,FF);
1254 /* CUBIC SPLINE TABLE REPULSION */
1255 vfitab = _mm_add_epi32(vfitab,ifour);
1256 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1257 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1258 GMX_MM_TRANSPOSE2_PD(Y,F);
1259 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1260 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1261 GMX_MM_TRANSPOSE2_PD(G,H);
1262 Heps = _mm_mul_pd(vfeps,H);
1263 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1264 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1265 fvdw12 = _mm_mul_pd(c12_00,FF);
1266 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1268 fscal = _mm_add_pd(felec,fvdw);
1270 /* Calculate temporary vectorial force */
1271 tx = _mm_mul_pd(fscal,dx00);
1272 ty = _mm_mul_pd(fscal,dy00);
1273 tz = _mm_mul_pd(fscal,dz00);
1275 /* Update vectorial force */
1276 fix0 = _mm_add_pd(fix0,tx);
1277 fiy0 = _mm_add_pd(fiy0,ty);
1278 fiz0 = _mm_add_pd(fiz0,tz);
1280 fjx0 = _mm_add_pd(fjx0,tx);
1281 fjy0 = _mm_add_pd(fjy0,ty);
1282 fjz0 = _mm_add_pd(fjz0,tz);
1284 /**************************
1285 * CALCULATE INTERACTIONS *
1286 **************************/
1288 /* REACTION-FIELD ELECTROSTATICS */
1289 felec = _mm_mul_pd(qq01,_mm_sub_pd(_mm_mul_pd(rinv01,rinvsq01),krf2));
1293 /* Calculate temporary vectorial force */
1294 tx = _mm_mul_pd(fscal,dx01);
1295 ty = _mm_mul_pd(fscal,dy01);
1296 tz = _mm_mul_pd(fscal,dz01);
1298 /* Update vectorial force */
1299 fix0 = _mm_add_pd(fix0,tx);
1300 fiy0 = _mm_add_pd(fiy0,ty);
1301 fiz0 = _mm_add_pd(fiz0,tz);
1303 fjx1 = _mm_add_pd(fjx1,tx);
1304 fjy1 = _mm_add_pd(fjy1,ty);
1305 fjz1 = _mm_add_pd(fjz1,tz);
1307 /**************************
1308 * CALCULATE INTERACTIONS *
1309 **************************/
1311 /* REACTION-FIELD ELECTROSTATICS */
1312 felec = _mm_mul_pd(qq02,_mm_sub_pd(_mm_mul_pd(rinv02,rinvsq02),krf2));
1316 /* Calculate temporary vectorial force */
1317 tx = _mm_mul_pd(fscal,dx02);
1318 ty = _mm_mul_pd(fscal,dy02);
1319 tz = _mm_mul_pd(fscal,dz02);
1321 /* Update vectorial force */
1322 fix0 = _mm_add_pd(fix0,tx);
1323 fiy0 = _mm_add_pd(fiy0,ty);
1324 fiz0 = _mm_add_pd(fiz0,tz);
1326 fjx2 = _mm_add_pd(fjx2,tx);
1327 fjy2 = _mm_add_pd(fjy2,ty);
1328 fjz2 = _mm_add_pd(fjz2,tz);
1330 /**************************
1331 * CALCULATE INTERACTIONS *
1332 **************************/
1334 /* REACTION-FIELD ELECTROSTATICS */
1335 felec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_mul_pd(rinv10,rinvsq10),krf2));
1339 /* Calculate temporary vectorial force */
1340 tx = _mm_mul_pd(fscal,dx10);
1341 ty = _mm_mul_pd(fscal,dy10);
1342 tz = _mm_mul_pd(fscal,dz10);
1344 /* Update vectorial force */
1345 fix1 = _mm_add_pd(fix1,tx);
1346 fiy1 = _mm_add_pd(fiy1,ty);
1347 fiz1 = _mm_add_pd(fiz1,tz);
1349 fjx0 = _mm_add_pd(fjx0,tx);
1350 fjy0 = _mm_add_pd(fjy0,ty);
1351 fjz0 = _mm_add_pd(fjz0,tz);
1353 /**************************
1354 * CALCULATE INTERACTIONS *
1355 **************************/
1357 /* REACTION-FIELD ELECTROSTATICS */
1358 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
1362 /* Calculate temporary vectorial force */
1363 tx = _mm_mul_pd(fscal,dx11);
1364 ty = _mm_mul_pd(fscal,dy11);
1365 tz = _mm_mul_pd(fscal,dz11);
1367 /* Update vectorial force */
1368 fix1 = _mm_add_pd(fix1,tx);
1369 fiy1 = _mm_add_pd(fiy1,ty);
1370 fiz1 = _mm_add_pd(fiz1,tz);
1372 fjx1 = _mm_add_pd(fjx1,tx);
1373 fjy1 = _mm_add_pd(fjy1,ty);
1374 fjz1 = _mm_add_pd(fjz1,tz);
1376 /**************************
1377 * CALCULATE INTERACTIONS *
1378 **************************/
1380 /* REACTION-FIELD ELECTROSTATICS */
1381 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
1385 /* Calculate temporary vectorial force */
1386 tx = _mm_mul_pd(fscal,dx12);
1387 ty = _mm_mul_pd(fscal,dy12);
1388 tz = _mm_mul_pd(fscal,dz12);
1390 /* Update vectorial force */
1391 fix1 = _mm_add_pd(fix1,tx);
1392 fiy1 = _mm_add_pd(fiy1,ty);
1393 fiz1 = _mm_add_pd(fiz1,tz);
1395 fjx2 = _mm_add_pd(fjx2,tx);
1396 fjy2 = _mm_add_pd(fjy2,ty);
1397 fjz2 = _mm_add_pd(fjz2,tz);
1399 /**************************
1400 * CALCULATE INTERACTIONS *
1401 **************************/
1403 /* REACTION-FIELD ELECTROSTATICS */
1404 felec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_mul_pd(rinv20,rinvsq20),krf2));
1408 /* Calculate temporary vectorial force */
1409 tx = _mm_mul_pd(fscal,dx20);
1410 ty = _mm_mul_pd(fscal,dy20);
1411 tz = _mm_mul_pd(fscal,dz20);
1413 /* Update vectorial force */
1414 fix2 = _mm_add_pd(fix2,tx);
1415 fiy2 = _mm_add_pd(fiy2,ty);
1416 fiz2 = _mm_add_pd(fiz2,tz);
1418 fjx0 = _mm_add_pd(fjx0,tx);
1419 fjy0 = _mm_add_pd(fjy0,ty);
1420 fjz0 = _mm_add_pd(fjz0,tz);
1422 /**************************
1423 * CALCULATE INTERACTIONS *
1424 **************************/
1426 /* REACTION-FIELD ELECTROSTATICS */
1427 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
1431 /* Calculate temporary vectorial force */
1432 tx = _mm_mul_pd(fscal,dx21);
1433 ty = _mm_mul_pd(fscal,dy21);
1434 tz = _mm_mul_pd(fscal,dz21);
1436 /* Update vectorial force */
1437 fix2 = _mm_add_pd(fix2,tx);
1438 fiy2 = _mm_add_pd(fiy2,ty);
1439 fiz2 = _mm_add_pd(fiz2,tz);
1441 fjx1 = _mm_add_pd(fjx1,tx);
1442 fjy1 = _mm_add_pd(fjy1,ty);
1443 fjz1 = _mm_add_pd(fjz1,tz);
1445 /**************************
1446 * CALCULATE INTERACTIONS *
1447 **************************/
1449 /* REACTION-FIELD ELECTROSTATICS */
1450 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
1454 /* Calculate temporary vectorial force */
1455 tx = _mm_mul_pd(fscal,dx22);
1456 ty = _mm_mul_pd(fscal,dy22);
1457 tz = _mm_mul_pd(fscal,dz22);
1459 /* Update vectorial force */
1460 fix2 = _mm_add_pd(fix2,tx);
1461 fiy2 = _mm_add_pd(fiy2,ty);
1462 fiz2 = _mm_add_pd(fiz2,tz);
1464 fjx2 = _mm_add_pd(fjx2,tx);
1465 fjy2 = _mm_add_pd(fjy2,ty);
1466 fjz2 = _mm_add_pd(fjz2,tz);
1468 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1470 /* Inner loop uses 270 flops */
1473 if(jidx<j_index_end)
1477 j_coord_offsetA = DIM*jnrA;
1479 /* load j atom coordinates */
1480 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1481 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1483 /* Calculate displacement vector */
1484 dx00 = _mm_sub_pd(ix0,jx0);
1485 dy00 = _mm_sub_pd(iy0,jy0);
1486 dz00 = _mm_sub_pd(iz0,jz0);
1487 dx01 = _mm_sub_pd(ix0,jx1);
1488 dy01 = _mm_sub_pd(iy0,jy1);
1489 dz01 = _mm_sub_pd(iz0,jz1);
1490 dx02 = _mm_sub_pd(ix0,jx2);
1491 dy02 = _mm_sub_pd(iy0,jy2);
1492 dz02 = _mm_sub_pd(iz0,jz2);
1493 dx10 = _mm_sub_pd(ix1,jx0);
1494 dy10 = _mm_sub_pd(iy1,jy0);
1495 dz10 = _mm_sub_pd(iz1,jz0);
1496 dx11 = _mm_sub_pd(ix1,jx1);
1497 dy11 = _mm_sub_pd(iy1,jy1);
1498 dz11 = _mm_sub_pd(iz1,jz1);
1499 dx12 = _mm_sub_pd(ix1,jx2);
1500 dy12 = _mm_sub_pd(iy1,jy2);
1501 dz12 = _mm_sub_pd(iz1,jz2);
1502 dx20 = _mm_sub_pd(ix2,jx0);
1503 dy20 = _mm_sub_pd(iy2,jy0);
1504 dz20 = _mm_sub_pd(iz2,jz0);
1505 dx21 = _mm_sub_pd(ix2,jx1);
1506 dy21 = _mm_sub_pd(iy2,jy1);
1507 dz21 = _mm_sub_pd(iz2,jz1);
1508 dx22 = _mm_sub_pd(ix2,jx2);
1509 dy22 = _mm_sub_pd(iy2,jy2);
1510 dz22 = _mm_sub_pd(iz2,jz2);
1512 /* Calculate squared distance and things based on it */
1513 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1514 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1515 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1516 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1517 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1518 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1519 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1520 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1521 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1523 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1524 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1525 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1526 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1527 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1528 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1529 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1530 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1531 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1533 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1534 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
1535 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
1536 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1537 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1538 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1539 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1540 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1541 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1543 fjx0 = _mm_setzero_pd();
1544 fjy0 = _mm_setzero_pd();
1545 fjz0 = _mm_setzero_pd();
1546 fjx1 = _mm_setzero_pd();
1547 fjy1 = _mm_setzero_pd();
1548 fjz1 = _mm_setzero_pd();
1549 fjx2 = _mm_setzero_pd();
1550 fjy2 = _mm_setzero_pd();
1551 fjz2 = _mm_setzero_pd();
1553 /**************************
1554 * CALCULATE INTERACTIONS *
1555 **************************/
1557 r00 = _mm_mul_pd(rsq00,rinv00);
1559 /* Calculate table index by multiplying r with table scale and truncate to integer */
1560 rt = _mm_mul_pd(r00,vftabscale);
1561 vfitab = _mm_cvttpd_epi32(rt);
1562 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1563 vfitab = _mm_slli_epi32(vfitab,3);
1565 /* REACTION-FIELD ELECTROSTATICS */
1566 felec = _mm_mul_pd(qq00,_mm_sub_pd(_mm_mul_pd(rinv00,rinvsq00),krf2));
1568 /* CUBIC SPLINE TABLE DISPERSION */
1569 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1570 F = _mm_setzero_pd();
1571 GMX_MM_TRANSPOSE2_PD(Y,F);
1572 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1573 H = _mm_setzero_pd();
1574 GMX_MM_TRANSPOSE2_PD(G,H);
1575 Heps = _mm_mul_pd(vfeps,H);
1576 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1577 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1578 fvdw6 = _mm_mul_pd(c6_00,FF);
1580 /* CUBIC SPLINE TABLE REPULSION */
1581 vfitab = _mm_add_epi32(vfitab,ifour);
1582 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1583 F = _mm_setzero_pd();
1584 GMX_MM_TRANSPOSE2_PD(Y,F);
1585 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1586 H = _mm_setzero_pd();
1587 GMX_MM_TRANSPOSE2_PD(G,H);
1588 Heps = _mm_mul_pd(vfeps,H);
1589 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1590 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1591 fvdw12 = _mm_mul_pd(c12_00,FF);
1592 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1594 fscal = _mm_add_pd(felec,fvdw);
1596 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1598 /* Calculate temporary vectorial force */
1599 tx = _mm_mul_pd(fscal,dx00);
1600 ty = _mm_mul_pd(fscal,dy00);
1601 tz = _mm_mul_pd(fscal,dz00);
1603 /* Update vectorial force */
1604 fix0 = _mm_add_pd(fix0,tx);
1605 fiy0 = _mm_add_pd(fiy0,ty);
1606 fiz0 = _mm_add_pd(fiz0,tz);
1608 fjx0 = _mm_add_pd(fjx0,tx);
1609 fjy0 = _mm_add_pd(fjy0,ty);
1610 fjz0 = _mm_add_pd(fjz0,tz);
1612 /**************************
1613 * CALCULATE INTERACTIONS *
1614 **************************/
1616 /* REACTION-FIELD ELECTROSTATICS */
1617 felec = _mm_mul_pd(qq01,_mm_sub_pd(_mm_mul_pd(rinv01,rinvsq01),krf2));
1621 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1623 /* Calculate temporary vectorial force */
1624 tx = _mm_mul_pd(fscal,dx01);
1625 ty = _mm_mul_pd(fscal,dy01);
1626 tz = _mm_mul_pd(fscal,dz01);
1628 /* Update vectorial force */
1629 fix0 = _mm_add_pd(fix0,tx);
1630 fiy0 = _mm_add_pd(fiy0,ty);
1631 fiz0 = _mm_add_pd(fiz0,tz);
1633 fjx1 = _mm_add_pd(fjx1,tx);
1634 fjy1 = _mm_add_pd(fjy1,ty);
1635 fjz1 = _mm_add_pd(fjz1,tz);
1637 /**************************
1638 * CALCULATE INTERACTIONS *
1639 **************************/
1641 /* REACTION-FIELD ELECTROSTATICS */
1642 felec = _mm_mul_pd(qq02,_mm_sub_pd(_mm_mul_pd(rinv02,rinvsq02),krf2));
1646 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1648 /* Calculate temporary vectorial force */
1649 tx = _mm_mul_pd(fscal,dx02);
1650 ty = _mm_mul_pd(fscal,dy02);
1651 tz = _mm_mul_pd(fscal,dz02);
1653 /* Update vectorial force */
1654 fix0 = _mm_add_pd(fix0,tx);
1655 fiy0 = _mm_add_pd(fiy0,ty);
1656 fiz0 = _mm_add_pd(fiz0,tz);
1658 fjx2 = _mm_add_pd(fjx2,tx);
1659 fjy2 = _mm_add_pd(fjy2,ty);
1660 fjz2 = _mm_add_pd(fjz2,tz);
1662 /**************************
1663 * CALCULATE INTERACTIONS *
1664 **************************/
1666 /* REACTION-FIELD ELECTROSTATICS */
1667 felec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_mul_pd(rinv10,rinvsq10),krf2));
1671 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1673 /* Calculate temporary vectorial force */
1674 tx = _mm_mul_pd(fscal,dx10);
1675 ty = _mm_mul_pd(fscal,dy10);
1676 tz = _mm_mul_pd(fscal,dz10);
1678 /* Update vectorial force */
1679 fix1 = _mm_add_pd(fix1,tx);
1680 fiy1 = _mm_add_pd(fiy1,ty);
1681 fiz1 = _mm_add_pd(fiz1,tz);
1683 fjx0 = _mm_add_pd(fjx0,tx);
1684 fjy0 = _mm_add_pd(fjy0,ty);
1685 fjz0 = _mm_add_pd(fjz0,tz);
1687 /**************************
1688 * CALCULATE INTERACTIONS *
1689 **************************/
1691 /* REACTION-FIELD ELECTROSTATICS */
1692 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
1696 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1698 /* Calculate temporary vectorial force */
1699 tx = _mm_mul_pd(fscal,dx11);
1700 ty = _mm_mul_pd(fscal,dy11);
1701 tz = _mm_mul_pd(fscal,dz11);
1703 /* Update vectorial force */
1704 fix1 = _mm_add_pd(fix1,tx);
1705 fiy1 = _mm_add_pd(fiy1,ty);
1706 fiz1 = _mm_add_pd(fiz1,tz);
1708 fjx1 = _mm_add_pd(fjx1,tx);
1709 fjy1 = _mm_add_pd(fjy1,ty);
1710 fjz1 = _mm_add_pd(fjz1,tz);
1712 /**************************
1713 * CALCULATE INTERACTIONS *
1714 **************************/
1716 /* REACTION-FIELD ELECTROSTATICS */
1717 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
1721 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1723 /* Calculate temporary vectorial force */
1724 tx = _mm_mul_pd(fscal,dx12);
1725 ty = _mm_mul_pd(fscal,dy12);
1726 tz = _mm_mul_pd(fscal,dz12);
1728 /* Update vectorial force */
1729 fix1 = _mm_add_pd(fix1,tx);
1730 fiy1 = _mm_add_pd(fiy1,ty);
1731 fiz1 = _mm_add_pd(fiz1,tz);
1733 fjx2 = _mm_add_pd(fjx2,tx);
1734 fjy2 = _mm_add_pd(fjy2,ty);
1735 fjz2 = _mm_add_pd(fjz2,tz);
1737 /**************************
1738 * CALCULATE INTERACTIONS *
1739 **************************/
1741 /* REACTION-FIELD ELECTROSTATICS */
1742 felec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_mul_pd(rinv20,rinvsq20),krf2));
1746 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1748 /* Calculate temporary vectorial force */
1749 tx = _mm_mul_pd(fscal,dx20);
1750 ty = _mm_mul_pd(fscal,dy20);
1751 tz = _mm_mul_pd(fscal,dz20);
1753 /* Update vectorial force */
1754 fix2 = _mm_add_pd(fix2,tx);
1755 fiy2 = _mm_add_pd(fiy2,ty);
1756 fiz2 = _mm_add_pd(fiz2,tz);
1758 fjx0 = _mm_add_pd(fjx0,tx);
1759 fjy0 = _mm_add_pd(fjy0,ty);
1760 fjz0 = _mm_add_pd(fjz0,tz);
1762 /**************************
1763 * CALCULATE INTERACTIONS *
1764 **************************/
1766 /* REACTION-FIELD ELECTROSTATICS */
1767 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
1771 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1773 /* Calculate temporary vectorial force */
1774 tx = _mm_mul_pd(fscal,dx21);
1775 ty = _mm_mul_pd(fscal,dy21);
1776 tz = _mm_mul_pd(fscal,dz21);
1778 /* Update vectorial force */
1779 fix2 = _mm_add_pd(fix2,tx);
1780 fiy2 = _mm_add_pd(fiy2,ty);
1781 fiz2 = _mm_add_pd(fiz2,tz);
1783 fjx1 = _mm_add_pd(fjx1,tx);
1784 fjy1 = _mm_add_pd(fjy1,ty);
1785 fjz1 = _mm_add_pd(fjz1,tz);
1787 /**************************
1788 * CALCULATE INTERACTIONS *
1789 **************************/
1791 /* REACTION-FIELD ELECTROSTATICS */
1792 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
1796 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1798 /* Calculate temporary vectorial force */
1799 tx = _mm_mul_pd(fscal,dx22);
1800 ty = _mm_mul_pd(fscal,dy22);
1801 tz = _mm_mul_pd(fscal,dz22);
1803 /* Update vectorial force */
1804 fix2 = _mm_add_pd(fix2,tx);
1805 fiy2 = _mm_add_pd(fiy2,ty);
1806 fiz2 = _mm_add_pd(fiz2,tz);
1808 fjx2 = _mm_add_pd(fjx2,tx);
1809 fjy2 = _mm_add_pd(fjy2,ty);
1810 fjz2 = _mm_add_pd(fjz2,tz);
1812 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1814 /* Inner loop uses 270 flops */
1817 /* End of innermost loop */
1819 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1820 f+i_coord_offset,fshift+i_shift_offset);
1822 /* Increment number of inner iterations */
1823 inneriter += j_index_end - j_index_start;
1825 /* Outer loop uses 18 flops */
1828 /* Increment number of outer iterations */
1831 /* Update outer/inner flops */
1833 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*270);