2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/legacyheaders/nrnb.h"
49 #include "kernelutil_sparc64_hpc_ace_double.h"
52 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sparc64_hpc_ace_double
53 * Electrostatics interaction: Ewald
54 * VdW interaction: LennardJones
55 * Geometry: Water4-Particle
56 * Calculate force/pot: PotentialAndForce
/*
 * Potential-and-force kernel for a four-site i water (suffixes 0..3) interacting
 * with single j particles, handling two j entries (A and B) per _fjsp_v2r8 value.
 *
 * What the visible code does:
 *  - i site 0 takes part only in the Lennard-Jones term (c6/c12 read from vdwparam);
 *    i sites 1, 2 and 3 take part only in the tabulated Ewald term (iq1..iq3 times jq0).
 *  - Both terms are multiplied by the same switch polynomial, built from
 *    fr->rcoulomb_switch and fr->rcoulomb, and are masked where rsq is not below rcutoff2.
 *  - i force accumulators are handed to gmx_fjsp_update_iforce_4atom_swizzle_v2r8 together
 *    with f and fshift; j force accumulators are handed to the gmx_fjsp_decrement_* helpers.
 *  - velecsum and vvdwsum are handed to gmx_fjsp_update_1pot_v2r8 together with
 *    kernel_data->energygrp_elec+ggid and kernel_data->energygrp_vdw+ggid.
 *  - Work is reported through inc_nrnb as outeriter*26 + inneriter*269.
 *
 * Parameters, as used in the visible lines:
 *  nlist        reads jindex, shift and iinr[0]
 *  xx, ff       not referenced in the visible lines; presumably the sources of x and f
 *               (those assignments are not shown here - TODO confirm)
 *  fr           reads shift_vec, fshift, epsfac, ntype, rcoulomb, rcoulomb_switch and
 *               ic->sh_ewald, ic->tabq_coul_FDV0, ic->tabq_scale
 *  mdatoms      reads chargeA and typeA
 *  kernel_data  reads energygrp_elec and energygrp_vdw
 *               NOTE(review): tagged gmx_unused although it is dereferenced below
 *  nrnb         passed to inc_nrnb
 *
 * NOTE(review): several names used below (for example jnrA, jnrB, x, f, charge, vdwparam,
 * ewtab, itab_tmp, nri, ggid) have no visible declaration or assignment in this excerpt;
 * verify against the complete file.
 * NOTE(review): the suffix comment below speaks of four SIMD positions, while only two
 * j indices (A,B) and a two-element conversion union appear in this kernel - confirm wording.
 */
59 nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sparc64_hpc_ace_double
60 (t_nblist * gmx_restrict nlist,
61 rvec * gmx_restrict xx,
62 rvec * gmx_restrict ff,
63 t_forcerec * gmx_restrict fr,
64 t_mdatoms * gmx_restrict mdatoms,
65 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
66 t_nrnb * gmx_restrict nrnb)
68 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69 * just 0 for non-waters.
70 * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
71 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 int i_shift_offset,i_coord_offset,outeriter,inneriter;
74 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int j_coord_offsetA,j_coord_offsetB;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 _fjsp_v2r8 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
82 _fjsp_v2r8 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
84 _fjsp_v2r8 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
86 _fjsp_v2r8 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88 _fjsp_v2r8 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
89 int vdwjidx0A,vdwjidx0B;
90 _fjsp_v2r8 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91 _fjsp_v2r8 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
92 _fjsp_v2r8 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
93 _fjsp_v2r8 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
94 _fjsp_v2r8 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
95 _fjsp_v2r8 velec,felec,velecsum,facel,crf,krf,krf2;
98 _fjsp_v2r8 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
101 _fjsp_v2r8 one_sixth = gmx_fjsp_set1_v2r8(1.0/6.0);
102 _fjsp_v2r8 one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
103 _fjsp_v2r8 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
105 _fjsp_v2r8 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
106 real rswitch_scalar,d_scalar;
108 _fjsp_v2r8 dummy_mask,cutoff_mask;
109 _fjsp_v2r8 one = gmx_fjsp_set1_v2r8(1.0);
110 _fjsp_v2r8 two = gmx_fjsp_set1_v2r8(2.0);
111 union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
118 jindex = nlist->jindex;
120 shiftidx = nlist->shift;
122 shiftvec = fr->shift_vec[0];
123 fshift = fr->fshift[0];
124 facel = gmx_fjsp_set1_v2r8(fr->epsfac);
125 charge = mdatoms->chargeA;
126 nvdwtype = fr->ntype;
128 vdwtype = mdatoms->typeA;
130 sh_ewald = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
131 ewtab = fr->ic->tabq_coul_FDV0;
132 ewtabscale = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
133 ewtabhalfspace = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
135 /* Setup water-specific parameters */
/* Charges for sites 1-3 and the VdW type offset for site 0 are read once, from the first i entry (nlist->iinr[0]); no per-iteration reload is visible in the outer loop. */
136 inr = nlist->iinr[0];
137 iq1 = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
138 iq2 = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
139 iq3 = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
140 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
142 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
143 rcutoff_scalar = fr->rcoulomb;
144 rcutoff = gmx_fjsp_set1_v2r8(rcutoff_scalar);
145 rcutoff2 = _fjsp_mul_v2r8(rcutoff,rcutoff);
147 rswitch_scalar = fr->rcoulomb_switch;
148 rswitch = gmx_fjsp_set1_v2r8(rswitch_scalar);
149 /* Setup switch parameters */
150 d_scalar = rcutoff_scalar-rswitch_scalar;
151 d = gmx_fjsp_set1_v2r8(d_scalar);
/* swV3..swV5 are the coefficients used for sw and swF2..swF4 those used for dsw in every interaction section below; all are scaled by powers of the switch width d_scalar. */
152 swV3 = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
153 swV4 = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
154 swV5 = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
155 swF2 = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
156 swF3 = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
157 swF4 = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
159 /* Avoid stupid compiler warnings */
167 /* Start outer loop over neighborlists */
168 for(iidx=0; iidx<nri; iidx++)
170 /* Load shift vector for this list */
171 i_shift_offset = DIM*shiftidx[iidx];
173 /* Load limits for loop over neighbors */
174 j_index_start = jindex[iidx];
175 j_index_end = jindex[iidx+1];
177 /* Get outer coordinate index */
179 i_coord_offset = DIM*inr;
181 /* Load i particle coords and add shift vector */
182 gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
183 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
185 fix0 = _fjsp_setzero_v2r8();
186 fiy0 = _fjsp_setzero_v2r8();
187 fiz0 = _fjsp_setzero_v2r8();
188 fix1 = _fjsp_setzero_v2r8();
189 fiy1 = _fjsp_setzero_v2r8();
190 fiz1 = _fjsp_setzero_v2r8();
191 fix2 = _fjsp_setzero_v2r8();
192 fiy2 = _fjsp_setzero_v2r8();
193 fiz2 = _fjsp_setzero_v2r8();
194 fix3 = _fjsp_setzero_v2r8();
195 fiy3 = _fjsp_setzero_v2r8();
196 fiz3 = _fjsp_setzero_v2r8();
198 /* Reset potential sums */
199 velecsum = _fjsp_setzero_v2r8();
200 vvdwsum = _fjsp_setzero_v2r8();
202 /* Start inner kernel loop */
/* This loop consumes j entries in pairs (A,B); a trailing unpaired entry is presumably handled by the single-j section further down - its guard is not visible here. */
203 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
206 /* Get j neighbor index, and coordinate index */
209 j_coord_offsetA = DIM*jnrA;
210 j_coord_offsetB = DIM*jnrB;
212 /* load j atom coordinates */
213 gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
216 /* Calculate displacement vector */
217 dx00 = _fjsp_sub_v2r8(ix0,jx0);
218 dy00 = _fjsp_sub_v2r8(iy0,jy0);
219 dz00 = _fjsp_sub_v2r8(iz0,jz0);
220 dx10 = _fjsp_sub_v2r8(ix1,jx0);
221 dy10 = _fjsp_sub_v2r8(iy1,jy0);
222 dz10 = _fjsp_sub_v2r8(iz1,jz0);
223 dx20 = _fjsp_sub_v2r8(ix2,jx0);
224 dy20 = _fjsp_sub_v2r8(iy2,jy0);
225 dz20 = _fjsp_sub_v2r8(iz2,jz0);
226 dx30 = _fjsp_sub_v2r8(ix3,jx0);
227 dy30 = _fjsp_sub_v2r8(iy3,jy0);
228 dz30 = _fjsp_sub_v2r8(iz3,jz0);
230 /* Calculate squared distance and things based on it */
231 rsq00 = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
232 rsq10 = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
233 rsq20 = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
234 rsq30 = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
236 rinv00 = gmx_fjsp_invsqrt_v2r8(rsq00);
237 rinv10 = gmx_fjsp_invsqrt_v2r8(rsq10);
238 rinv20 = gmx_fjsp_invsqrt_v2r8(rsq20);
239 rinv30 = gmx_fjsp_invsqrt_v2r8(rsq30);
241 rinvsq00 = _fjsp_mul_v2r8(rinv00,rinv00);
242 rinvsq10 = _fjsp_mul_v2r8(rinv10,rinv10);
243 rinvsq20 = _fjsp_mul_v2r8(rinv20,rinv20);
244 rinvsq30 = _fjsp_mul_v2r8(rinv30,rinv30);
246 /* Load parameters for j particles */
247 jq0 = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
248 vdwjidx0A = 2*vdwtype[jnrA+0];
249 vdwjidx0B = 2*vdwtype[jnrB+0];
251 fjx0 = _fjsp_setzero_v2r8();
252 fjy0 = _fjsp_setzero_v2r8();
253 fjz0 = _fjsp_setzero_v2r8();
255 /**************************
256 * CALCULATE INTERACTIONS *
257 **************************/
259 if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
262 r00 = _fjsp_mul_v2r8(rsq00,rinv00);
264 /* Compute parameters for interactions between i and j atoms */
265 gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
266 vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
268 /* LENNARD-JONES DISPERSION/REPULSION */
270 rinvsix = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
271 vvdw6 = _fjsp_mul_v2r8(c6_00,rinvsix);
272 vvdw12 = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
273 vvdw = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
274 fvdw = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
276 d = _fjsp_sub_v2r8(r00,rswitch);
277 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
278 d2 = _fjsp_mul_v2r8(d,d);
/* Assuming _fjsp_madd_v2r8(a,b,c) evaluates a*b+c (TODO confirm): sw = 1 + d^3*(swV3 + d*(swV4 + d*swV5)) and dsw = d^2*(swF2 + d*(swF3 + d*swF4)), with d clamped to zero from below. The same pattern repeats in every section. */
279 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
281 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
283 /* Evaluate switch function */
284 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
285 fvdw = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
286 vvdw = _fjsp_mul_v2r8(vvdw,sw);
287 cutoff_mask = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
289 /* Update potential sum for this i atom from the interaction with this j atom. */
290 vvdw = _fjsp_and_v2r8(vvdw,cutoff_mask);
291 vvdwsum = _fjsp_add_v2r8(vvdwsum,vvdw);
/* NOTE(review): no assignment to fscal is visible before this mask, here or in the other sections; presumably it is set from fvdw or felec on a line not shown - verify against the complete file. */
295 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
297 /* Update vectorial force */
298 fix0 = _fjsp_madd_v2r8(dx00,fscal,fix0);
299 fiy0 = _fjsp_madd_v2r8(dy00,fscal,fiy0);
300 fiz0 = _fjsp_madd_v2r8(dz00,fscal,fiz0);
302 fjx0 = _fjsp_madd_v2r8(dx00,fscal,fjx0);
303 fjy0 = _fjsp_madd_v2r8(dy00,fscal,fjy0);
304 fjz0 = _fjsp_madd_v2r8(dz00,fscal,fjz0);
308 /**************************
309 * CALCULATE INTERACTIONS *
310 **************************/
312 if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
315 r10 = _fjsp_mul_v2r8(rsq10,rinv10);
317 /* Compute parameters for interactions between i and j atoms */
318 qq10 = _fjsp_mul_v2r8(iq1,jq0);
320 /* EWALD ELECTROSTATICS */
322 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
323 ewrt = _fjsp_mul_v2r8(r10,ewtabscale);
324 itab_tmp = _fjsp_dtox_v2r8(ewrt);
325 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
326 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
/* The two lane indices are read back through ewconv.i[0] and ewconv.i[1]; each table entry is addressed with a stride of 4 reals, with loads at offset +0 and +2 (presumably the F,D and V parts, judging by the tabq_coul_FDV0 name). */
328 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
329 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
330 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
331 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
332 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
333 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
334 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
335 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
336 velec = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
337 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
339 d = _fjsp_sub_v2r8(r10,rswitch);
340 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
341 d2 = _fjsp_mul_v2r8(d,d);
342 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
344 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
346 /* Evaluate switch function */
347 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
348 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
349 velec = _fjsp_mul_v2r8(velec,sw);
350 cutoff_mask = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
352 /* Update potential sum for this i atom from the interaction with this j atom. */
353 velec = _fjsp_and_v2r8(velec,cutoff_mask);
354 velecsum = _fjsp_add_v2r8(velecsum,velec);
358 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
360 /* Update vectorial force */
361 fix1 = _fjsp_madd_v2r8(dx10,fscal,fix1);
362 fiy1 = _fjsp_madd_v2r8(dy10,fscal,fiy1);
363 fiz1 = _fjsp_madd_v2r8(dz10,fscal,fiz1);
365 fjx0 = _fjsp_madd_v2r8(dx10,fscal,fjx0);
366 fjy0 = _fjsp_madd_v2r8(dy10,fscal,fjy0);
367 fjz0 = _fjsp_madd_v2r8(dz10,fscal,fjz0);
371 /**************************
372 * CALCULATE INTERACTIONS *
373 **************************/
375 if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
378 r20 = _fjsp_mul_v2r8(rsq20,rinv20);
380 /* Compute parameters for interactions between i and j atoms */
381 qq20 = _fjsp_mul_v2r8(iq2,jq0);
383 /* EWALD ELECTROSTATICS */
385 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
386 ewrt = _fjsp_mul_v2r8(r20,ewtabscale);
387 itab_tmp = _fjsp_dtox_v2r8(ewrt);
388 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
389 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
391 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
392 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
393 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
394 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
395 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
396 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
397 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
398 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
399 velec = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
400 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
402 d = _fjsp_sub_v2r8(r20,rswitch);
403 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
404 d2 = _fjsp_mul_v2r8(d,d);
405 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
407 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
409 /* Evaluate switch function */
410 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
411 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
412 velec = _fjsp_mul_v2r8(velec,sw);
413 cutoff_mask = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
415 /* Update potential sum for this i atom from the interaction with this j atom. */
416 velec = _fjsp_and_v2r8(velec,cutoff_mask);
417 velecsum = _fjsp_add_v2r8(velecsum,velec);
421 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
423 /* Update vectorial force */
424 fix2 = _fjsp_madd_v2r8(dx20,fscal,fix2);
425 fiy2 = _fjsp_madd_v2r8(dy20,fscal,fiy2);
426 fiz2 = _fjsp_madd_v2r8(dz20,fscal,fiz2);
428 fjx0 = _fjsp_madd_v2r8(dx20,fscal,fjx0);
429 fjy0 = _fjsp_madd_v2r8(dy20,fscal,fjy0);
430 fjz0 = _fjsp_madd_v2r8(dz20,fscal,fjz0);
434 /**************************
435 * CALCULATE INTERACTIONS *
436 **************************/
438 if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
441 r30 = _fjsp_mul_v2r8(rsq30,rinv30);
443 /* Compute parameters for interactions between i and j atoms */
444 qq30 = _fjsp_mul_v2r8(iq3,jq0);
446 /* EWALD ELECTROSTATICS */
448 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
449 ewrt = _fjsp_mul_v2r8(r30,ewtabscale);
450 itab_tmp = _fjsp_dtox_v2r8(ewrt);
451 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
452 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
454 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
455 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
456 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
457 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
458 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
459 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
460 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
461 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
462 velec = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
463 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
465 d = _fjsp_sub_v2r8(r30,rswitch);
466 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
467 d2 = _fjsp_mul_v2r8(d,d);
468 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
470 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
472 /* Evaluate switch function */
473 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
474 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv30,_fjsp_mul_v2r8(velec,dsw)) );
475 velec = _fjsp_mul_v2r8(velec,sw);
476 cutoff_mask = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
478 /* Update potential sum for this i atom from the interaction with this j atom. */
479 velec = _fjsp_and_v2r8(velec,cutoff_mask);
480 velecsum = _fjsp_add_v2r8(velecsum,velec);
484 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
486 /* Update vectorial force */
487 fix3 = _fjsp_madd_v2r8(dx30,fscal,fix3);
488 fiy3 = _fjsp_madd_v2r8(dy30,fscal,fiy3);
489 fiz3 = _fjsp_madd_v2r8(dz30,fscal,fiz3);
491 fjx0 = _fjsp_madd_v2r8(dx30,fscal,fjx0);
492 fjy0 = _fjsp_madd_v2r8(dy30,fscal,fjy0);
493 fjz0 = _fjsp_madd_v2r8(dz30,fscal,fjz0);
497 gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
499 /* Inner loop uses 269 flops */
/* Single-j variant of the section above: only the A index and offset are used, loads go through the 1ptr and loadl helpers, and each energy and fscal value is passed through _fjsp_unpacklo_v2r8 with a zero vector (presumably to clear the unused second lane - TODO confirm). */
506 j_coord_offsetA = DIM*jnrA;
508 /* load j atom coordinates */
509 gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
512 /* Calculate displacement vector */
513 dx00 = _fjsp_sub_v2r8(ix0,jx0);
514 dy00 = _fjsp_sub_v2r8(iy0,jy0);
515 dz00 = _fjsp_sub_v2r8(iz0,jz0);
516 dx10 = _fjsp_sub_v2r8(ix1,jx0);
517 dy10 = _fjsp_sub_v2r8(iy1,jy0);
518 dz10 = _fjsp_sub_v2r8(iz1,jz0);
519 dx20 = _fjsp_sub_v2r8(ix2,jx0);
520 dy20 = _fjsp_sub_v2r8(iy2,jy0);
521 dz20 = _fjsp_sub_v2r8(iz2,jz0);
522 dx30 = _fjsp_sub_v2r8(ix3,jx0);
523 dy30 = _fjsp_sub_v2r8(iy3,jy0);
524 dz30 = _fjsp_sub_v2r8(iz3,jz0);
526 /* Calculate squared distance and things based on it */
527 rsq00 = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
528 rsq10 = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
529 rsq20 = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
530 rsq30 = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
532 rinv00 = gmx_fjsp_invsqrt_v2r8(rsq00);
533 rinv10 = gmx_fjsp_invsqrt_v2r8(rsq10);
534 rinv20 = gmx_fjsp_invsqrt_v2r8(rsq20);
535 rinv30 = gmx_fjsp_invsqrt_v2r8(rsq30);
537 rinvsq00 = _fjsp_mul_v2r8(rinv00,rinv00);
538 rinvsq10 = _fjsp_mul_v2r8(rinv10,rinv10);
539 rinvsq20 = _fjsp_mul_v2r8(rinv20,rinv20);
540 rinvsq30 = _fjsp_mul_v2r8(rinv30,rinv30);
542 /* Load parameters for j particles */
543 jq0 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
544 vdwjidx0A = 2*vdwtype[jnrA+0];
546 fjx0 = _fjsp_setzero_v2r8();
547 fjy0 = _fjsp_setzero_v2r8();
548 fjz0 = _fjsp_setzero_v2r8();
550 /**************************
551 * CALCULATE INTERACTIONS *
552 **************************/
554 if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
557 r00 = _fjsp_mul_v2r8(rsq00,rinv00);
559 /* Compute parameters for interactions between i and j atoms */
/* NOTE(review): vdwjidx0B is passed here although this single-j section assigns only vdwjidx0A; B keeps whatever the paired loop left in it, or has no visible assignment at all if that loop never ran. The B results are discarded later, but confirm the load through B is always a valid offset. */
560 gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
561 vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
563 /* LENNARD-JONES DISPERSION/REPULSION */
565 rinvsix = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
566 vvdw6 = _fjsp_mul_v2r8(c6_00,rinvsix);
567 vvdw12 = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
568 vvdw = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
569 fvdw = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
571 d = _fjsp_sub_v2r8(r00,rswitch);
572 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
573 d2 = _fjsp_mul_v2r8(d,d);
574 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
576 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
578 /* Evaluate switch function */
579 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
580 fvdw = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
581 vvdw = _fjsp_mul_v2r8(vvdw,sw);
582 cutoff_mask = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
584 /* Update potential sum for this i atom from the interaction with this j atom. */
585 vvdw = _fjsp_and_v2r8(vvdw,cutoff_mask);
586 vvdw = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
587 vvdwsum = _fjsp_add_v2r8(vvdwsum,vvdw);
591 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
593 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
595 /* Update vectorial force */
596 fix0 = _fjsp_madd_v2r8(dx00,fscal,fix0);
597 fiy0 = _fjsp_madd_v2r8(dy00,fscal,fiy0);
598 fiz0 = _fjsp_madd_v2r8(dz00,fscal,fiz0);
600 fjx0 = _fjsp_madd_v2r8(dx00,fscal,fjx0);
601 fjy0 = _fjsp_madd_v2r8(dy00,fscal,fjy0);
602 fjz0 = _fjsp_madd_v2r8(dz00,fscal,fjz0);
606 /**************************
607 * CALCULATE INTERACTIONS *
608 **************************/
610 if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
613 r10 = _fjsp_mul_v2r8(rsq10,rinv10);
615 /* Compute parameters for interactions between i and j atoms */
616 qq10 = _fjsp_mul_v2r8(iq1,jq0);
618 /* EWALD ELECTROSTATICS */
620 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
621 ewrt = _fjsp_mul_v2r8(r10,ewtabscale);
622 itab_tmp = _fjsp_dtox_v2r8(ewrt);
623 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
624 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
626 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
627 ewtabD = _fjsp_setzero_v2r8();
628 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
629 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
630 ewtabFn = _fjsp_setzero_v2r8();
631 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
632 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
633 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
634 velec = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
635 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
637 d = _fjsp_sub_v2r8(r10,rswitch);
638 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
639 d2 = _fjsp_mul_v2r8(d,d);
640 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
642 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
644 /* Evaluate switch function */
645 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
646 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
647 velec = _fjsp_mul_v2r8(velec,sw);
648 cutoff_mask = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
650 /* Update potential sum for this i atom from the interaction with this j atom. */
651 velec = _fjsp_and_v2r8(velec,cutoff_mask);
652 velec = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
653 velecsum = _fjsp_add_v2r8(velecsum,velec);
657 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
659 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
661 /* Update vectorial force */
662 fix1 = _fjsp_madd_v2r8(dx10,fscal,fix1);
663 fiy1 = _fjsp_madd_v2r8(dy10,fscal,fiy1);
664 fiz1 = _fjsp_madd_v2r8(dz10,fscal,fiz1);
666 fjx0 = _fjsp_madd_v2r8(dx10,fscal,fjx0);
667 fjy0 = _fjsp_madd_v2r8(dy10,fscal,fjy0);
668 fjz0 = _fjsp_madd_v2r8(dz10,fscal,fjz0);
672 /**************************
673 * CALCULATE INTERACTIONS *
674 **************************/
676 if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
679 r20 = _fjsp_mul_v2r8(rsq20,rinv20);
681 /* Compute parameters for interactions between i and j atoms */
682 qq20 = _fjsp_mul_v2r8(iq2,jq0);
684 /* EWALD ELECTROSTATICS */
686 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
687 ewrt = _fjsp_mul_v2r8(r20,ewtabscale);
688 itab_tmp = _fjsp_dtox_v2r8(ewrt);
689 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
690 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
692 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
693 ewtabD = _fjsp_setzero_v2r8();
694 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
695 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
696 ewtabFn = _fjsp_setzero_v2r8();
697 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
698 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
699 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
700 velec = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
701 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
703 d = _fjsp_sub_v2r8(r20,rswitch);
704 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
705 d2 = _fjsp_mul_v2r8(d,d);
706 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
708 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
710 /* Evaluate switch function */
711 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
712 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
713 velec = _fjsp_mul_v2r8(velec,sw);
714 cutoff_mask = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
716 /* Update potential sum for this i atom from the interaction with this j atom. */
717 velec = _fjsp_and_v2r8(velec,cutoff_mask);
718 velec = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
719 velecsum = _fjsp_add_v2r8(velecsum,velec);
723 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
725 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
727 /* Update vectorial force */
728 fix2 = _fjsp_madd_v2r8(dx20,fscal,fix2);
729 fiy2 = _fjsp_madd_v2r8(dy20,fscal,fiy2);
730 fiz2 = _fjsp_madd_v2r8(dz20,fscal,fiz2);
732 fjx0 = _fjsp_madd_v2r8(dx20,fscal,fjx0);
733 fjy0 = _fjsp_madd_v2r8(dy20,fscal,fjy0);
734 fjz0 = _fjsp_madd_v2r8(dz20,fscal,fjz0);
738 /**************************
739 * CALCULATE INTERACTIONS *
740 **************************/
742 if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
745 r30 = _fjsp_mul_v2r8(rsq30,rinv30);
747 /* Compute parameters for interactions between i and j atoms */
748 qq30 = _fjsp_mul_v2r8(iq3,jq0);
750 /* EWALD ELECTROSTATICS */
752 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
753 ewrt = _fjsp_mul_v2r8(r30,ewtabscale);
754 itab_tmp = _fjsp_dtox_v2r8(ewrt);
755 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
756 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
758 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
759 ewtabD = _fjsp_setzero_v2r8();
760 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
761 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
762 ewtabFn = _fjsp_setzero_v2r8();
763 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
764 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
765 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
766 velec = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
767 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
769 d = _fjsp_sub_v2r8(r30,rswitch);
770 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
771 d2 = _fjsp_mul_v2r8(d,d);
772 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
774 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
776 /* Evaluate switch function */
777 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
778 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv30,_fjsp_mul_v2r8(velec,dsw)) );
779 velec = _fjsp_mul_v2r8(velec,sw);
780 cutoff_mask = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
782 /* Update potential sum for this i atom from the interaction with this j atom. */
783 velec = _fjsp_and_v2r8(velec,cutoff_mask);
784 velec = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
785 velecsum = _fjsp_add_v2r8(velecsum,velec);
789 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
791 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
793 /* Update vectorial force */
794 fix3 = _fjsp_madd_v2r8(dx30,fscal,fix3);
795 fiy3 = _fjsp_madd_v2r8(dy30,fscal,fiy3);
796 fiz3 = _fjsp_madd_v2r8(dz30,fscal,fiz3);
798 fjx0 = _fjsp_madd_v2r8(dx30,fscal,fjx0);
799 fjy0 = _fjsp_madd_v2r8(dy30,fscal,fjy0);
800 fjz0 = _fjsp_madd_v2r8(dz30,fscal,fjz0);
804 gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
806 /* Inner loop uses 269 flops */
809 /* End of innermost loop */
/* Hand the twelve i-force accumulators to the update helper together with this i entry's slots in f and fshift. */
811 gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
812 f+i_coord_offset,fshift+i_shift_offset);
815 /* Update potential energies */
816 gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
817 gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
819 /* Increment number of inner iterations */
820 inneriter += j_index_end - j_index_start;
822 /* Outer loop uses 26 flops */
825 /* Increment number of outer iterations */
828 /* Update outer/inner flops */
/* The weights 26 and 269 match the per-loop flop counts quoted in the comments above. */
830 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*269);
833 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sparc64_hpc_ace_double
834 * Electrostatics interaction: Ewald
835 * VdW interaction: LennardJones
836 * Geometry: Water4-Particle
837 * Calculate force/pot: Force
840 nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sparc64_hpc_ace_double
841 (t_nblist * gmx_restrict nlist,
842 rvec * gmx_restrict xx,
843 rvec * gmx_restrict ff,
844 t_forcerec * gmx_restrict fr,
845 t_mdatoms * gmx_restrict mdatoms,
846 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
847 t_nrnb * gmx_restrict nrnb)
849 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
850 * just 0 for non-waters.
851 * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
852 * jnr indices corresponding to data put in the four positions in the SIMD register.
854 int i_shift_offset,i_coord_offset,outeriter,inneriter;
855 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
857 int j_coord_offsetA,j_coord_offsetB;
858 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
860 real *shiftvec,*fshift,*x,*f;
861 _fjsp_v2r8 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
863 _fjsp_v2r8 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
865 _fjsp_v2r8 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
867 _fjsp_v2r8 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
869 _fjsp_v2r8 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
870 int vdwjidx0A,vdwjidx0B;
871 _fjsp_v2r8 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
872 _fjsp_v2r8 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
873 _fjsp_v2r8 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
874 _fjsp_v2r8 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
875 _fjsp_v2r8 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
876 _fjsp_v2r8 velec,felec,velecsum,facel,crf,krf,krf2;
879 _fjsp_v2r8 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
882 _fjsp_v2r8 one_sixth = gmx_fjsp_set1_v2r8(1.0/6.0);
883 _fjsp_v2r8 one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
884 _fjsp_v2r8 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
886 _fjsp_v2r8 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
887 real rswitch_scalar,d_scalar;
889 _fjsp_v2r8 dummy_mask,cutoff_mask;
890 _fjsp_v2r8 one = gmx_fjsp_set1_v2r8(1.0);
891 _fjsp_v2r8 two = gmx_fjsp_set1_v2r8(2.0);
892 union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
899 jindex = nlist->jindex;
901 shiftidx = nlist->shift;
903 shiftvec = fr->shift_vec[0];
904 fshift = fr->fshift[0];
905 facel = gmx_fjsp_set1_v2r8(fr->epsfac);
906 charge = mdatoms->chargeA;
907 nvdwtype = fr->ntype;
909 vdwtype = mdatoms->typeA;
911 sh_ewald = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
912 ewtab = fr->ic->tabq_coul_FDV0;
913 ewtabscale = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
914 ewtabhalfspace = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
916 /* Setup water-specific parameters */
917 inr = nlist->iinr[0];
918 iq1 = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
919 iq2 = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
920 iq3 = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
921 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
923 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
924 rcutoff_scalar = fr->rcoulomb;
925 rcutoff = gmx_fjsp_set1_v2r8(rcutoff_scalar);
926 rcutoff2 = _fjsp_mul_v2r8(rcutoff,rcutoff);
928 rswitch_scalar = fr->rcoulomb_switch;
929 rswitch = gmx_fjsp_set1_v2r8(rswitch_scalar);
930 /* Setup switch parameters */
931 d_scalar = rcutoff_scalar-rswitch_scalar;
932 d = gmx_fjsp_set1_v2r8(d_scalar);
933 swV3 = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
934 swV4 = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
935 swV5 = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
936 swF2 = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
937 swF3 = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
938 swF4 = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
940 /* Avoid stupid compiler warnings */
948 /* Start outer loop over neighborlists */
949 for(iidx=0; iidx<nri; iidx++)
951 /* Load shift vector for this list */
952 i_shift_offset = DIM*shiftidx[iidx];
954 /* Load limits for loop over neighbors */
955 j_index_start = jindex[iidx];
956 j_index_end = jindex[iidx+1];
958 /* Get outer coordinate index */
960 i_coord_offset = DIM*inr;
962 /* Load i particle coords and add shift vector */
963 gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
964 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
966 fix0 = _fjsp_setzero_v2r8();
967 fiy0 = _fjsp_setzero_v2r8();
968 fiz0 = _fjsp_setzero_v2r8();
969 fix1 = _fjsp_setzero_v2r8();
970 fiy1 = _fjsp_setzero_v2r8();
971 fiz1 = _fjsp_setzero_v2r8();
972 fix2 = _fjsp_setzero_v2r8();
973 fiy2 = _fjsp_setzero_v2r8();
974 fiz2 = _fjsp_setzero_v2r8();
975 fix3 = _fjsp_setzero_v2r8();
976 fiy3 = _fjsp_setzero_v2r8();
977 fiz3 = _fjsp_setzero_v2r8();
979 /* Start inner kernel loop */
980 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
983 /* Get j neighbor index, and coordinate index */
986 j_coord_offsetA = DIM*jnrA;
987 j_coord_offsetB = DIM*jnrB;
989 /* load j atom coordinates */
990 gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
993 /* Calculate displacement vector */
994 dx00 = _fjsp_sub_v2r8(ix0,jx0);
995 dy00 = _fjsp_sub_v2r8(iy0,jy0);
996 dz00 = _fjsp_sub_v2r8(iz0,jz0);
997 dx10 = _fjsp_sub_v2r8(ix1,jx0);
998 dy10 = _fjsp_sub_v2r8(iy1,jy0);
999 dz10 = _fjsp_sub_v2r8(iz1,jz0);
1000 dx20 = _fjsp_sub_v2r8(ix2,jx0);
1001 dy20 = _fjsp_sub_v2r8(iy2,jy0);
1002 dz20 = _fjsp_sub_v2r8(iz2,jz0);
1003 dx30 = _fjsp_sub_v2r8(ix3,jx0);
1004 dy30 = _fjsp_sub_v2r8(iy3,jy0);
1005 dz30 = _fjsp_sub_v2r8(iz3,jz0);
1007 /* Calculate squared distance and things based on it */
1008 rsq00 = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
1009 rsq10 = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
1010 rsq20 = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
1011 rsq30 = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
1013 rinv00 = gmx_fjsp_invsqrt_v2r8(rsq00);
1014 rinv10 = gmx_fjsp_invsqrt_v2r8(rsq10);
1015 rinv20 = gmx_fjsp_invsqrt_v2r8(rsq20);
1016 rinv30 = gmx_fjsp_invsqrt_v2r8(rsq30);
1018 rinvsq00 = _fjsp_mul_v2r8(rinv00,rinv00);
1019 rinvsq10 = _fjsp_mul_v2r8(rinv10,rinv10);
1020 rinvsq20 = _fjsp_mul_v2r8(rinv20,rinv20);
1021 rinvsq30 = _fjsp_mul_v2r8(rinv30,rinv30);
1023 /* Load parameters for j particles */
1024 jq0 = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
1025 vdwjidx0A = 2*vdwtype[jnrA+0];
1026 vdwjidx0B = 2*vdwtype[jnrB+0];
1028 fjx0 = _fjsp_setzero_v2r8();
1029 fjy0 = _fjsp_setzero_v2r8();
1030 fjz0 = _fjsp_setzero_v2r8();
1032 /**************************
1033 * CALCULATE INTERACTIONS *
1034 **************************/
1036 if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
1039 r00 = _fjsp_mul_v2r8(rsq00,rinv00);
1041 /* Compute parameters for interactions between i and j atoms */
1042 gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
1043 vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
1045 /* LENNARD-JONES DISPERSION/REPULSION */
1047 rinvsix = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
1048 vvdw6 = _fjsp_mul_v2r8(c6_00,rinvsix);
1049 vvdw12 = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
1050 vvdw = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
1051 fvdw = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
1053 d = _fjsp_sub_v2r8(r00,rswitch);
1054 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1055 d2 = _fjsp_mul_v2r8(d,d);
1056 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1058 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1060 /* Evaluate switch function */
1061 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1062 fvdw = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
1063 cutoff_mask = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
1067 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1069 /* Update vectorial force */
1070 fix0 = _fjsp_madd_v2r8(dx00,fscal,fix0);
1071 fiy0 = _fjsp_madd_v2r8(dy00,fscal,fiy0);
1072 fiz0 = _fjsp_madd_v2r8(dz00,fscal,fiz0);
1074 fjx0 = _fjsp_madd_v2r8(dx00,fscal,fjx0);
1075 fjy0 = _fjsp_madd_v2r8(dy00,fscal,fjy0);
1076 fjz0 = _fjsp_madd_v2r8(dz00,fscal,fjz0);
1080 /**************************
1081 * CALCULATE INTERACTIONS *
1082 **************************/
1084 if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
1087 r10 = _fjsp_mul_v2r8(rsq10,rinv10);
1089 /* Compute parameters for interactions between i and j atoms */
1090 qq10 = _fjsp_mul_v2r8(iq1,jq0);
1092 /* EWALD ELECTROSTATICS */
1094 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1095 ewrt = _fjsp_mul_v2r8(r10,ewtabscale);
1096 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1097 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1098 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1100 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1101 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
1102 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1103 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1104 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
1105 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1106 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1107 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1108 velec = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
1109 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
1111 d = _fjsp_sub_v2r8(r10,rswitch);
1112 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1113 d2 = _fjsp_mul_v2r8(d,d);
1114 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1116 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1118 /* Evaluate switch function */
1119 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1120 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
1121 cutoff_mask = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
1125 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1127 /* Update vectorial force */
1128 fix1 = _fjsp_madd_v2r8(dx10,fscal,fix1);
1129 fiy1 = _fjsp_madd_v2r8(dy10,fscal,fiy1);
1130 fiz1 = _fjsp_madd_v2r8(dz10,fscal,fiz1);
1132 fjx0 = _fjsp_madd_v2r8(dx10,fscal,fjx0);
1133 fjy0 = _fjsp_madd_v2r8(dy10,fscal,fjy0);
1134 fjz0 = _fjsp_madd_v2r8(dz10,fscal,fjz0);
1138 /**************************
1139 * CALCULATE INTERACTIONS *
1140 **************************/
1142 if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
1145 r20 = _fjsp_mul_v2r8(rsq20,rinv20);
1147 /* Compute parameters for interactions between i and j atoms */
1148 qq20 = _fjsp_mul_v2r8(iq2,jq0);
1150 /* EWALD ELECTROSTATICS */
1152 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1153 ewrt = _fjsp_mul_v2r8(r20,ewtabscale);
1154 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1155 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1156 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1158 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1159 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
1160 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1161 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1162 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
1163 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1164 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1165 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1166 velec = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
1167 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
1169 d = _fjsp_sub_v2r8(r20,rswitch);
1170 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1171 d2 = _fjsp_mul_v2r8(d,d);
1172 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1174 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1176 /* Evaluate switch function */
1177 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1178 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
1179 cutoff_mask = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
1183 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1185 /* Update vectorial force */
1186 fix2 = _fjsp_madd_v2r8(dx20,fscal,fix2);
1187 fiy2 = _fjsp_madd_v2r8(dy20,fscal,fiy2);
1188 fiz2 = _fjsp_madd_v2r8(dz20,fscal,fiz2);
1190 fjx0 = _fjsp_madd_v2r8(dx20,fscal,fjx0);
1191 fjy0 = _fjsp_madd_v2r8(dy20,fscal,fjy0);
1192 fjz0 = _fjsp_madd_v2r8(dz20,fscal,fjz0);
1196 /**************************
1197 * CALCULATE INTERACTIONS *
1198 **************************/
1200 if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
1203 r30 = _fjsp_mul_v2r8(rsq30,rinv30);
1205 /* Compute parameters for interactions between i and j atoms */
1206 qq30 = _fjsp_mul_v2r8(iq3,jq0);
1208 /* EWALD ELECTROSTATICS */
1210 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1211 ewrt = _fjsp_mul_v2r8(r30,ewtabscale);
1212 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1213 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1214 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1216 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1217 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
1218 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1219 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1220 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
1221 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1222 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1223 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1224 velec = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
1225 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
1227 d = _fjsp_sub_v2r8(r30,rswitch);
1228 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1229 d2 = _fjsp_mul_v2r8(d,d);
1230 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1232 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1234 /* Evaluate switch function */
1235 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1236 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv30,_fjsp_mul_v2r8(velec,dsw)) );
1237 cutoff_mask = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
1241 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1243 /* Update vectorial force */
1244 fix3 = _fjsp_madd_v2r8(dx30,fscal,fix3);
1245 fiy3 = _fjsp_madd_v2r8(dy30,fscal,fiy3);
1246 fiz3 = _fjsp_madd_v2r8(dz30,fscal,fiz3);
1248 fjx0 = _fjsp_madd_v2r8(dx30,fscal,fjx0);
1249 fjy0 = _fjsp_madd_v2r8(dy30,fscal,fjy0);
1250 fjz0 = _fjsp_madd_v2r8(dz30,fscal,fjz0);
1254 gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
1256 /* Inner loop uses 257 flops */
1259 if(jidx<j_index_end)
1263 j_coord_offsetA = DIM*jnrA;
1265 /* load j atom coordinates */
1266 gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
1269 /* Calculate displacement vector */
1270 dx00 = _fjsp_sub_v2r8(ix0,jx0);
1271 dy00 = _fjsp_sub_v2r8(iy0,jy0);
1272 dz00 = _fjsp_sub_v2r8(iz0,jz0);
1273 dx10 = _fjsp_sub_v2r8(ix1,jx0);
1274 dy10 = _fjsp_sub_v2r8(iy1,jy0);
1275 dz10 = _fjsp_sub_v2r8(iz1,jz0);
1276 dx20 = _fjsp_sub_v2r8(ix2,jx0);
1277 dy20 = _fjsp_sub_v2r8(iy2,jy0);
1278 dz20 = _fjsp_sub_v2r8(iz2,jz0);
1279 dx30 = _fjsp_sub_v2r8(ix3,jx0);
1280 dy30 = _fjsp_sub_v2r8(iy3,jy0);
1281 dz30 = _fjsp_sub_v2r8(iz3,jz0);
1283 /* Calculate squared distance and things based on it */
1284 rsq00 = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
1285 rsq10 = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
1286 rsq20 = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
1287 rsq30 = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
1289 rinv00 = gmx_fjsp_invsqrt_v2r8(rsq00);
1290 rinv10 = gmx_fjsp_invsqrt_v2r8(rsq10);
1291 rinv20 = gmx_fjsp_invsqrt_v2r8(rsq20);
1292 rinv30 = gmx_fjsp_invsqrt_v2r8(rsq30);
1294 rinvsq00 = _fjsp_mul_v2r8(rinv00,rinv00);
1295 rinvsq10 = _fjsp_mul_v2r8(rinv10,rinv10);
1296 rinvsq20 = _fjsp_mul_v2r8(rinv20,rinv20);
1297 rinvsq30 = _fjsp_mul_v2r8(rinv30,rinv30);
1299 /* Load parameters for j particles */
1300 jq0 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
1301 vdwjidx0A = 2*vdwtype[jnrA+0];
1303 fjx0 = _fjsp_setzero_v2r8();
1304 fjy0 = _fjsp_setzero_v2r8();
1305 fjz0 = _fjsp_setzero_v2r8();
1307 /**************************
1308 * CALCULATE INTERACTIONS *
1309 **************************/
1311 if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
1314 r00 = _fjsp_mul_v2r8(rsq00,rinv00);
1316 /* Compute parameters for interactions between i and j atoms */
1317 gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
1318 vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
1320 /* LENNARD-JONES DISPERSION/REPULSION */
1322 rinvsix = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
1323 vvdw6 = _fjsp_mul_v2r8(c6_00,rinvsix);
1324 vvdw12 = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
1325 vvdw = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
1326 fvdw = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
1328 d = _fjsp_sub_v2r8(r00,rswitch);
1329 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1330 d2 = _fjsp_mul_v2r8(d,d);
1331 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1333 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1335 /* Evaluate switch function */
1336 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1337 fvdw = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
1338 cutoff_mask = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
1342 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1344 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1346 /* Update vectorial force */
1347 fix0 = _fjsp_madd_v2r8(dx00,fscal,fix0);
1348 fiy0 = _fjsp_madd_v2r8(dy00,fscal,fiy0);
1349 fiz0 = _fjsp_madd_v2r8(dz00,fscal,fiz0);
1351 fjx0 = _fjsp_madd_v2r8(dx00,fscal,fjx0);
1352 fjy0 = _fjsp_madd_v2r8(dy00,fscal,fjy0);
1353 fjz0 = _fjsp_madd_v2r8(dz00,fscal,fjz0);
1357 /**************************
1358 * CALCULATE INTERACTIONS *
1359 **************************/
1361 if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
1364 r10 = _fjsp_mul_v2r8(rsq10,rinv10);
1366 /* Compute parameters for interactions between i and j atoms */
1367 qq10 = _fjsp_mul_v2r8(iq1,jq0);
1369 /* EWALD ELECTROSTATICS */
1371 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1372 ewrt = _fjsp_mul_v2r8(r10,ewtabscale);
1373 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1374 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1375 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1377 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1378 ewtabD = _fjsp_setzero_v2r8();
1379 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1380 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1381 ewtabFn = _fjsp_setzero_v2r8();
1382 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1383 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1384 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1385 velec = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
1386 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
1388 d = _fjsp_sub_v2r8(r10,rswitch);
1389 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1390 d2 = _fjsp_mul_v2r8(d,d);
1391 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1393 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1395 /* Evaluate switch function */
1396 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1397 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
1398 cutoff_mask = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
1402 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1404 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1406 /* Update vectorial force */
1407 fix1 = _fjsp_madd_v2r8(dx10,fscal,fix1);
1408 fiy1 = _fjsp_madd_v2r8(dy10,fscal,fiy1);
1409 fiz1 = _fjsp_madd_v2r8(dz10,fscal,fiz1);
1411 fjx0 = _fjsp_madd_v2r8(dx10,fscal,fjx0);
1412 fjy0 = _fjsp_madd_v2r8(dy10,fscal,fjy0);
1413 fjz0 = _fjsp_madd_v2r8(dz10,fscal,fjz0);
1417 /**************************
1418 * CALCULATE INTERACTIONS *
1419 **************************/
1421 if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
1424 r20 = _fjsp_mul_v2r8(rsq20,rinv20);
1426 /* Compute parameters for interactions between i and j atoms */
1427 qq20 = _fjsp_mul_v2r8(iq2,jq0);
1429 /* EWALD ELECTROSTATICS */
1431 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1432 ewrt = _fjsp_mul_v2r8(r20,ewtabscale);
1433 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1434 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1435 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1437 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1438 ewtabD = _fjsp_setzero_v2r8();
1439 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1440 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1441 ewtabFn = _fjsp_setzero_v2r8();
1442 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1443 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1444 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1445 velec = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
1446 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
1448 d = _fjsp_sub_v2r8(r20,rswitch);
1449 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1450 d2 = _fjsp_mul_v2r8(d,d);
1451 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1453 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1455 /* Evaluate switch function */
1456 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1457 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
1458 cutoff_mask = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
1462 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1464 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1466 /* Update vectorial force */
1467 fix2 = _fjsp_madd_v2r8(dx20,fscal,fix2);
1468 fiy2 = _fjsp_madd_v2r8(dy20,fscal,fiy2);
1469 fiz2 = _fjsp_madd_v2r8(dz20,fscal,fiz2);
1471 fjx0 = _fjsp_madd_v2r8(dx20,fscal,fjx0);
1472 fjy0 = _fjsp_madd_v2r8(dy20,fscal,fjy0);
1473 fjz0 = _fjsp_madd_v2r8(dz20,fscal,fjz0);
1477 /**************************
1478 * CALCULATE INTERACTIONS *
1479 **************************/
1481 if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
1484 r30 = _fjsp_mul_v2r8(rsq30,rinv30);
1486 /* Compute parameters for interactions between i and j atoms */
1487 qq30 = _fjsp_mul_v2r8(iq3,jq0);
1489 /* EWALD ELECTROSTATICS */
1491 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1492 ewrt = _fjsp_mul_v2r8(r30,ewtabscale);
1493 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1494 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1495 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1497 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1498 ewtabD = _fjsp_setzero_v2r8();
1499 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1500 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1501 ewtabFn = _fjsp_setzero_v2r8();
1502 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1503 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1504 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1505 velec = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
1506 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
1508 d = _fjsp_sub_v2r8(r30,rswitch);
1509 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1510 d2 = _fjsp_mul_v2r8(d,d);
1511 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1513 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1515 /* Evaluate switch function */
1516 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1517 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv30,_fjsp_mul_v2r8(velec,dsw)) );
1518 cutoff_mask = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
1522 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1524 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1526 /* Update vectorial force */
1527 fix3 = _fjsp_madd_v2r8(dx30,fscal,fix3);
1528 fiy3 = _fjsp_madd_v2r8(dy30,fscal,fiy3);
1529 fiz3 = _fjsp_madd_v2r8(dz30,fscal,fiz3);
1531 fjx0 = _fjsp_madd_v2r8(dx30,fscal,fjx0);
1532 fjy0 = _fjsp_madd_v2r8(dy30,fscal,fjy0);
1533 fjz0 = _fjsp_madd_v2r8(dz30,fscal,fjz0);
1537 gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
1539 /* Inner loop uses 257 flops */
1542 /* End of innermost loop */
1544 gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1545 f+i_coord_offset,fshift+i_shift_offset);
1547 /* Increment number of inner iterations */
1548 inneriter += j_index_end - j_index_start;
1550 /* Outer loop uses 24 flops */
1553 /* Increment number of outer iterations */
1556 /* Update outer/inner flops */
1558 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*257);