2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012, by the GROMACS development team, led by
5 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
6 * others, as listed in the AUTHORS file in the top-level source
7 * directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
49 #include "kernelutil_sparc64_hpc_ace_double.h"
52 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sparc64_hpc_ace_double
53 * Electrostatics interaction: Ewald
54 * VdW interaction: LennardJones
55 * Geometry: Water3-Water3
56 * Calculate force/pot: PotentialAndForce
59 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sparc64_hpc_ace_double
60 (t_nblist * gmx_restrict nlist,
61 rvec * gmx_restrict xx,
62 rvec * gmx_restrict ff,
63 t_forcerec * gmx_restrict fr,
64 t_mdatoms * gmx_restrict mdatoms,
65 nb_kernel_data_t * gmx_restrict kernel_data,
66 t_nrnb * gmx_restrict nrnb)
68 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69 * just 0 for non-waters.
70 * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
71 * jnr indices corresponding to data put in the two positions in the SIMD register.
73 int i_shift_offset,i_coord_offset,outeriter,inneriter;
74 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int j_coord_offsetA,j_coord_offsetB;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 _fjsp_v2r8 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
82 _fjsp_v2r8 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
84 _fjsp_v2r8 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
86 _fjsp_v2r8 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
87 int vdwjidx0A,vdwjidx0B;
88 _fjsp_v2r8 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
89 int vdwjidx1A,vdwjidx1B;
90 _fjsp_v2r8 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
91 int vdwjidx2A,vdwjidx2B;
92 _fjsp_v2r8 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
93 _fjsp_v2r8 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
94 _fjsp_v2r8 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
95 _fjsp_v2r8 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
96 _fjsp_v2r8 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
97 _fjsp_v2r8 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
98 _fjsp_v2r8 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
99 _fjsp_v2r8 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
100 _fjsp_v2r8 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
101 _fjsp_v2r8 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
102 _fjsp_v2r8 velec,felec,velecsum,facel,crf,krf,krf2;
105 _fjsp_v2r8 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
108 _fjsp_v2r8 one_sixth = gmx_fjsp_set1_v2r8(1.0/6.0);
109 _fjsp_v2r8 one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
110 _fjsp_v2r8 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
112 _fjsp_v2r8 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
113 real rswitch_scalar,d_scalar;
115 _fjsp_v2r8 dummy_mask,cutoff_mask;
116 _fjsp_v2r8 one = gmx_fjsp_set1_v2r8(1.0);
117 _fjsp_v2r8 two = gmx_fjsp_set1_v2r8(2.0);
118 union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
125 jindex = nlist->jindex;
127 shiftidx = nlist->shift;
129 shiftvec = fr->shift_vec[0];
130 fshift = fr->fshift[0];
131 facel = gmx_fjsp_set1_v2r8(fr->epsfac);
132 charge = mdatoms->chargeA;
133 nvdwtype = fr->ntype;
135 vdwtype = mdatoms->typeA;
137 sh_ewald = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
138 ewtab = fr->ic->tabq_coul_FDV0;
139 ewtabscale = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
140 ewtabhalfspace = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
142 /* Setup water-specific parameters */
143 inr = nlist->iinr[0];
144 iq0 = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
145 iq1 = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
146 iq2 = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
147 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
149 jq0 = gmx_fjsp_set1_v2r8(charge[inr+0]);
150 jq1 = gmx_fjsp_set1_v2r8(charge[inr+1]);
151 jq2 = gmx_fjsp_set1_v2r8(charge[inr+2]);
152 vdwjidx0A = 2*vdwtype[inr+0];
153 qq00 = _fjsp_mul_v2r8(iq0,jq0);
154 c6_00 = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
155 c12_00 = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
156 qq01 = _fjsp_mul_v2r8(iq0,jq1);
157 qq02 = _fjsp_mul_v2r8(iq0,jq2);
158 qq10 = _fjsp_mul_v2r8(iq1,jq0);
159 qq11 = _fjsp_mul_v2r8(iq1,jq1);
160 qq12 = _fjsp_mul_v2r8(iq1,jq2);
161 qq20 = _fjsp_mul_v2r8(iq2,jq0);
162 qq21 = _fjsp_mul_v2r8(iq2,jq1);
163 qq22 = _fjsp_mul_v2r8(iq2,jq2);
165 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
166 rcutoff_scalar = fr->rcoulomb;
167 rcutoff = gmx_fjsp_set1_v2r8(rcutoff_scalar);
168 rcutoff2 = _fjsp_mul_v2r8(rcutoff,rcutoff);
170 rswitch_scalar = fr->rcoulomb_switch;
171 rswitch = gmx_fjsp_set1_v2r8(rswitch_scalar);
172 /* Setup switch parameters */
173 d_scalar = rcutoff_scalar-rswitch_scalar;
174 d = gmx_fjsp_set1_v2r8(d_scalar);
175 swV3 = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
176 swV4 = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
177 swV5 = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
178 swF2 = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
179 swF3 = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
180 swF4 = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
182 /* Avoid stupid compiler warnings */
190 /* Start outer loop over neighborlists */
191 for(iidx=0; iidx<nri; iidx++)
193 /* Load shift vector for this list */
194 i_shift_offset = DIM*shiftidx[iidx];
196 /* Load limits for loop over neighbors */
197 j_index_start = jindex[iidx];
198 j_index_end = jindex[iidx+1];
200 /* Get outer coordinate index */
202 i_coord_offset = DIM*inr;
204 /* Load i particle coords and add shift vector */
205 gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
206 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
208 fix0 = _fjsp_setzero_v2r8();
209 fiy0 = _fjsp_setzero_v2r8();
210 fiz0 = _fjsp_setzero_v2r8();
211 fix1 = _fjsp_setzero_v2r8();
212 fiy1 = _fjsp_setzero_v2r8();
213 fiz1 = _fjsp_setzero_v2r8();
214 fix2 = _fjsp_setzero_v2r8();
215 fiy2 = _fjsp_setzero_v2r8();
216 fiz2 = _fjsp_setzero_v2r8();
218 /* Reset potential sums */
219 velecsum = _fjsp_setzero_v2r8();
220 vvdwsum = _fjsp_setzero_v2r8();
222 /* Start inner kernel loop */
223 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
226 /* Get j neighbor index, and coordinate index */
229 j_coord_offsetA = DIM*jnrA;
230 j_coord_offsetB = DIM*jnrB;
232 /* load j atom coordinates */
233 gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
234 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
236 /* Calculate displacement vector */
237 dx00 = _fjsp_sub_v2r8(ix0,jx0);
238 dy00 = _fjsp_sub_v2r8(iy0,jy0);
239 dz00 = _fjsp_sub_v2r8(iz0,jz0);
240 dx01 = _fjsp_sub_v2r8(ix0,jx1);
241 dy01 = _fjsp_sub_v2r8(iy0,jy1);
242 dz01 = _fjsp_sub_v2r8(iz0,jz1);
243 dx02 = _fjsp_sub_v2r8(ix0,jx2);
244 dy02 = _fjsp_sub_v2r8(iy0,jy2);
245 dz02 = _fjsp_sub_v2r8(iz0,jz2);
246 dx10 = _fjsp_sub_v2r8(ix1,jx0);
247 dy10 = _fjsp_sub_v2r8(iy1,jy0);
248 dz10 = _fjsp_sub_v2r8(iz1,jz0);
249 dx11 = _fjsp_sub_v2r8(ix1,jx1);
250 dy11 = _fjsp_sub_v2r8(iy1,jy1);
251 dz11 = _fjsp_sub_v2r8(iz1,jz1);
252 dx12 = _fjsp_sub_v2r8(ix1,jx2);
253 dy12 = _fjsp_sub_v2r8(iy1,jy2);
254 dz12 = _fjsp_sub_v2r8(iz1,jz2);
255 dx20 = _fjsp_sub_v2r8(ix2,jx0);
256 dy20 = _fjsp_sub_v2r8(iy2,jy0);
257 dz20 = _fjsp_sub_v2r8(iz2,jz0);
258 dx21 = _fjsp_sub_v2r8(ix2,jx1);
259 dy21 = _fjsp_sub_v2r8(iy2,jy1);
260 dz21 = _fjsp_sub_v2r8(iz2,jz1);
261 dx22 = _fjsp_sub_v2r8(ix2,jx2);
262 dy22 = _fjsp_sub_v2r8(iy2,jy2);
263 dz22 = _fjsp_sub_v2r8(iz2,jz2);
265 /* Calculate squared distance and things based on it */
266 rsq00 = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
267 rsq01 = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
268 rsq02 = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
269 rsq10 = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
270 rsq11 = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
271 rsq12 = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
272 rsq20 = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
273 rsq21 = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
274 rsq22 = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
276 rinv00 = gmx_fjsp_invsqrt_v2r8(rsq00);
277 rinv01 = gmx_fjsp_invsqrt_v2r8(rsq01);
278 rinv02 = gmx_fjsp_invsqrt_v2r8(rsq02);
279 rinv10 = gmx_fjsp_invsqrt_v2r8(rsq10);
280 rinv11 = gmx_fjsp_invsqrt_v2r8(rsq11);
281 rinv12 = gmx_fjsp_invsqrt_v2r8(rsq12);
282 rinv20 = gmx_fjsp_invsqrt_v2r8(rsq20);
283 rinv21 = gmx_fjsp_invsqrt_v2r8(rsq21);
284 rinv22 = gmx_fjsp_invsqrt_v2r8(rsq22);
286 rinvsq00 = _fjsp_mul_v2r8(rinv00,rinv00);
287 rinvsq01 = _fjsp_mul_v2r8(rinv01,rinv01);
288 rinvsq02 = _fjsp_mul_v2r8(rinv02,rinv02);
289 rinvsq10 = _fjsp_mul_v2r8(rinv10,rinv10);
290 rinvsq11 = _fjsp_mul_v2r8(rinv11,rinv11);
291 rinvsq12 = _fjsp_mul_v2r8(rinv12,rinv12);
292 rinvsq20 = _fjsp_mul_v2r8(rinv20,rinv20);
293 rinvsq21 = _fjsp_mul_v2r8(rinv21,rinv21);
294 rinvsq22 = _fjsp_mul_v2r8(rinv22,rinv22);
296 fjx0 = _fjsp_setzero_v2r8();
297 fjy0 = _fjsp_setzero_v2r8();
298 fjz0 = _fjsp_setzero_v2r8();
299 fjx1 = _fjsp_setzero_v2r8();
300 fjy1 = _fjsp_setzero_v2r8();
301 fjz1 = _fjsp_setzero_v2r8();
302 fjx2 = _fjsp_setzero_v2r8();
303 fjy2 = _fjsp_setzero_v2r8();
304 fjz2 = _fjsp_setzero_v2r8();
306 /**************************
307 * CALCULATE INTERACTIONS *
308 **************************/
310 if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
313 r00 = _fjsp_mul_v2r8(rsq00,rinv00);
315 /* EWALD ELECTROSTATICS */
317 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
318 ewrt = _fjsp_mul_v2r8(r00,ewtabscale);
319 itab_tmp = _fjsp_dtox_v2r8(ewrt);
320 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
321 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
323 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
324 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
325 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
326 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
327 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
328 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
329 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
330 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
331 velec = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
332 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
334 /* LENNARD-JONES DISPERSION/REPULSION */
336 rinvsix = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
337 vvdw6 = _fjsp_mul_v2r8(c6_00,rinvsix);
338 vvdw12 = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
339 vvdw = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
340 fvdw = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
342 d = _fjsp_sub_v2r8(r00,rswitch);
343 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
344 d2 = _fjsp_mul_v2r8(d,d);
345 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
347 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
349 /* Evaluate switch function */
350 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
351 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
352 fvdw = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
353 velec = _fjsp_mul_v2r8(velec,sw);
354 vvdw = _fjsp_mul_v2r8(vvdw,sw);
355 cutoff_mask = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
357 /* Update potential sum for this i atom from the interaction with this j atom. */
358 velec = _fjsp_and_v2r8(velec,cutoff_mask);
359 velecsum = _fjsp_add_v2r8(velecsum,velec);
360 vvdw = _fjsp_and_v2r8(vvdw,cutoff_mask);
361 vvdwsum = _fjsp_add_v2r8(vvdwsum,vvdw);
363 fscal = _fjsp_add_v2r8(felec,fvdw);
365 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
367 /* Update vectorial force */
368 fix0 = _fjsp_madd_v2r8(dx00,fscal,fix0);
369 fiy0 = _fjsp_madd_v2r8(dy00,fscal,fiy0);
370 fiz0 = _fjsp_madd_v2r8(dz00,fscal,fiz0);
372 fjx0 = _fjsp_madd_v2r8(dx00,fscal,fjx0);
373 fjy0 = _fjsp_madd_v2r8(dy00,fscal,fjy0);
374 fjz0 = _fjsp_madd_v2r8(dz00,fscal,fjz0);
378 /**************************
379 * CALCULATE INTERACTIONS *
380 **************************/
382 if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
385 r01 = _fjsp_mul_v2r8(rsq01,rinv01);
387 /* EWALD ELECTROSTATICS */
389 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
390 ewrt = _fjsp_mul_v2r8(r01,ewtabscale);
391 itab_tmp = _fjsp_dtox_v2r8(ewrt);
392 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
393 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
395 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
396 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
397 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
398 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
399 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
400 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
401 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
402 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
403 velec = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
404 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
406 d = _fjsp_sub_v2r8(r01,rswitch);
407 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
408 d2 = _fjsp_mul_v2r8(d,d);
409 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
411 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
413 /* Evaluate switch function */
414 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
415 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv01,_fjsp_mul_v2r8(velec,dsw)) );
416 velec = _fjsp_mul_v2r8(velec,sw);
417 cutoff_mask = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
419 /* Update potential sum for this i atom from the interaction with this j atom. */
420 velec = _fjsp_and_v2r8(velec,cutoff_mask);
421 velecsum = _fjsp_add_v2r8(velecsum,velec);
425 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
427 /* Update vectorial force */
428 fix0 = _fjsp_madd_v2r8(dx01,fscal,fix0);
429 fiy0 = _fjsp_madd_v2r8(dy01,fscal,fiy0);
430 fiz0 = _fjsp_madd_v2r8(dz01,fscal,fiz0);
432 fjx1 = _fjsp_madd_v2r8(dx01,fscal,fjx1);
433 fjy1 = _fjsp_madd_v2r8(dy01,fscal,fjy1);
434 fjz1 = _fjsp_madd_v2r8(dz01,fscal,fjz1);
438 /**************************
439 * CALCULATE INTERACTIONS *
440 **************************/
442 if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
445 r02 = _fjsp_mul_v2r8(rsq02,rinv02);
447 /* EWALD ELECTROSTATICS */
449 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
450 ewrt = _fjsp_mul_v2r8(r02,ewtabscale);
451 itab_tmp = _fjsp_dtox_v2r8(ewrt);
452 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
453 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
455 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
456 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
457 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
458 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
459 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
460 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
461 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
462 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
463 velec = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
464 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
466 d = _fjsp_sub_v2r8(r02,rswitch);
467 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
468 d2 = _fjsp_mul_v2r8(d,d);
469 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
471 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
473 /* Evaluate switch function */
474 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
475 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv02,_fjsp_mul_v2r8(velec,dsw)) );
476 velec = _fjsp_mul_v2r8(velec,sw);
477 cutoff_mask = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
479 /* Update potential sum for this i atom from the interaction with this j atom. */
480 velec = _fjsp_and_v2r8(velec,cutoff_mask);
481 velecsum = _fjsp_add_v2r8(velecsum,velec);
485 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
487 /* Update vectorial force */
488 fix0 = _fjsp_madd_v2r8(dx02,fscal,fix0);
489 fiy0 = _fjsp_madd_v2r8(dy02,fscal,fiy0);
490 fiz0 = _fjsp_madd_v2r8(dz02,fscal,fiz0);
492 fjx2 = _fjsp_madd_v2r8(dx02,fscal,fjx2);
493 fjy2 = _fjsp_madd_v2r8(dy02,fscal,fjy2);
494 fjz2 = _fjsp_madd_v2r8(dz02,fscal,fjz2);
498 /**************************
499 * CALCULATE INTERACTIONS *
500 **************************/
502 if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
505 r10 = _fjsp_mul_v2r8(rsq10,rinv10);
507 /* EWALD ELECTROSTATICS */
509 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
510 ewrt = _fjsp_mul_v2r8(r10,ewtabscale);
511 itab_tmp = _fjsp_dtox_v2r8(ewrt);
512 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
513 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
515 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
516 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
517 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
518 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
519 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
520 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
521 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
522 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
523 velec = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
524 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
526 d = _fjsp_sub_v2r8(r10,rswitch);
527 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
528 d2 = _fjsp_mul_v2r8(d,d);
529 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
531 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
533 /* Evaluate switch function */
534 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
535 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
536 velec = _fjsp_mul_v2r8(velec,sw);
537 cutoff_mask = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
539 /* Update potential sum for this i atom from the interaction with this j atom. */
540 velec = _fjsp_and_v2r8(velec,cutoff_mask);
541 velecsum = _fjsp_add_v2r8(velecsum,velec);
545 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
547 /* Update vectorial force */
548 fix1 = _fjsp_madd_v2r8(dx10,fscal,fix1);
549 fiy1 = _fjsp_madd_v2r8(dy10,fscal,fiy1);
550 fiz1 = _fjsp_madd_v2r8(dz10,fscal,fiz1);
552 fjx0 = _fjsp_madd_v2r8(dx10,fscal,fjx0);
553 fjy0 = _fjsp_madd_v2r8(dy10,fscal,fjy0);
554 fjz0 = _fjsp_madd_v2r8(dz10,fscal,fjz0);
558 /**************************
559 * CALCULATE INTERACTIONS *
560 **************************/
562 if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
565 r11 = _fjsp_mul_v2r8(rsq11,rinv11);
567 /* EWALD ELECTROSTATICS */
569 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
570 ewrt = _fjsp_mul_v2r8(r11,ewtabscale);
571 itab_tmp = _fjsp_dtox_v2r8(ewrt);
572 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
573 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
575 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
576 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
577 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
578 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
579 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
580 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
581 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
582 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
583 velec = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
584 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
586 d = _fjsp_sub_v2r8(r11,rswitch);
587 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
588 d2 = _fjsp_mul_v2r8(d,d);
589 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
591 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
593 /* Evaluate switch function */
594 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
595 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
596 velec = _fjsp_mul_v2r8(velec,sw);
597 cutoff_mask = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
599 /* Update potential sum for this i atom from the interaction with this j atom. */
600 velec = _fjsp_and_v2r8(velec,cutoff_mask);
601 velecsum = _fjsp_add_v2r8(velecsum,velec);
605 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
607 /* Update vectorial force */
608 fix1 = _fjsp_madd_v2r8(dx11,fscal,fix1);
609 fiy1 = _fjsp_madd_v2r8(dy11,fscal,fiy1);
610 fiz1 = _fjsp_madd_v2r8(dz11,fscal,fiz1);
612 fjx1 = _fjsp_madd_v2r8(dx11,fscal,fjx1);
613 fjy1 = _fjsp_madd_v2r8(dy11,fscal,fjy1);
614 fjz1 = _fjsp_madd_v2r8(dz11,fscal,fjz1);
618 /**************************
619 * CALCULATE INTERACTIONS *
620 **************************/
622 if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
625 r12 = _fjsp_mul_v2r8(rsq12,rinv12);
627 /* EWALD ELECTROSTATICS */
629 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
630 ewrt = _fjsp_mul_v2r8(r12,ewtabscale);
631 itab_tmp = _fjsp_dtox_v2r8(ewrt);
632 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
633 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
635 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
636 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
637 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
638 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
639 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
640 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
641 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
642 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
643 velec = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
644 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
646 d = _fjsp_sub_v2r8(r12,rswitch);
647 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
648 d2 = _fjsp_mul_v2r8(d,d);
649 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
651 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
653 /* Evaluate switch function */
654 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
655 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
656 velec = _fjsp_mul_v2r8(velec,sw);
657 cutoff_mask = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
659 /* Update potential sum for this i atom from the interaction with this j atom. */
660 velec = _fjsp_and_v2r8(velec,cutoff_mask);
661 velecsum = _fjsp_add_v2r8(velecsum,velec);
665 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
667 /* Update vectorial force */
668 fix1 = _fjsp_madd_v2r8(dx12,fscal,fix1);
669 fiy1 = _fjsp_madd_v2r8(dy12,fscal,fiy1);
670 fiz1 = _fjsp_madd_v2r8(dz12,fscal,fiz1);
672 fjx2 = _fjsp_madd_v2r8(dx12,fscal,fjx2);
673 fjy2 = _fjsp_madd_v2r8(dy12,fscal,fjy2);
674 fjz2 = _fjsp_madd_v2r8(dz12,fscal,fjz2);
678 /**************************
679 * CALCULATE INTERACTIONS *
680 **************************/
682 if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
685 r20 = _fjsp_mul_v2r8(rsq20,rinv20);
687 /* EWALD ELECTROSTATICS */
689 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
690 ewrt = _fjsp_mul_v2r8(r20,ewtabscale);
691 itab_tmp = _fjsp_dtox_v2r8(ewrt);
692 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
693 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
695 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
696 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
697 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
698 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
699 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
700 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
701 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
702 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
703 velec = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
704 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
706 d = _fjsp_sub_v2r8(r20,rswitch);
707 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
708 d2 = _fjsp_mul_v2r8(d,d);
709 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
711 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
713 /* Evaluate switch function */
714 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
715 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
716 velec = _fjsp_mul_v2r8(velec,sw);
717 cutoff_mask = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
719 /* Update potential sum for this i atom from the interaction with this j atom. */
720 velec = _fjsp_and_v2r8(velec,cutoff_mask);
721 velecsum = _fjsp_add_v2r8(velecsum,velec);
725 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
727 /* Update vectorial force */
728 fix2 = _fjsp_madd_v2r8(dx20,fscal,fix2);
729 fiy2 = _fjsp_madd_v2r8(dy20,fscal,fiy2);
730 fiz2 = _fjsp_madd_v2r8(dz20,fscal,fiz2);
732 fjx0 = _fjsp_madd_v2r8(dx20,fscal,fjx0);
733 fjy0 = _fjsp_madd_v2r8(dy20,fscal,fjy0);
734 fjz0 = _fjsp_madd_v2r8(dz20,fscal,fjz0);
738 /**************************
739 * CALCULATE INTERACTIONS *
740 **************************/
742 if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
745 r21 = _fjsp_mul_v2r8(rsq21,rinv21);
747 /* EWALD ELECTROSTATICS */
749 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
750 ewrt = _fjsp_mul_v2r8(r21,ewtabscale);
751 itab_tmp = _fjsp_dtox_v2r8(ewrt);
752 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
753 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
755 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
756 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
757 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
758 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
759 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
760 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
761 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
762 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
763 velec = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
764 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
766 d = _fjsp_sub_v2r8(r21,rswitch);
767 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
768 d2 = _fjsp_mul_v2r8(d,d);
769 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
771 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
773 /* Evaluate switch function */
774 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
775 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
776 velec = _fjsp_mul_v2r8(velec,sw);
777 cutoff_mask = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
779 /* Update potential sum for this i atom from the interaction with this j atom. */
780 velec = _fjsp_and_v2r8(velec,cutoff_mask);
781 velecsum = _fjsp_add_v2r8(velecsum,velec);
785 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
787 /* Update vectorial force */
788 fix2 = _fjsp_madd_v2r8(dx21,fscal,fix2);
789 fiy2 = _fjsp_madd_v2r8(dy21,fscal,fiy2);
790 fiz2 = _fjsp_madd_v2r8(dz21,fscal,fiz2);
792 fjx1 = _fjsp_madd_v2r8(dx21,fscal,fjx1);
793 fjy1 = _fjsp_madd_v2r8(dy21,fscal,fjy1);
794 fjz1 = _fjsp_madd_v2r8(dz21,fscal,fjz1);
798 /**************************
799 * CALCULATE INTERACTIONS *
800 **************************/
802 if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
805 r22 = _fjsp_mul_v2r8(rsq22,rinv22);
807 /* EWALD ELECTROSTATICS */
809 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
810 ewrt = _fjsp_mul_v2r8(r22,ewtabscale);
811 itab_tmp = _fjsp_dtox_v2r8(ewrt);
812 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
813 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
815 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
816 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
817 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
818 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
819 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
820 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
821 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
822 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
823 velec = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
824 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
826 d = _fjsp_sub_v2r8(r22,rswitch);
827 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
828 d2 = _fjsp_mul_v2r8(d,d);
829 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
831 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
833 /* Evaluate switch function */
834 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
835 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
836 velec = _fjsp_mul_v2r8(velec,sw);
837 cutoff_mask = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
839 /* Update potential sum for this i atom from the interaction with this j atom. */
840 velec = _fjsp_and_v2r8(velec,cutoff_mask);
841 velecsum = _fjsp_add_v2r8(velecsum,velec);
845 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
847 /* Update vectorial force */
848 fix2 = _fjsp_madd_v2r8(dx22,fscal,fix2);
849 fiy2 = _fjsp_madd_v2r8(dy22,fscal,fiy2);
850 fiz2 = _fjsp_madd_v2r8(dz22,fscal,fiz2);
852 fjx2 = _fjsp_madd_v2r8(dx22,fscal,fjx2);
853 fjy2 = _fjsp_madd_v2r8(dy22,fscal,fjy2);
854 fjz2 = _fjsp_madd_v2r8(dz22,fscal,fjz2);
858 gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
860 /* Inner loop uses 630 flops */
867 j_coord_offsetA = DIM*jnrA;
869 /* load j atom coordinates */
870 gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
871 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
873 /* Calculate displacement vector */
874 dx00 = _fjsp_sub_v2r8(ix0,jx0);
875 dy00 = _fjsp_sub_v2r8(iy0,jy0);
876 dz00 = _fjsp_sub_v2r8(iz0,jz0);
877 dx01 = _fjsp_sub_v2r8(ix0,jx1);
878 dy01 = _fjsp_sub_v2r8(iy0,jy1);
879 dz01 = _fjsp_sub_v2r8(iz0,jz1);
880 dx02 = _fjsp_sub_v2r8(ix0,jx2);
881 dy02 = _fjsp_sub_v2r8(iy0,jy2);
882 dz02 = _fjsp_sub_v2r8(iz0,jz2);
883 dx10 = _fjsp_sub_v2r8(ix1,jx0);
884 dy10 = _fjsp_sub_v2r8(iy1,jy0);
885 dz10 = _fjsp_sub_v2r8(iz1,jz0);
886 dx11 = _fjsp_sub_v2r8(ix1,jx1);
887 dy11 = _fjsp_sub_v2r8(iy1,jy1);
888 dz11 = _fjsp_sub_v2r8(iz1,jz1);
889 dx12 = _fjsp_sub_v2r8(ix1,jx2);
890 dy12 = _fjsp_sub_v2r8(iy1,jy2);
891 dz12 = _fjsp_sub_v2r8(iz1,jz2);
892 dx20 = _fjsp_sub_v2r8(ix2,jx0);
893 dy20 = _fjsp_sub_v2r8(iy2,jy0);
894 dz20 = _fjsp_sub_v2r8(iz2,jz0);
895 dx21 = _fjsp_sub_v2r8(ix2,jx1);
896 dy21 = _fjsp_sub_v2r8(iy2,jy1);
897 dz21 = _fjsp_sub_v2r8(iz2,jz1);
898 dx22 = _fjsp_sub_v2r8(ix2,jx2);
899 dy22 = _fjsp_sub_v2r8(iy2,jy2);
900 dz22 = _fjsp_sub_v2r8(iz2,jz2);
902 /* Calculate squared distance and things based on it */
903 rsq00 = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
904 rsq01 = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
905 rsq02 = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
906 rsq10 = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
907 rsq11 = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
908 rsq12 = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
909 rsq20 = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
910 rsq21 = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
911 rsq22 = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
913 rinv00 = gmx_fjsp_invsqrt_v2r8(rsq00);
914 rinv01 = gmx_fjsp_invsqrt_v2r8(rsq01);
915 rinv02 = gmx_fjsp_invsqrt_v2r8(rsq02);
916 rinv10 = gmx_fjsp_invsqrt_v2r8(rsq10);
917 rinv11 = gmx_fjsp_invsqrt_v2r8(rsq11);
918 rinv12 = gmx_fjsp_invsqrt_v2r8(rsq12);
919 rinv20 = gmx_fjsp_invsqrt_v2r8(rsq20);
920 rinv21 = gmx_fjsp_invsqrt_v2r8(rsq21);
921 rinv22 = gmx_fjsp_invsqrt_v2r8(rsq22);
923 rinvsq00 = _fjsp_mul_v2r8(rinv00,rinv00);
924 rinvsq01 = _fjsp_mul_v2r8(rinv01,rinv01);
925 rinvsq02 = _fjsp_mul_v2r8(rinv02,rinv02);
926 rinvsq10 = _fjsp_mul_v2r8(rinv10,rinv10);
927 rinvsq11 = _fjsp_mul_v2r8(rinv11,rinv11);
928 rinvsq12 = _fjsp_mul_v2r8(rinv12,rinv12);
929 rinvsq20 = _fjsp_mul_v2r8(rinv20,rinv20);
930 rinvsq21 = _fjsp_mul_v2r8(rinv21,rinv21);
931 rinvsq22 = _fjsp_mul_v2r8(rinv22,rinv22);
933 fjx0 = _fjsp_setzero_v2r8();
934 fjy0 = _fjsp_setzero_v2r8();
935 fjz0 = _fjsp_setzero_v2r8();
936 fjx1 = _fjsp_setzero_v2r8();
937 fjy1 = _fjsp_setzero_v2r8();
938 fjz1 = _fjsp_setzero_v2r8();
939 fjx2 = _fjsp_setzero_v2r8();
940 fjy2 = _fjsp_setzero_v2r8();
941 fjz2 = _fjsp_setzero_v2r8();
943 /**************************
944 * CALCULATE INTERACTIONS *
945 **************************/
947 if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
950 r00 = _fjsp_mul_v2r8(rsq00,rinv00);
952 /* EWALD ELECTROSTATICS */
954 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
955 ewrt = _fjsp_mul_v2r8(r00,ewtabscale);
956 itab_tmp = _fjsp_dtox_v2r8(ewrt);
957 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
958 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
960 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
961 ewtabD = _fjsp_setzero_v2r8();
962 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
963 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
964 ewtabFn = _fjsp_setzero_v2r8();
965 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
966 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
967 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
968 velec = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
969 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
971 /* LENNARD-JONES DISPERSION/REPULSION */
973 rinvsix = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
974 vvdw6 = _fjsp_mul_v2r8(c6_00,rinvsix);
975 vvdw12 = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
976 vvdw = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
977 fvdw = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
979 d = _fjsp_sub_v2r8(r00,rswitch);
980 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
981 d2 = _fjsp_mul_v2r8(d,d);
982 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
984 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
986 /* Evaluate switch function */
987 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
988 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
989 fvdw = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
990 velec = _fjsp_mul_v2r8(velec,sw);
991 vvdw = _fjsp_mul_v2r8(vvdw,sw);
992 cutoff_mask = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
994 /* Update potential sum for this i atom from the interaction with this j atom. */
995 velec = _fjsp_and_v2r8(velec,cutoff_mask);
996 velec = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
997 velecsum = _fjsp_add_v2r8(velecsum,velec);
998 vvdw = _fjsp_and_v2r8(vvdw,cutoff_mask);
999 vvdw = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
1000 vvdwsum = _fjsp_add_v2r8(vvdwsum,vvdw);
1002 fscal = _fjsp_add_v2r8(felec,fvdw);
1004 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1006 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1008 /* Update vectorial force */
1009 fix0 = _fjsp_madd_v2r8(dx00,fscal,fix0);
1010 fiy0 = _fjsp_madd_v2r8(dy00,fscal,fiy0);
1011 fiz0 = _fjsp_madd_v2r8(dz00,fscal,fiz0);
1013 fjx0 = _fjsp_madd_v2r8(dx00,fscal,fjx0);
1014 fjy0 = _fjsp_madd_v2r8(dy00,fscal,fjy0);
1015 fjz0 = _fjsp_madd_v2r8(dz00,fscal,fjz0);
1019 /**************************
1020 * CALCULATE INTERACTIONS *
1021 **************************/
1023 if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
1026 r01 = _fjsp_mul_v2r8(rsq01,rinv01);
1028 /* EWALD ELECTROSTATICS */
1030 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1031 ewrt = _fjsp_mul_v2r8(r01,ewtabscale);
1032 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1033 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1034 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1036 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1037 ewtabD = _fjsp_setzero_v2r8();
1038 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1039 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1040 ewtabFn = _fjsp_setzero_v2r8();
1041 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1042 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1043 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1044 velec = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
1045 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
1047 d = _fjsp_sub_v2r8(r01,rswitch);
1048 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1049 d2 = _fjsp_mul_v2r8(d,d);
1050 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1052 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1054 /* Evaluate switch function */
1055 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1056 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv01,_fjsp_mul_v2r8(velec,dsw)) );
1057 velec = _fjsp_mul_v2r8(velec,sw);
1058 cutoff_mask = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
1060 /* Update potential sum for this i atom from the interaction with this j atom. */
1061 velec = _fjsp_and_v2r8(velec,cutoff_mask);
1062 velec = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1063 velecsum = _fjsp_add_v2r8(velecsum,velec);
1067 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1069 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1071 /* Update vectorial force */
1072 fix0 = _fjsp_madd_v2r8(dx01,fscal,fix0);
1073 fiy0 = _fjsp_madd_v2r8(dy01,fscal,fiy0);
1074 fiz0 = _fjsp_madd_v2r8(dz01,fscal,fiz0);
1076 fjx1 = _fjsp_madd_v2r8(dx01,fscal,fjx1);
1077 fjy1 = _fjsp_madd_v2r8(dy01,fscal,fjy1);
1078 fjz1 = _fjsp_madd_v2r8(dz01,fscal,fjz1);
1082 /**************************
1083 * CALCULATE INTERACTIONS *
1084 **************************/
1086 if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
1089 r02 = _fjsp_mul_v2r8(rsq02,rinv02);
1091 /* EWALD ELECTROSTATICS */
1093 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1094 ewrt = _fjsp_mul_v2r8(r02,ewtabscale);
1095 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1096 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1097 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1099 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1100 ewtabD = _fjsp_setzero_v2r8();
1101 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1102 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1103 ewtabFn = _fjsp_setzero_v2r8();
1104 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1105 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1106 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1107 velec = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
1108 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
1110 d = _fjsp_sub_v2r8(r02,rswitch);
1111 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1112 d2 = _fjsp_mul_v2r8(d,d);
1113 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1115 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1117 /* Evaluate switch function */
1118 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1119 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv02,_fjsp_mul_v2r8(velec,dsw)) );
1120 velec = _fjsp_mul_v2r8(velec,sw);
1121 cutoff_mask = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
1123 /* Update potential sum for this i atom from the interaction with this j atom. */
1124 velec = _fjsp_and_v2r8(velec,cutoff_mask);
1125 velec = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1126 velecsum = _fjsp_add_v2r8(velecsum,velec);
1130 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1132 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1134 /* Update vectorial force */
1135 fix0 = _fjsp_madd_v2r8(dx02,fscal,fix0);
1136 fiy0 = _fjsp_madd_v2r8(dy02,fscal,fiy0);
1137 fiz0 = _fjsp_madd_v2r8(dz02,fscal,fiz0);
1139 fjx2 = _fjsp_madd_v2r8(dx02,fscal,fjx2);
1140 fjy2 = _fjsp_madd_v2r8(dy02,fscal,fjy2);
1141 fjz2 = _fjsp_madd_v2r8(dz02,fscal,fjz2);
1145 /**************************
1146 * CALCULATE INTERACTIONS *
1147 **************************/
1149 if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
1152 r10 = _fjsp_mul_v2r8(rsq10,rinv10);
1154 /* EWALD ELECTROSTATICS */
1156 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1157 ewrt = _fjsp_mul_v2r8(r10,ewtabscale);
1158 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1159 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1160 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1162 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1163 ewtabD = _fjsp_setzero_v2r8();
1164 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1165 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1166 ewtabFn = _fjsp_setzero_v2r8();
1167 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1168 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1169 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1170 velec = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
1171 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
1173 d = _fjsp_sub_v2r8(r10,rswitch);
1174 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1175 d2 = _fjsp_mul_v2r8(d,d);
1176 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1178 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1180 /* Evaluate switch function */
1181 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1182 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
1183 velec = _fjsp_mul_v2r8(velec,sw);
1184 cutoff_mask = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
1186 /* Update potential sum for this i atom from the interaction with this j atom. */
1187 velec = _fjsp_and_v2r8(velec,cutoff_mask);
1188 velec = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1189 velecsum = _fjsp_add_v2r8(velecsum,velec);
1193 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1195 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1197 /* Update vectorial force */
1198 fix1 = _fjsp_madd_v2r8(dx10,fscal,fix1);
1199 fiy1 = _fjsp_madd_v2r8(dy10,fscal,fiy1);
1200 fiz1 = _fjsp_madd_v2r8(dz10,fscal,fiz1);
1202 fjx0 = _fjsp_madd_v2r8(dx10,fscal,fjx0);
1203 fjy0 = _fjsp_madd_v2r8(dy10,fscal,fjy0);
1204 fjz0 = _fjsp_madd_v2r8(dz10,fscal,fjz0);
1208 /**************************
1209 * CALCULATE INTERACTIONS *
1210 **************************/
1212 if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
1215 r11 = _fjsp_mul_v2r8(rsq11,rinv11);
1217 /* EWALD ELECTROSTATICS */
1219 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1220 ewrt = _fjsp_mul_v2r8(r11,ewtabscale);
1221 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1222 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1223 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1225 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1226 ewtabD = _fjsp_setzero_v2r8();
1227 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1228 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1229 ewtabFn = _fjsp_setzero_v2r8();
1230 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1231 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1232 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1233 velec = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
1234 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
1236 d = _fjsp_sub_v2r8(r11,rswitch);
1237 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1238 d2 = _fjsp_mul_v2r8(d,d);
1239 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1241 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1243 /* Evaluate switch function */
1244 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1245 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
1246 velec = _fjsp_mul_v2r8(velec,sw);
1247 cutoff_mask = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
1249 /* Update potential sum for this i atom from the interaction with this j atom. */
1250 velec = _fjsp_and_v2r8(velec,cutoff_mask);
1251 velec = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1252 velecsum = _fjsp_add_v2r8(velecsum,velec);
1256 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1258 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1260 /* Update vectorial force */
1261 fix1 = _fjsp_madd_v2r8(dx11,fscal,fix1);
1262 fiy1 = _fjsp_madd_v2r8(dy11,fscal,fiy1);
1263 fiz1 = _fjsp_madd_v2r8(dz11,fscal,fiz1);
1265 fjx1 = _fjsp_madd_v2r8(dx11,fscal,fjx1);
1266 fjy1 = _fjsp_madd_v2r8(dy11,fscal,fjy1);
1267 fjz1 = _fjsp_madd_v2r8(dz11,fscal,fjz1);
1271 /**************************
1272 * CALCULATE INTERACTIONS *
1273 **************************/
1275 if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
1278 r12 = _fjsp_mul_v2r8(rsq12,rinv12);
1280 /* EWALD ELECTROSTATICS */
1282 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1283 ewrt = _fjsp_mul_v2r8(r12,ewtabscale);
1284 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1285 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1286 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1288 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1289 ewtabD = _fjsp_setzero_v2r8();
1290 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1291 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1292 ewtabFn = _fjsp_setzero_v2r8();
1293 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1294 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1295 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1296 velec = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
1297 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
1299 d = _fjsp_sub_v2r8(r12,rswitch);
1300 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1301 d2 = _fjsp_mul_v2r8(d,d);
1302 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1304 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1306 /* Evaluate switch function */
1307 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1308 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
1309 velec = _fjsp_mul_v2r8(velec,sw);
1310 cutoff_mask = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
1312 /* Update potential sum for this i atom from the interaction with this j atom. */
1313 velec = _fjsp_and_v2r8(velec,cutoff_mask);
1314 velec = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1315 velecsum = _fjsp_add_v2r8(velecsum,velec);
1319 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1321 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1323 /* Update vectorial force */
1324 fix1 = _fjsp_madd_v2r8(dx12,fscal,fix1);
1325 fiy1 = _fjsp_madd_v2r8(dy12,fscal,fiy1);
1326 fiz1 = _fjsp_madd_v2r8(dz12,fscal,fiz1);
1328 fjx2 = _fjsp_madd_v2r8(dx12,fscal,fjx2);
1329 fjy2 = _fjsp_madd_v2r8(dy12,fscal,fjy2);
1330 fjz2 = _fjsp_madd_v2r8(dz12,fscal,fjz2);
1334 /**************************
1335 * CALCULATE INTERACTIONS *
1336 **************************/
1338 if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
1341 r20 = _fjsp_mul_v2r8(rsq20,rinv20);
1343 /* EWALD ELECTROSTATICS */
1345 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1346 ewrt = _fjsp_mul_v2r8(r20,ewtabscale);
1347 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1348 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1349 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1351 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1352 ewtabD = _fjsp_setzero_v2r8();
1353 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1354 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1355 ewtabFn = _fjsp_setzero_v2r8();
1356 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1357 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1358 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1359 velec = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
1360 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
1362 d = _fjsp_sub_v2r8(r20,rswitch);
1363 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1364 d2 = _fjsp_mul_v2r8(d,d);
1365 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1367 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1369 /* Evaluate switch function */
1370 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1371 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
1372 velec = _fjsp_mul_v2r8(velec,sw);
1373 cutoff_mask = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
1375 /* Update potential sum for this i atom from the interaction with this j atom. */
1376 velec = _fjsp_and_v2r8(velec,cutoff_mask);
1377 velec = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1378 velecsum = _fjsp_add_v2r8(velecsum,velec);
1382 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1384 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1386 /* Update vectorial force */
1387 fix2 = _fjsp_madd_v2r8(dx20,fscal,fix2);
1388 fiy2 = _fjsp_madd_v2r8(dy20,fscal,fiy2);
1389 fiz2 = _fjsp_madd_v2r8(dz20,fscal,fiz2);
1391 fjx0 = _fjsp_madd_v2r8(dx20,fscal,fjx0);
1392 fjy0 = _fjsp_madd_v2r8(dy20,fscal,fjy0);
1393 fjz0 = _fjsp_madd_v2r8(dz20,fscal,fjz0);
1397 /**************************
1398 * CALCULATE INTERACTIONS *
1399 **************************/
1401 if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
1404 r21 = _fjsp_mul_v2r8(rsq21,rinv21);
1406 /* EWALD ELECTROSTATICS */
1408 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1409 ewrt = _fjsp_mul_v2r8(r21,ewtabscale);
1410 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1411 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1412 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1414 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1415 ewtabD = _fjsp_setzero_v2r8();
1416 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1417 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1418 ewtabFn = _fjsp_setzero_v2r8();
1419 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1420 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1421 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1422 velec = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
1423 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
1425 d = _fjsp_sub_v2r8(r21,rswitch);
1426 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1427 d2 = _fjsp_mul_v2r8(d,d);
1428 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1430 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1432 /* Evaluate switch function */
1433 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1434 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
1435 velec = _fjsp_mul_v2r8(velec,sw);
1436 cutoff_mask = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
1438 /* Update potential sum for this i atom from the interaction with this j atom. */
1439 velec = _fjsp_and_v2r8(velec,cutoff_mask);
1440 velec = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1441 velecsum = _fjsp_add_v2r8(velecsum,velec);
1445 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1447 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1449 /* Update vectorial force */
1450 fix2 = _fjsp_madd_v2r8(dx21,fscal,fix2);
1451 fiy2 = _fjsp_madd_v2r8(dy21,fscal,fiy2);
1452 fiz2 = _fjsp_madd_v2r8(dz21,fscal,fiz2);
1454 fjx1 = _fjsp_madd_v2r8(dx21,fscal,fjx1);
1455 fjy1 = _fjsp_madd_v2r8(dy21,fscal,fjy1);
1456 fjz1 = _fjsp_madd_v2r8(dz21,fscal,fjz1);
1460 /**************************
1461 * CALCULATE INTERACTIONS *
1462 **************************/
1464 if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
1467 r22 = _fjsp_mul_v2r8(rsq22,rinv22);
1469 /* EWALD ELECTROSTATICS */
1471 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1472 ewrt = _fjsp_mul_v2r8(r22,ewtabscale);
1473 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1474 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1475 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1477 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1478 ewtabD = _fjsp_setzero_v2r8();
1479 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1480 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1481 ewtabFn = _fjsp_setzero_v2r8();
1482 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1483 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1484 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1485 velec = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
1486 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
1488 d = _fjsp_sub_v2r8(r22,rswitch);
1489 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1490 d2 = _fjsp_mul_v2r8(d,d);
1491 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1493 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1495 /* Evaluate switch function */
1496 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1497 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
1498 velec = _fjsp_mul_v2r8(velec,sw);
1499 cutoff_mask = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
1501 /* Update potential sum for this i atom from the interaction with this j atom. */
1502 velec = _fjsp_and_v2r8(velec,cutoff_mask);
1503 velec = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1504 velecsum = _fjsp_add_v2r8(velecsum,velec);
1508 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1510 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1512 /* Update vectorial force */
1513 fix2 = _fjsp_madd_v2r8(dx22,fscal,fix2);
1514 fiy2 = _fjsp_madd_v2r8(dy22,fscal,fiy2);
1515 fiz2 = _fjsp_madd_v2r8(dz22,fscal,fiz2);
1517 fjx2 = _fjsp_madd_v2r8(dx22,fscal,fjx2);
1518 fjy2 = _fjsp_madd_v2r8(dy22,fscal,fjy2);
1519 fjz2 = _fjsp_madd_v2r8(dz22,fscal,fjz2);
1523 gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1525 /* Inner loop uses 630 flops */
1528 /* End of innermost loop */
1530 gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1531 f+i_coord_offset,fshift+i_shift_offset);
1534 /* Update potential energies */
1535 gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
1536 gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
1538 /* Increment number of inner iterations */
1539 inneriter += j_index_end - j_index_start;
1541 /* Outer loop uses 20 flops */
1544 /* Increment number of outer iterations */
1547 /* Update outer/inner flops */
1549 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*630);
1552 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sparc64_hpc_ace_double
1553 * Electrostatics interaction: Ewald
1554 * VdW interaction: LennardJones
1555 * Geometry: Water3-Water3
1556 * Calculate force/pot: Force
1559 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sparc64_hpc_ace_double
1560 (t_nblist * gmx_restrict nlist,
1561 rvec * gmx_restrict xx,
1562 rvec * gmx_restrict ff,
1563 t_forcerec * gmx_restrict fr,
1564 t_mdatoms * gmx_restrict mdatoms,
1565 nb_kernel_data_t * gmx_restrict kernel_data,
1566 t_nrnb * gmx_restrict nrnb)
1568 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1569 * just 0 for non-waters.
1570 * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
1571 * jnr indices corresponding to data put in the two positions in the SIMD register.
1573 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1574 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1576 int j_coord_offsetA,j_coord_offsetB;
1577 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1578 real rcutoff_scalar;
1579 real *shiftvec,*fshift,*x,*f;
1580 _fjsp_v2r8 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1582 _fjsp_v2r8 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1584 _fjsp_v2r8 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1586 _fjsp_v2r8 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1587 int vdwjidx0A,vdwjidx0B;
1588 _fjsp_v2r8 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1589 int vdwjidx1A,vdwjidx1B;
1590 _fjsp_v2r8 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1591 int vdwjidx2A,vdwjidx2B;
1592 _fjsp_v2r8 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1593 _fjsp_v2r8 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1594 _fjsp_v2r8 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1595 _fjsp_v2r8 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1596 _fjsp_v2r8 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1597 _fjsp_v2r8 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1598 _fjsp_v2r8 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1599 _fjsp_v2r8 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1600 _fjsp_v2r8 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1601 _fjsp_v2r8 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1602 _fjsp_v2r8 velec,felec,velecsum,facel,crf,krf,krf2;
1605 _fjsp_v2r8 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1608 _fjsp_v2r8 one_sixth = gmx_fjsp_set1_v2r8(1.0/6.0);
1609 _fjsp_v2r8 one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
1610 _fjsp_v2r8 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1612 _fjsp_v2r8 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1613 real rswitch_scalar,d_scalar;
1614 _fjsp_v2r8 itab_tmp;
1615 _fjsp_v2r8 dummy_mask,cutoff_mask;
1616 _fjsp_v2r8 one = gmx_fjsp_set1_v2r8(1.0);
1617 _fjsp_v2r8 two = gmx_fjsp_set1_v2r8(2.0);
1618 union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
1625 jindex = nlist->jindex;
1627 shiftidx = nlist->shift;
1629 shiftvec = fr->shift_vec[0];
1630 fshift = fr->fshift[0];
1631 facel = gmx_fjsp_set1_v2r8(fr->epsfac);
1632 charge = mdatoms->chargeA;
1633 nvdwtype = fr->ntype;
1634 vdwparam = fr->nbfp;
1635 vdwtype = mdatoms->typeA;
1637 sh_ewald = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
1638 ewtab = fr->ic->tabq_coul_FDV0;
1639 ewtabscale = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
1640 ewtabhalfspace = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
1642 /* Setup water-specific parameters */
1643 inr = nlist->iinr[0];
1644 iq0 = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
1645 iq1 = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
1646 iq2 = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
1647 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1649 jq0 = gmx_fjsp_set1_v2r8(charge[inr+0]);
1650 jq1 = gmx_fjsp_set1_v2r8(charge[inr+1]);
1651 jq2 = gmx_fjsp_set1_v2r8(charge[inr+2]);
1652 vdwjidx0A = 2*vdwtype[inr+0];
1653 qq00 = _fjsp_mul_v2r8(iq0,jq0);
1654 c6_00 = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
1655 c12_00 = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
1656 qq01 = _fjsp_mul_v2r8(iq0,jq1);
1657 qq02 = _fjsp_mul_v2r8(iq0,jq2);
1658 qq10 = _fjsp_mul_v2r8(iq1,jq0);
1659 qq11 = _fjsp_mul_v2r8(iq1,jq1);
1660 qq12 = _fjsp_mul_v2r8(iq1,jq2);
1661 qq20 = _fjsp_mul_v2r8(iq2,jq0);
1662 qq21 = _fjsp_mul_v2r8(iq2,jq1);
1663 qq22 = _fjsp_mul_v2r8(iq2,jq2);
1665 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1666 rcutoff_scalar = fr->rcoulomb;
1667 rcutoff = gmx_fjsp_set1_v2r8(rcutoff_scalar);
1668 rcutoff2 = _fjsp_mul_v2r8(rcutoff,rcutoff);
1670 rswitch_scalar = fr->rcoulomb_switch;
1671 rswitch = gmx_fjsp_set1_v2r8(rswitch_scalar);
1672 /* Setup switch parameters */
1673 d_scalar = rcutoff_scalar-rswitch_scalar;
1674 d = gmx_fjsp_set1_v2r8(d_scalar);
1675 swV3 = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
1676 swV4 = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1677 swV5 = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1678 swF2 = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
1679 swF3 = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1680 swF4 = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1682 /* Avoid stupid compiler warnings */
1684 j_coord_offsetA = 0;
1685 j_coord_offsetB = 0;
1690 /* Start outer loop over neighborlists */
1691 for(iidx=0; iidx<nri; iidx++)
1693 /* Load shift vector for this list */
1694 i_shift_offset = DIM*shiftidx[iidx];
1696 /* Load limits for loop over neighbors */
1697 j_index_start = jindex[iidx];
1698 j_index_end = jindex[iidx+1];
1700 /* Get outer coordinate index */
1702 i_coord_offset = DIM*inr;
1704 /* Load i particle coords and add shift vector */
1705 gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
1706 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1708 fix0 = _fjsp_setzero_v2r8();
1709 fiy0 = _fjsp_setzero_v2r8();
1710 fiz0 = _fjsp_setzero_v2r8();
1711 fix1 = _fjsp_setzero_v2r8();
1712 fiy1 = _fjsp_setzero_v2r8();
1713 fiz1 = _fjsp_setzero_v2r8();
1714 fix2 = _fjsp_setzero_v2r8();
1715 fiy2 = _fjsp_setzero_v2r8();
1716 fiz2 = _fjsp_setzero_v2r8();
1718 /* Start inner kernel loop */
1719 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1722 /* Get j neighbor index, and coordinate index */
1724 jnrB = jjnr[jidx+1];
1725 j_coord_offsetA = DIM*jnrA;
1726 j_coord_offsetB = DIM*jnrB;
1728 /* load j atom coordinates */
1729 gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
1730 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1732 /* Calculate displacement vector */
1733 dx00 = _fjsp_sub_v2r8(ix0,jx0);
1734 dy00 = _fjsp_sub_v2r8(iy0,jy0);
1735 dz00 = _fjsp_sub_v2r8(iz0,jz0);
1736 dx01 = _fjsp_sub_v2r8(ix0,jx1);
1737 dy01 = _fjsp_sub_v2r8(iy0,jy1);
1738 dz01 = _fjsp_sub_v2r8(iz0,jz1);
1739 dx02 = _fjsp_sub_v2r8(ix0,jx2);
1740 dy02 = _fjsp_sub_v2r8(iy0,jy2);
1741 dz02 = _fjsp_sub_v2r8(iz0,jz2);
1742 dx10 = _fjsp_sub_v2r8(ix1,jx0);
1743 dy10 = _fjsp_sub_v2r8(iy1,jy0);
1744 dz10 = _fjsp_sub_v2r8(iz1,jz0);
1745 dx11 = _fjsp_sub_v2r8(ix1,jx1);
1746 dy11 = _fjsp_sub_v2r8(iy1,jy1);
1747 dz11 = _fjsp_sub_v2r8(iz1,jz1);
1748 dx12 = _fjsp_sub_v2r8(ix1,jx2);
1749 dy12 = _fjsp_sub_v2r8(iy1,jy2);
1750 dz12 = _fjsp_sub_v2r8(iz1,jz2);
1751 dx20 = _fjsp_sub_v2r8(ix2,jx0);
1752 dy20 = _fjsp_sub_v2r8(iy2,jy0);
1753 dz20 = _fjsp_sub_v2r8(iz2,jz0);
1754 dx21 = _fjsp_sub_v2r8(ix2,jx1);
1755 dy21 = _fjsp_sub_v2r8(iy2,jy1);
1756 dz21 = _fjsp_sub_v2r8(iz2,jz1);
1757 dx22 = _fjsp_sub_v2r8(ix2,jx2);
1758 dy22 = _fjsp_sub_v2r8(iy2,jy2);
1759 dz22 = _fjsp_sub_v2r8(iz2,jz2);
1761 /* Calculate squared distance and things based on it */
1762 rsq00 = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
1763 rsq01 = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
1764 rsq02 = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
1765 rsq10 = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
1766 rsq11 = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
1767 rsq12 = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
1768 rsq20 = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
1769 rsq21 = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
1770 rsq22 = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
1772 rinv00 = gmx_fjsp_invsqrt_v2r8(rsq00);
1773 rinv01 = gmx_fjsp_invsqrt_v2r8(rsq01);
1774 rinv02 = gmx_fjsp_invsqrt_v2r8(rsq02);
1775 rinv10 = gmx_fjsp_invsqrt_v2r8(rsq10);
1776 rinv11 = gmx_fjsp_invsqrt_v2r8(rsq11);
1777 rinv12 = gmx_fjsp_invsqrt_v2r8(rsq12);
1778 rinv20 = gmx_fjsp_invsqrt_v2r8(rsq20);
1779 rinv21 = gmx_fjsp_invsqrt_v2r8(rsq21);
1780 rinv22 = gmx_fjsp_invsqrt_v2r8(rsq22);
1782 rinvsq00 = _fjsp_mul_v2r8(rinv00,rinv00);
1783 rinvsq01 = _fjsp_mul_v2r8(rinv01,rinv01);
1784 rinvsq02 = _fjsp_mul_v2r8(rinv02,rinv02);
1785 rinvsq10 = _fjsp_mul_v2r8(rinv10,rinv10);
1786 rinvsq11 = _fjsp_mul_v2r8(rinv11,rinv11);
1787 rinvsq12 = _fjsp_mul_v2r8(rinv12,rinv12);
1788 rinvsq20 = _fjsp_mul_v2r8(rinv20,rinv20);
1789 rinvsq21 = _fjsp_mul_v2r8(rinv21,rinv21);
1790 rinvsq22 = _fjsp_mul_v2r8(rinv22,rinv22);
1792 fjx0 = _fjsp_setzero_v2r8();
1793 fjy0 = _fjsp_setzero_v2r8();
1794 fjz0 = _fjsp_setzero_v2r8();
1795 fjx1 = _fjsp_setzero_v2r8();
1796 fjy1 = _fjsp_setzero_v2r8();
1797 fjz1 = _fjsp_setzero_v2r8();
1798 fjx2 = _fjsp_setzero_v2r8();
1799 fjy2 = _fjsp_setzero_v2r8();
1800 fjz2 = _fjsp_setzero_v2r8();
1802 /**************************
1803 * CALCULATE INTERACTIONS *
1804 **************************/
1806 if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
1809 r00 = _fjsp_mul_v2r8(rsq00,rinv00);
1811 /* EWALD ELECTROSTATICS */
1813 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1814 ewrt = _fjsp_mul_v2r8(r00,ewtabscale);
1815 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1816 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1817 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1819 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1820 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
1821 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1822 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1823 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
1824 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1825 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1826 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1827 velec = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
1828 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
1830 /* LENNARD-JONES DISPERSION/REPULSION */
1832 rinvsix = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
1833 vvdw6 = _fjsp_mul_v2r8(c6_00,rinvsix);
1834 vvdw12 = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
1835 vvdw = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
1836 fvdw = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
1838 d = _fjsp_sub_v2r8(r00,rswitch);
1839 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1840 d2 = _fjsp_mul_v2r8(d,d);
1841 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1843 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1845 /* Evaluate switch function */
1846 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1847 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
1848 fvdw = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
1849 cutoff_mask = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
1851 fscal = _fjsp_add_v2r8(felec,fvdw);
1853 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1855 /* Update vectorial force */
1856 fix0 = _fjsp_madd_v2r8(dx00,fscal,fix0);
1857 fiy0 = _fjsp_madd_v2r8(dy00,fscal,fiy0);
1858 fiz0 = _fjsp_madd_v2r8(dz00,fscal,fiz0);
1860 fjx0 = _fjsp_madd_v2r8(dx00,fscal,fjx0);
1861 fjy0 = _fjsp_madd_v2r8(dy00,fscal,fjy0);
1862 fjz0 = _fjsp_madd_v2r8(dz00,fscal,fjz0);
1866 /**************************
1867 * CALCULATE INTERACTIONS *
1868 **************************/
1870 if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
1873 r01 = _fjsp_mul_v2r8(rsq01,rinv01);
1875 /* EWALD ELECTROSTATICS */
1877 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1878 ewrt = _fjsp_mul_v2r8(r01,ewtabscale);
1879 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1880 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1881 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1883 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1884 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
1885 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1886 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1887 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
1888 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1889 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1890 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1891 velec = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
1892 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
1894 d = _fjsp_sub_v2r8(r01,rswitch);
1895 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1896 d2 = _fjsp_mul_v2r8(d,d);
1897 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1899 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1901 /* Evaluate switch function */
1902 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1903 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv01,_fjsp_mul_v2r8(velec,dsw)) );
1904 cutoff_mask = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
1908 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1910 /* Update vectorial force */
1911 fix0 = _fjsp_madd_v2r8(dx01,fscal,fix0);
1912 fiy0 = _fjsp_madd_v2r8(dy01,fscal,fiy0);
1913 fiz0 = _fjsp_madd_v2r8(dz01,fscal,fiz0);
1915 fjx1 = _fjsp_madd_v2r8(dx01,fscal,fjx1);
1916 fjy1 = _fjsp_madd_v2r8(dy01,fscal,fjy1);
1917 fjz1 = _fjsp_madd_v2r8(dz01,fscal,fjz1);
1921 /**************************
1922 * CALCULATE INTERACTIONS *
1923 **************************/
1925 if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
1928 r02 = _fjsp_mul_v2r8(rsq02,rinv02);
1930 /* EWALD ELECTROSTATICS */
1932 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1933 ewrt = _fjsp_mul_v2r8(r02,ewtabscale);
1934 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1935 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1936 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1938 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1939 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
1940 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1941 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1942 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
1943 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1944 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1945 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1946 velec = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
1947 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
1949 d = _fjsp_sub_v2r8(r02,rswitch);
1950 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1951 d2 = _fjsp_mul_v2r8(d,d);
1952 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1954 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1956 /* Evaluate switch function */
1957 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1958 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv02,_fjsp_mul_v2r8(velec,dsw)) );
1959 cutoff_mask = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
1963 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1965 /* Update vectorial force */
1966 fix0 = _fjsp_madd_v2r8(dx02,fscal,fix0);
1967 fiy0 = _fjsp_madd_v2r8(dy02,fscal,fiy0);
1968 fiz0 = _fjsp_madd_v2r8(dz02,fscal,fiz0);
1970 fjx2 = _fjsp_madd_v2r8(dx02,fscal,fjx2);
1971 fjy2 = _fjsp_madd_v2r8(dy02,fscal,fjy2);
1972 fjz2 = _fjsp_madd_v2r8(dz02,fscal,fjz2);
1976 /**************************
1977 * CALCULATE INTERACTIONS *
1978 **************************/
1980 if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
1983 r10 = _fjsp_mul_v2r8(rsq10,rinv10);
1985 /* EWALD ELECTROSTATICS */
1987 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1988 ewrt = _fjsp_mul_v2r8(r10,ewtabscale);
1989 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1990 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1991 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1993 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1994 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
1995 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1996 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1997 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
1998 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1999 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2000 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2001 velec = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
2002 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
2004 d = _fjsp_sub_v2r8(r10,rswitch);
2005 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2006 d2 = _fjsp_mul_v2r8(d,d);
2007 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2009 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2011 /* Evaluate switch function */
2012 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2013 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
2014 cutoff_mask = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
2018 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2020 /* Update vectorial force */
2021 fix1 = _fjsp_madd_v2r8(dx10,fscal,fix1);
2022 fiy1 = _fjsp_madd_v2r8(dy10,fscal,fiy1);
2023 fiz1 = _fjsp_madd_v2r8(dz10,fscal,fiz1);
2025 fjx0 = _fjsp_madd_v2r8(dx10,fscal,fjx0);
2026 fjy0 = _fjsp_madd_v2r8(dy10,fscal,fjy0);
2027 fjz0 = _fjsp_madd_v2r8(dz10,fscal,fjz0);
2031 /**************************
2032 * CALCULATE INTERACTIONS *
2033 **************************/
2035 if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
2038 r11 = _fjsp_mul_v2r8(rsq11,rinv11);
2040 /* EWALD ELECTROSTATICS */
2042 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2043 ewrt = _fjsp_mul_v2r8(r11,ewtabscale);
2044 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2045 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2046 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2048 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2049 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
2050 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2051 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2052 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
2053 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2054 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2055 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2056 velec = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
2057 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
2059 d = _fjsp_sub_v2r8(r11,rswitch);
2060 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2061 d2 = _fjsp_mul_v2r8(d,d);
2062 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2064 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2066 /* Evaluate switch function */
2067 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2068 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
2069 cutoff_mask = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
2073 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2075 /* Update vectorial force */
2076 fix1 = _fjsp_madd_v2r8(dx11,fscal,fix1);
2077 fiy1 = _fjsp_madd_v2r8(dy11,fscal,fiy1);
2078 fiz1 = _fjsp_madd_v2r8(dz11,fscal,fiz1);
2080 fjx1 = _fjsp_madd_v2r8(dx11,fscal,fjx1);
2081 fjy1 = _fjsp_madd_v2r8(dy11,fscal,fjy1);
2082 fjz1 = _fjsp_madd_v2r8(dz11,fscal,fjz1);
2086 /**************************
2087 * CALCULATE INTERACTIONS *
2088 **************************/
2090 if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
2093 r12 = _fjsp_mul_v2r8(rsq12,rinv12);
2095 /* EWALD ELECTROSTATICS */
2097 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2098 ewrt = _fjsp_mul_v2r8(r12,ewtabscale);
2099 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2100 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2101 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2103 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2104 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
2105 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2106 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2107 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
2108 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2109 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2110 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2111 velec = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
2112 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
2114 d = _fjsp_sub_v2r8(r12,rswitch);
2115 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2116 d2 = _fjsp_mul_v2r8(d,d);
2117 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2119 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2121 /* Evaluate switch function */
2122 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2123 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
2124 cutoff_mask = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
2128 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2130 /* Update vectorial force */
2131 fix1 = _fjsp_madd_v2r8(dx12,fscal,fix1);
2132 fiy1 = _fjsp_madd_v2r8(dy12,fscal,fiy1);
2133 fiz1 = _fjsp_madd_v2r8(dz12,fscal,fiz1);
2135 fjx2 = _fjsp_madd_v2r8(dx12,fscal,fjx2);
2136 fjy2 = _fjsp_madd_v2r8(dy12,fscal,fjy2);
2137 fjz2 = _fjsp_madd_v2r8(dz12,fscal,fjz2);
2141 /**************************
2142 * CALCULATE INTERACTIONS *
2143 **************************/
2145 if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
2148 r20 = _fjsp_mul_v2r8(rsq20,rinv20);
2150 /* EWALD ELECTROSTATICS */
2152 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2153 ewrt = _fjsp_mul_v2r8(r20,ewtabscale);
2154 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2155 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2156 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2158 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2159 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
2160 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2161 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2162 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
2163 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2164 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2165 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2166 velec = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
2167 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
2169 d = _fjsp_sub_v2r8(r20,rswitch);
2170 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2171 d2 = _fjsp_mul_v2r8(d,d);
2172 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2174 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2176 /* Evaluate switch function */
2177 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2178 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
2179 cutoff_mask = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
2183 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2185 /* Update vectorial force */
2186 fix2 = _fjsp_madd_v2r8(dx20,fscal,fix2);
2187 fiy2 = _fjsp_madd_v2r8(dy20,fscal,fiy2);
2188 fiz2 = _fjsp_madd_v2r8(dz20,fscal,fiz2);
2190 fjx0 = _fjsp_madd_v2r8(dx20,fscal,fjx0);
2191 fjy0 = _fjsp_madd_v2r8(dy20,fscal,fjy0);
2192 fjz0 = _fjsp_madd_v2r8(dz20,fscal,fjz0);
2196 /**************************
2197 * CALCULATE INTERACTIONS *
2198 **************************/
2200 if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
2203 r21 = _fjsp_mul_v2r8(rsq21,rinv21);
2205 /* EWALD ELECTROSTATICS */
2207 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2208 ewrt = _fjsp_mul_v2r8(r21,ewtabscale);
2209 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2210 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2211 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2213 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2214 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
2215 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2216 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2217 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
2218 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2219 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2220 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2221 velec = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
2222 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
2224 d = _fjsp_sub_v2r8(r21,rswitch);
2225 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2226 d2 = _fjsp_mul_v2r8(d,d);
2227 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2229 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2231 /* Evaluate switch function */
2232 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2233 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
2234 cutoff_mask = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
2238 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2240 /* Update vectorial force */
2241 fix2 = _fjsp_madd_v2r8(dx21,fscal,fix2);
2242 fiy2 = _fjsp_madd_v2r8(dy21,fscal,fiy2);
2243 fiz2 = _fjsp_madd_v2r8(dz21,fscal,fiz2);
2245 fjx1 = _fjsp_madd_v2r8(dx21,fscal,fjx1);
2246 fjy1 = _fjsp_madd_v2r8(dy21,fscal,fjy1);
2247 fjz1 = _fjsp_madd_v2r8(dz21,fscal,fjz1);
2251 /**************************
2252 * CALCULATE INTERACTIONS *
2253 **************************/
2255 if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
2258 r22 = _fjsp_mul_v2r8(rsq22,rinv22);
2260 /* EWALD ELECTROSTATICS */
2262 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2263 ewrt = _fjsp_mul_v2r8(r22,ewtabscale);
2264 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2265 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2266 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2268 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2269 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
2270 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2271 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2272 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
2273 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2274 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2275 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2276 velec = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
2277 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
2279 d = _fjsp_sub_v2r8(r22,rswitch);
2280 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2281 d2 = _fjsp_mul_v2r8(d,d);
2282 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2284 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2286 /* Evaluate switch function */
2287 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2288 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
2289 cutoff_mask = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
2293 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2295 /* Update vectorial force */
2296 fix2 = _fjsp_madd_v2r8(dx22,fscal,fix2);
2297 fiy2 = _fjsp_madd_v2r8(dy22,fscal,fiy2);
2298 fiz2 = _fjsp_madd_v2r8(dz22,fscal,fiz2);
2300 fjx2 = _fjsp_madd_v2r8(dx22,fscal,fjx2);
2301 fjy2 = _fjsp_madd_v2r8(dy22,fscal,fjy2);
2302 fjz2 = _fjsp_madd_v2r8(dz22,fscal,fjz2);
2306 gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2308 /* Inner loop uses 600 flops */
2311 if(jidx<j_index_end)
2315 j_coord_offsetA = DIM*jnrA;
2317 /* load j atom coordinates */
2318 gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
2319 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2321 /* Calculate displacement vector */
2322 dx00 = _fjsp_sub_v2r8(ix0,jx0);
2323 dy00 = _fjsp_sub_v2r8(iy0,jy0);
2324 dz00 = _fjsp_sub_v2r8(iz0,jz0);
2325 dx01 = _fjsp_sub_v2r8(ix0,jx1);
2326 dy01 = _fjsp_sub_v2r8(iy0,jy1);
2327 dz01 = _fjsp_sub_v2r8(iz0,jz1);
2328 dx02 = _fjsp_sub_v2r8(ix0,jx2);
2329 dy02 = _fjsp_sub_v2r8(iy0,jy2);
2330 dz02 = _fjsp_sub_v2r8(iz0,jz2);
2331 dx10 = _fjsp_sub_v2r8(ix1,jx0);
2332 dy10 = _fjsp_sub_v2r8(iy1,jy0);
2333 dz10 = _fjsp_sub_v2r8(iz1,jz0);
2334 dx11 = _fjsp_sub_v2r8(ix1,jx1);
2335 dy11 = _fjsp_sub_v2r8(iy1,jy1);
2336 dz11 = _fjsp_sub_v2r8(iz1,jz1);
2337 dx12 = _fjsp_sub_v2r8(ix1,jx2);
2338 dy12 = _fjsp_sub_v2r8(iy1,jy2);
2339 dz12 = _fjsp_sub_v2r8(iz1,jz2);
2340 dx20 = _fjsp_sub_v2r8(ix2,jx0);
2341 dy20 = _fjsp_sub_v2r8(iy2,jy0);
2342 dz20 = _fjsp_sub_v2r8(iz2,jz0);
2343 dx21 = _fjsp_sub_v2r8(ix2,jx1);
2344 dy21 = _fjsp_sub_v2r8(iy2,jy1);
2345 dz21 = _fjsp_sub_v2r8(iz2,jz1);
2346 dx22 = _fjsp_sub_v2r8(ix2,jx2);
2347 dy22 = _fjsp_sub_v2r8(iy2,jy2);
2348 dz22 = _fjsp_sub_v2r8(iz2,jz2);
2350 /* Calculate squared distance and things based on it */
2351 rsq00 = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
2352 rsq01 = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
2353 rsq02 = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
2354 rsq10 = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
2355 rsq11 = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
2356 rsq12 = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
2357 rsq20 = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
2358 rsq21 = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
2359 rsq22 = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
2361 rinv00 = gmx_fjsp_invsqrt_v2r8(rsq00);
2362 rinv01 = gmx_fjsp_invsqrt_v2r8(rsq01);
2363 rinv02 = gmx_fjsp_invsqrt_v2r8(rsq02);
2364 rinv10 = gmx_fjsp_invsqrt_v2r8(rsq10);
2365 rinv11 = gmx_fjsp_invsqrt_v2r8(rsq11);
2366 rinv12 = gmx_fjsp_invsqrt_v2r8(rsq12);
2367 rinv20 = gmx_fjsp_invsqrt_v2r8(rsq20);
2368 rinv21 = gmx_fjsp_invsqrt_v2r8(rsq21);
2369 rinv22 = gmx_fjsp_invsqrt_v2r8(rsq22);
2371 rinvsq00 = _fjsp_mul_v2r8(rinv00,rinv00);
2372 rinvsq01 = _fjsp_mul_v2r8(rinv01,rinv01);
2373 rinvsq02 = _fjsp_mul_v2r8(rinv02,rinv02);
2374 rinvsq10 = _fjsp_mul_v2r8(rinv10,rinv10);
2375 rinvsq11 = _fjsp_mul_v2r8(rinv11,rinv11);
2376 rinvsq12 = _fjsp_mul_v2r8(rinv12,rinv12);
2377 rinvsq20 = _fjsp_mul_v2r8(rinv20,rinv20);
2378 rinvsq21 = _fjsp_mul_v2r8(rinv21,rinv21);
2379 rinvsq22 = _fjsp_mul_v2r8(rinv22,rinv22);
2381 fjx0 = _fjsp_setzero_v2r8();
2382 fjy0 = _fjsp_setzero_v2r8();
2383 fjz0 = _fjsp_setzero_v2r8();
2384 fjx1 = _fjsp_setzero_v2r8();
2385 fjy1 = _fjsp_setzero_v2r8();
2386 fjz1 = _fjsp_setzero_v2r8();
2387 fjx2 = _fjsp_setzero_v2r8();
2388 fjy2 = _fjsp_setzero_v2r8();
2389 fjz2 = _fjsp_setzero_v2r8();
2391 /**************************
2392 * CALCULATE INTERACTIONS *
2393 **************************/
2395 if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
2398 r00 = _fjsp_mul_v2r8(rsq00,rinv00);
2400 /* EWALD ELECTROSTATICS */
2402 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2403 ewrt = _fjsp_mul_v2r8(r00,ewtabscale);
2404 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2405 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2406 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2408 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2409 ewtabD = _fjsp_setzero_v2r8();
2410 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2411 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2412 ewtabFn = _fjsp_setzero_v2r8();
2413 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2414 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2415 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2416 velec = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
2417 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
2419 /* LENNARD-JONES DISPERSION/REPULSION */
2421 rinvsix = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
2422 vvdw6 = _fjsp_mul_v2r8(c6_00,rinvsix);
2423 vvdw12 = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
2424 vvdw = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
2425 fvdw = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
2427 d = _fjsp_sub_v2r8(r00,rswitch);
2428 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2429 d2 = _fjsp_mul_v2r8(d,d);
2430 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2432 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2434 /* Evaluate switch function */
2435 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2436 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
2437 fvdw = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
2438 cutoff_mask = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
2440 fscal = _fjsp_add_v2r8(felec,fvdw);
2442 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2444 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2446 /* Update vectorial force */
2447 fix0 = _fjsp_madd_v2r8(dx00,fscal,fix0);
2448 fiy0 = _fjsp_madd_v2r8(dy00,fscal,fiy0);
2449 fiz0 = _fjsp_madd_v2r8(dz00,fscal,fiz0);
2451 fjx0 = _fjsp_madd_v2r8(dx00,fscal,fjx0);
2452 fjy0 = _fjsp_madd_v2r8(dy00,fscal,fjy0);
2453 fjz0 = _fjsp_madd_v2r8(dz00,fscal,fjz0);
2457 /**************************
2458 * CALCULATE INTERACTIONS *
2459 **************************/
2461 if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
2464 r01 = _fjsp_mul_v2r8(rsq01,rinv01);
2466 /* EWALD ELECTROSTATICS */
2468 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2469 ewrt = _fjsp_mul_v2r8(r01,ewtabscale);
2470 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2471 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2472 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2474 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2475 ewtabD = _fjsp_setzero_v2r8();
2476 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2477 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2478 ewtabFn = _fjsp_setzero_v2r8();
2479 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2480 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2481 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2482 velec = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
2483 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
2485 d = _fjsp_sub_v2r8(r01,rswitch);
2486 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2487 d2 = _fjsp_mul_v2r8(d,d);
2488 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2490 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2492 /* Evaluate switch function */
2493 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2494 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv01,_fjsp_mul_v2r8(velec,dsw)) );
2495 cutoff_mask = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
2499 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2501 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2503 /* Update vectorial force */
2504 fix0 = _fjsp_madd_v2r8(dx01,fscal,fix0);
2505 fiy0 = _fjsp_madd_v2r8(dy01,fscal,fiy0);
2506 fiz0 = _fjsp_madd_v2r8(dz01,fscal,fiz0);
2508 fjx1 = _fjsp_madd_v2r8(dx01,fscal,fjx1);
2509 fjy1 = _fjsp_madd_v2r8(dy01,fscal,fjy1);
2510 fjz1 = _fjsp_madd_v2r8(dz01,fscal,fjz1);
2514 /**************************
2515 * CALCULATE INTERACTIONS *
2516 **************************/
2518 if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
2521 r02 = _fjsp_mul_v2r8(rsq02,rinv02);
2523 /* EWALD ELECTROSTATICS */
2525 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2526 ewrt = _fjsp_mul_v2r8(r02,ewtabscale);
2527 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2528 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2529 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2531 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2532 ewtabD = _fjsp_setzero_v2r8();
2533 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2534 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2535 ewtabFn = _fjsp_setzero_v2r8();
2536 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2537 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2538 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2539 velec = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
2540 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
2542 d = _fjsp_sub_v2r8(r02,rswitch);
2543 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2544 d2 = _fjsp_mul_v2r8(d,d);
2545 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2547 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2549 /* Evaluate switch function */
2550 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2551 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv02,_fjsp_mul_v2r8(velec,dsw)) );
2552 cutoff_mask = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
2556 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2558 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2560 /* Update vectorial force */
2561 fix0 = _fjsp_madd_v2r8(dx02,fscal,fix0);
2562 fiy0 = _fjsp_madd_v2r8(dy02,fscal,fiy0);
2563 fiz0 = _fjsp_madd_v2r8(dz02,fscal,fiz0);
2565 fjx2 = _fjsp_madd_v2r8(dx02,fscal,fjx2);
2566 fjy2 = _fjsp_madd_v2r8(dy02,fscal,fjy2);
2567 fjz2 = _fjsp_madd_v2r8(dz02,fscal,fjz2);
2571 /**************************
2572 * CALCULATE INTERACTIONS *
2573 **************************/
2575 if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
2578 r10 = _fjsp_mul_v2r8(rsq10,rinv10);
2580 /* EWALD ELECTROSTATICS */
2582 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2583 ewrt = _fjsp_mul_v2r8(r10,ewtabscale);
2584 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2585 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2586 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2588 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2589 ewtabD = _fjsp_setzero_v2r8();
2590 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2591 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2592 ewtabFn = _fjsp_setzero_v2r8();
2593 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2594 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2595 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2596 velec = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
2597 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
2599 d = _fjsp_sub_v2r8(r10,rswitch);
2600 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2601 d2 = _fjsp_mul_v2r8(d,d);
2602 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2604 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2606 /* Evaluate switch function */
2607 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2608 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
2609 cutoff_mask = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
2613 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2615 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2617 /* Update vectorial force */
2618 fix1 = _fjsp_madd_v2r8(dx10,fscal,fix1);
2619 fiy1 = _fjsp_madd_v2r8(dy10,fscal,fiy1);
2620 fiz1 = _fjsp_madd_v2r8(dz10,fscal,fiz1);
2622 fjx0 = _fjsp_madd_v2r8(dx10,fscal,fjx0);
2623 fjy0 = _fjsp_madd_v2r8(dy10,fscal,fjy0);
2624 fjz0 = _fjsp_madd_v2r8(dz10,fscal,fjz0);
2628 /**************************
2629 * CALCULATE INTERACTIONS *
2630 **************************/
2632 if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
2635 r11 = _fjsp_mul_v2r8(rsq11,rinv11);
2637 /* EWALD ELECTROSTATICS */
2639 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2640 ewrt = _fjsp_mul_v2r8(r11,ewtabscale);
2641 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2642 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2643 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2645 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2646 ewtabD = _fjsp_setzero_v2r8();
2647 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2648 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2649 ewtabFn = _fjsp_setzero_v2r8();
2650 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2651 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2652 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2653 velec = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
2654 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
2656 d = _fjsp_sub_v2r8(r11,rswitch);
2657 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2658 d2 = _fjsp_mul_v2r8(d,d);
2659 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2661 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2663 /* Evaluate switch function */
2664 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2665 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
2666 cutoff_mask = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
2670 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2672 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2674 /* Update vectorial force */
2675 fix1 = _fjsp_madd_v2r8(dx11,fscal,fix1);
2676 fiy1 = _fjsp_madd_v2r8(dy11,fscal,fiy1);
2677 fiz1 = _fjsp_madd_v2r8(dz11,fscal,fiz1);
2679 fjx1 = _fjsp_madd_v2r8(dx11,fscal,fjx1);
2680 fjy1 = _fjsp_madd_v2r8(dy11,fscal,fjy1);
2681 fjz1 = _fjsp_madd_v2r8(dz11,fscal,fjz1);
2685 /**************************
2686 * CALCULATE INTERACTIONS *
2687 **************************/
2689 if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
2692 r12 = _fjsp_mul_v2r8(rsq12,rinv12);
2694 /* EWALD ELECTROSTATICS */
2696 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2697 ewrt = _fjsp_mul_v2r8(r12,ewtabscale);
2698 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2699 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2700 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2702 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2703 ewtabD = _fjsp_setzero_v2r8();
2704 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2705 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2706 ewtabFn = _fjsp_setzero_v2r8();
2707 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2708 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2709 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2710 velec = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
2711 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
2713 d = _fjsp_sub_v2r8(r12,rswitch);
2714 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2715 d2 = _fjsp_mul_v2r8(d,d);
2716 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2718 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2720 /* Evaluate switch function */
2721 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2722 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
2723 cutoff_mask = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
2727 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2729 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2731 /* Update vectorial force */
2732 fix1 = _fjsp_madd_v2r8(dx12,fscal,fix1);
2733 fiy1 = _fjsp_madd_v2r8(dy12,fscal,fiy1);
2734 fiz1 = _fjsp_madd_v2r8(dz12,fscal,fiz1);
2736 fjx2 = _fjsp_madd_v2r8(dx12,fscal,fjx2);
2737 fjy2 = _fjsp_madd_v2r8(dy12,fscal,fjy2);
2738 fjz2 = _fjsp_madd_v2r8(dz12,fscal,fjz2);
2742 /**************************
2743 * CALCULATE INTERACTIONS *
2744 **************************/
2746 if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
2749 r20 = _fjsp_mul_v2r8(rsq20,rinv20);
2751 /* EWALD ELECTROSTATICS */
2753 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2754 ewrt = _fjsp_mul_v2r8(r20,ewtabscale);
2755 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2756 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2757 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2759 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2760 ewtabD = _fjsp_setzero_v2r8();
2761 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2762 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2763 ewtabFn = _fjsp_setzero_v2r8();
2764 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2765 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2766 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2767 velec = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
2768 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
2770 d = _fjsp_sub_v2r8(r20,rswitch);
2771 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2772 d2 = _fjsp_mul_v2r8(d,d);
2773 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2775 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2777 /* Evaluate switch function */
2778 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2779 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
2780 cutoff_mask = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
2784 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2786 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2788 /* Update vectorial force */
2789 fix2 = _fjsp_madd_v2r8(dx20,fscal,fix2);
2790 fiy2 = _fjsp_madd_v2r8(dy20,fscal,fiy2);
2791 fiz2 = _fjsp_madd_v2r8(dz20,fscal,fiz2);
2793 fjx0 = _fjsp_madd_v2r8(dx20,fscal,fjx0);
2794 fjy0 = _fjsp_madd_v2r8(dy20,fscal,fjy0);
2795 fjz0 = _fjsp_madd_v2r8(dz20,fscal,fjz0);
2799 /**************************
2800 * CALCULATE INTERACTIONS *
2801 **************************/
2803 if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
2806 r21 = _fjsp_mul_v2r8(rsq21,rinv21);
2808 /* EWALD ELECTROSTATICS */
2810 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2811 ewrt = _fjsp_mul_v2r8(r21,ewtabscale);
2812 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2813 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2814 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2816 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2817 ewtabD = _fjsp_setzero_v2r8();
2818 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2819 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2820 ewtabFn = _fjsp_setzero_v2r8();
2821 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2822 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2823 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2824 velec = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
2825 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
2827 d = _fjsp_sub_v2r8(r21,rswitch);
2828 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2829 d2 = _fjsp_mul_v2r8(d,d);
2830 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2832 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2834 /* Evaluate switch function */
2835 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2836 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
2837 cutoff_mask = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
2841 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2843 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2845 /* Update vectorial force */
2846 fix2 = _fjsp_madd_v2r8(dx21,fscal,fix2);
2847 fiy2 = _fjsp_madd_v2r8(dy21,fscal,fiy2);
2848 fiz2 = _fjsp_madd_v2r8(dz21,fscal,fiz2);
2850 fjx1 = _fjsp_madd_v2r8(dx21,fscal,fjx1);
2851 fjy1 = _fjsp_madd_v2r8(dy21,fscal,fjy1);
2852 fjz1 = _fjsp_madd_v2r8(dz21,fscal,fjz1);
2856 /**************************
2857 * CALCULATE INTERACTIONS *
2858 **************************/
2860 if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
2863 r22 = _fjsp_mul_v2r8(rsq22,rinv22);
2865 /* EWALD ELECTROSTATICS */
2867 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2868 ewrt = _fjsp_mul_v2r8(r22,ewtabscale);
2869 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2870 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2871 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2873 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2874 ewtabD = _fjsp_setzero_v2r8();
2875 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2876 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2877 ewtabFn = _fjsp_setzero_v2r8();
2878 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2879 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2880 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2881 velec = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
2882 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
2884 d = _fjsp_sub_v2r8(r22,rswitch);
2885 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2886 d2 = _fjsp_mul_v2r8(d,d);
2887 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2889 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2891 /* Evaluate switch function */
2892 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2893 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
2894 cutoff_mask = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
2898 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2900 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2902 /* Update vectorial force */
2903 fix2 = _fjsp_madd_v2r8(dx22,fscal,fix2);
2904 fiy2 = _fjsp_madd_v2r8(dy22,fscal,fiy2);
2905 fiz2 = _fjsp_madd_v2r8(dz22,fscal,fiz2);
2907 fjx2 = _fjsp_madd_v2r8(dx22,fscal,fjx2);
2908 fjy2 = _fjsp_madd_v2r8(dy22,fscal,fjy2);
2909 fjz2 = _fjsp_madd_v2r8(dz22,fscal,fjz2);
2913 gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2915 /* Inner loop uses 600 flops */
2918 /* End of innermost loop */
2920 gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2921 f+i_coord_offset,fshift+i_shift_offset);
2923 /* Increment number of inner iterations */
2924 inneriter += j_index_end - j_index_start;
2926 /* Outer loop uses 18 flops */
2929 /* Increment number of outer iterations */
2932 /* Update outer/inner flops */
2934 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*600);