2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "kernelutil_sparc64_hpc_ace_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sparc64_hpc_ace_double
51 * Electrostatics interaction: Ewald
52 * VdW interaction: LennardJones
53 * Geometry: Water4-Water4
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sparc64_hpc_ace_double
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
69 * jnr indices corresponding to data put in the two positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int j_coord_offsetA,j_coord_offsetB;
75 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
77 real *shiftvec,*fshift,*x,*f;
78 _fjsp_v2r8 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
80 _fjsp_v2r8 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
82 _fjsp_v2r8 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
84 _fjsp_v2r8 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
86 _fjsp_v2r8 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
87 int vdwjidx0A,vdwjidx0B;
88 _fjsp_v2r8 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
89 int vdwjidx1A,vdwjidx1B;
90 _fjsp_v2r8 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
91 int vdwjidx2A,vdwjidx2B;
92 _fjsp_v2r8 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
93 int vdwjidx3A,vdwjidx3B;
94 _fjsp_v2r8 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
95 _fjsp_v2r8 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96 _fjsp_v2r8 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
97 _fjsp_v2r8 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
98 _fjsp_v2r8 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
99 _fjsp_v2r8 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
100 _fjsp_v2r8 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
101 _fjsp_v2r8 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
102 _fjsp_v2r8 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
103 _fjsp_v2r8 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
104 _fjsp_v2r8 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
105 _fjsp_v2r8 velec,felec,velecsum,facel,crf,krf,krf2;
108 _fjsp_v2r8 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
111 _fjsp_v2r8 one_sixth = gmx_fjsp_set1_v2r8(1.0/6.0);
112 _fjsp_v2r8 one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
113 _fjsp_v2r8 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
115 _fjsp_v2r8 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
116 real rswitch_scalar,d_scalar;
118 _fjsp_v2r8 dummy_mask,cutoff_mask;
119 _fjsp_v2r8 one = gmx_fjsp_set1_v2r8(1.0);
120 _fjsp_v2r8 two = gmx_fjsp_set1_v2r8(2.0);
121 union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
128 jindex = nlist->jindex;
130 shiftidx = nlist->shift;
132 shiftvec = fr->shift_vec[0];
133 fshift = fr->fshift[0];
134 facel = gmx_fjsp_set1_v2r8(fr->epsfac);
135 charge = mdatoms->chargeA;
136 nvdwtype = fr->ntype;
138 vdwtype = mdatoms->typeA;
140 sh_ewald = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
141 ewtab = fr->ic->tabq_coul_FDV0;
142 ewtabscale = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
143 ewtabhalfspace = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
145 /* Setup water-specific parameters */
146 inr = nlist->iinr[0];
147 iq1 = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
148 iq2 = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
149 iq3 = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
150 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
152 jq1 = gmx_fjsp_set1_v2r8(charge[inr+1]);
153 jq2 = gmx_fjsp_set1_v2r8(charge[inr+2]);
154 jq3 = gmx_fjsp_set1_v2r8(charge[inr+3]);
155 vdwjidx0A = 2*vdwtype[inr+0];
156 c6_00 = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
157 c12_00 = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
158 qq11 = _fjsp_mul_v2r8(iq1,jq1);
159 qq12 = _fjsp_mul_v2r8(iq1,jq2);
160 qq13 = _fjsp_mul_v2r8(iq1,jq3);
161 qq21 = _fjsp_mul_v2r8(iq2,jq1);
162 qq22 = _fjsp_mul_v2r8(iq2,jq2);
163 qq23 = _fjsp_mul_v2r8(iq2,jq3);
164 qq31 = _fjsp_mul_v2r8(iq3,jq1);
165 qq32 = _fjsp_mul_v2r8(iq3,jq2);
166 qq33 = _fjsp_mul_v2r8(iq3,jq3);
168 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
169 rcutoff_scalar = fr->rcoulomb;
170 rcutoff = gmx_fjsp_set1_v2r8(rcutoff_scalar);
171 rcutoff2 = _fjsp_mul_v2r8(rcutoff,rcutoff);
173 rswitch_scalar = fr->rcoulomb_switch;
174 rswitch = gmx_fjsp_set1_v2r8(rswitch_scalar);
175 /* Setup switch parameters */
176 d_scalar = rcutoff_scalar-rswitch_scalar;
177 d = gmx_fjsp_set1_v2r8(d_scalar);
178 swV3 = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
179 swV4 = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
180 swV5 = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
181 swF2 = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
182 swF3 = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
183 swF4 = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
185 /* Avoid stupid compiler warnings */
193 /* Start outer loop over neighborlists */
194 for(iidx=0; iidx<nri; iidx++)
196 /* Load shift vector for this list */
197 i_shift_offset = DIM*shiftidx[iidx];
199 /* Load limits for loop over neighbors */
200 j_index_start = jindex[iidx];
201 j_index_end = jindex[iidx+1];
203 /* Get outer coordinate index */
205 i_coord_offset = DIM*inr;
207 /* Load i particle coords and add shift vector */
208 gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
209 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
211 fix0 = _fjsp_setzero_v2r8();
212 fiy0 = _fjsp_setzero_v2r8();
213 fiz0 = _fjsp_setzero_v2r8();
214 fix1 = _fjsp_setzero_v2r8();
215 fiy1 = _fjsp_setzero_v2r8();
216 fiz1 = _fjsp_setzero_v2r8();
217 fix2 = _fjsp_setzero_v2r8();
218 fiy2 = _fjsp_setzero_v2r8();
219 fiz2 = _fjsp_setzero_v2r8();
220 fix3 = _fjsp_setzero_v2r8();
221 fiy3 = _fjsp_setzero_v2r8();
222 fiz3 = _fjsp_setzero_v2r8();
224 /* Reset potential sums */
225 velecsum = _fjsp_setzero_v2r8();
226 vvdwsum = _fjsp_setzero_v2r8();
228 /* Start inner kernel loop */
229 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
232 /* Get j neighbor index, and coordinate index */
235 j_coord_offsetA = DIM*jnrA;
236 j_coord_offsetB = DIM*jnrB;
238 /* load j atom coordinates */
239 gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
240 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
241 &jy2,&jz2,&jx3,&jy3,&jz3);
243 /* Calculate displacement vector */
244 dx00 = _fjsp_sub_v2r8(ix0,jx0);
245 dy00 = _fjsp_sub_v2r8(iy0,jy0);
246 dz00 = _fjsp_sub_v2r8(iz0,jz0);
247 dx11 = _fjsp_sub_v2r8(ix1,jx1);
248 dy11 = _fjsp_sub_v2r8(iy1,jy1);
249 dz11 = _fjsp_sub_v2r8(iz1,jz1);
250 dx12 = _fjsp_sub_v2r8(ix1,jx2);
251 dy12 = _fjsp_sub_v2r8(iy1,jy2);
252 dz12 = _fjsp_sub_v2r8(iz1,jz2);
253 dx13 = _fjsp_sub_v2r8(ix1,jx3);
254 dy13 = _fjsp_sub_v2r8(iy1,jy3);
255 dz13 = _fjsp_sub_v2r8(iz1,jz3);
256 dx21 = _fjsp_sub_v2r8(ix2,jx1);
257 dy21 = _fjsp_sub_v2r8(iy2,jy1);
258 dz21 = _fjsp_sub_v2r8(iz2,jz1);
259 dx22 = _fjsp_sub_v2r8(ix2,jx2);
260 dy22 = _fjsp_sub_v2r8(iy2,jy2);
261 dz22 = _fjsp_sub_v2r8(iz2,jz2);
262 dx23 = _fjsp_sub_v2r8(ix2,jx3);
263 dy23 = _fjsp_sub_v2r8(iy2,jy3);
264 dz23 = _fjsp_sub_v2r8(iz2,jz3);
265 dx31 = _fjsp_sub_v2r8(ix3,jx1);
266 dy31 = _fjsp_sub_v2r8(iy3,jy1);
267 dz31 = _fjsp_sub_v2r8(iz3,jz1);
268 dx32 = _fjsp_sub_v2r8(ix3,jx2);
269 dy32 = _fjsp_sub_v2r8(iy3,jy2);
270 dz32 = _fjsp_sub_v2r8(iz3,jz2);
271 dx33 = _fjsp_sub_v2r8(ix3,jx3);
272 dy33 = _fjsp_sub_v2r8(iy3,jy3);
273 dz33 = _fjsp_sub_v2r8(iz3,jz3);
275 /* Calculate squared distance and things based on it */
276 rsq00 = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
277 rsq11 = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
278 rsq12 = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
279 rsq13 = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
280 rsq21 = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
281 rsq22 = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
282 rsq23 = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
283 rsq31 = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
284 rsq32 = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
285 rsq33 = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
287 rinv00 = gmx_fjsp_invsqrt_v2r8(rsq00);
288 rinv11 = gmx_fjsp_invsqrt_v2r8(rsq11);
289 rinv12 = gmx_fjsp_invsqrt_v2r8(rsq12);
290 rinv13 = gmx_fjsp_invsqrt_v2r8(rsq13);
291 rinv21 = gmx_fjsp_invsqrt_v2r8(rsq21);
292 rinv22 = gmx_fjsp_invsqrt_v2r8(rsq22);
293 rinv23 = gmx_fjsp_invsqrt_v2r8(rsq23);
294 rinv31 = gmx_fjsp_invsqrt_v2r8(rsq31);
295 rinv32 = gmx_fjsp_invsqrt_v2r8(rsq32);
296 rinv33 = gmx_fjsp_invsqrt_v2r8(rsq33);
298 rinvsq00 = _fjsp_mul_v2r8(rinv00,rinv00);
299 rinvsq11 = _fjsp_mul_v2r8(rinv11,rinv11);
300 rinvsq12 = _fjsp_mul_v2r8(rinv12,rinv12);
301 rinvsq13 = _fjsp_mul_v2r8(rinv13,rinv13);
302 rinvsq21 = _fjsp_mul_v2r8(rinv21,rinv21);
303 rinvsq22 = _fjsp_mul_v2r8(rinv22,rinv22);
304 rinvsq23 = _fjsp_mul_v2r8(rinv23,rinv23);
305 rinvsq31 = _fjsp_mul_v2r8(rinv31,rinv31);
306 rinvsq32 = _fjsp_mul_v2r8(rinv32,rinv32);
307 rinvsq33 = _fjsp_mul_v2r8(rinv33,rinv33);
309 fjx0 = _fjsp_setzero_v2r8();
310 fjy0 = _fjsp_setzero_v2r8();
311 fjz0 = _fjsp_setzero_v2r8();
312 fjx1 = _fjsp_setzero_v2r8();
313 fjy1 = _fjsp_setzero_v2r8();
314 fjz1 = _fjsp_setzero_v2r8();
315 fjx2 = _fjsp_setzero_v2r8();
316 fjy2 = _fjsp_setzero_v2r8();
317 fjz2 = _fjsp_setzero_v2r8();
318 fjx3 = _fjsp_setzero_v2r8();
319 fjy3 = _fjsp_setzero_v2r8();
320 fjz3 = _fjsp_setzero_v2r8();
322 /**************************
323 * CALCULATE INTERACTIONS *
324 **************************/
326 if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
329 r00 = _fjsp_mul_v2r8(rsq00,rinv00);
331 /* LENNARD-JONES DISPERSION/REPULSION */
333 rinvsix = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
334 vvdw6 = _fjsp_mul_v2r8(c6_00,rinvsix);
335 vvdw12 = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
336 vvdw = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
337 fvdw = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
339 d = _fjsp_sub_v2r8(r00,rswitch);
340 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
341 d2 = _fjsp_mul_v2r8(d,d);
342 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
344 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
346 /* Evaluate switch function */
347 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
348 fvdw = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
349 vvdw = _fjsp_mul_v2r8(vvdw,sw);
350 cutoff_mask = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
352 /* Update potential sum for this i atom from the interaction with this j atom. */
353 vvdw = _fjsp_and_v2r8(vvdw,cutoff_mask);
354 vvdwsum = _fjsp_add_v2r8(vvdwsum,vvdw);
358 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
360 /* Update vectorial force */
361 fix0 = _fjsp_madd_v2r8(dx00,fscal,fix0);
362 fiy0 = _fjsp_madd_v2r8(dy00,fscal,fiy0);
363 fiz0 = _fjsp_madd_v2r8(dz00,fscal,fiz0);
365 fjx0 = _fjsp_madd_v2r8(dx00,fscal,fjx0);
366 fjy0 = _fjsp_madd_v2r8(dy00,fscal,fjy0);
367 fjz0 = _fjsp_madd_v2r8(dz00,fscal,fjz0);
371 /**************************
372 * CALCULATE INTERACTIONS *
373 **************************/
375 if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
378 r11 = _fjsp_mul_v2r8(rsq11,rinv11);
380 /* EWALD ELECTROSTATICS */
382 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
383 ewrt = _fjsp_mul_v2r8(r11,ewtabscale);
384 itab_tmp = _fjsp_dtox_v2r8(ewrt);
385 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
386 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
388 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
389 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
390 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
391 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
392 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
393 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
394 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
395 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
396 velec = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
397 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
399 d = _fjsp_sub_v2r8(r11,rswitch);
400 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
401 d2 = _fjsp_mul_v2r8(d,d);
402 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
404 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
406 /* Evaluate switch function */
407 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
408 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
409 velec = _fjsp_mul_v2r8(velec,sw);
410 cutoff_mask = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
412 /* Update potential sum for this i atom from the interaction with this j atom. */
413 velec = _fjsp_and_v2r8(velec,cutoff_mask);
414 velecsum = _fjsp_add_v2r8(velecsum,velec);
418 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
420 /* Update vectorial force */
421 fix1 = _fjsp_madd_v2r8(dx11,fscal,fix1);
422 fiy1 = _fjsp_madd_v2r8(dy11,fscal,fiy1);
423 fiz1 = _fjsp_madd_v2r8(dz11,fscal,fiz1);
425 fjx1 = _fjsp_madd_v2r8(dx11,fscal,fjx1);
426 fjy1 = _fjsp_madd_v2r8(dy11,fscal,fjy1);
427 fjz1 = _fjsp_madd_v2r8(dz11,fscal,fjz1);
431 /**************************
432 * CALCULATE INTERACTIONS *
433 **************************/
435 if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
438 r12 = _fjsp_mul_v2r8(rsq12,rinv12);
440 /* EWALD ELECTROSTATICS */
442 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
443 ewrt = _fjsp_mul_v2r8(r12,ewtabscale);
444 itab_tmp = _fjsp_dtox_v2r8(ewrt);
445 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
446 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
448 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
449 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
450 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
451 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
452 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
453 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
454 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
455 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
456 velec = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
457 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
459 d = _fjsp_sub_v2r8(r12,rswitch);
460 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
461 d2 = _fjsp_mul_v2r8(d,d);
462 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
464 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
466 /* Evaluate switch function */
467 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
468 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
469 velec = _fjsp_mul_v2r8(velec,sw);
470 cutoff_mask = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
472 /* Update potential sum for this i atom from the interaction with this j atom. */
473 velec = _fjsp_and_v2r8(velec,cutoff_mask);
474 velecsum = _fjsp_add_v2r8(velecsum,velec);
478 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
480 /* Update vectorial force */
481 fix1 = _fjsp_madd_v2r8(dx12,fscal,fix1);
482 fiy1 = _fjsp_madd_v2r8(dy12,fscal,fiy1);
483 fiz1 = _fjsp_madd_v2r8(dz12,fscal,fiz1);
485 fjx2 = _fjsp_madd_v2r8(dx12,fscal,fjx2);
486 fjy2 = _fjsp_madd_v2r8(dy12,fscal,fjy2);
487 fjz2 = _fjsp_madd_v2r8(dz12,fscal,fjz2);
491 /**************************
492 * CALCULATE INTERACTIONS *
493 **************************/
495 if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
498 r13 = _fjsp_mul_v2r8(rsq13,rinv13);
500 /* EWALD ELECTROSTATICS */
502 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
503 ewrt = _fjsp_mul_v2r8(r13,ewtabscale);
504 itab_tmp = _fjsp_dtox_v2r8(ewrt);
505 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
506 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
508 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
509 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
510 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
511 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
512 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
513 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
514 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
515 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
516 velec = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
517 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
519 d = _fjsp_sub_v2r8(r13,rswitch);
520 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
521 d2 = _fjsp_mul_v2r8(d,d);
522 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
524 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
526 /* Evaluate switch function */
527 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
528 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv13,_fjsp_mul_v2r8(velec,dsw)) );
529 velec = _fjsp_mul_v2r8(velec,sw);
530 cutoff_mask = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
532 /* Update potential sum for this i atom from the interaction with this j atom. */
533 velec = _fjsp_and_v2r8(velec,cutoff_mask);
534 velecsum = _fjsp_add_v2r8(velecsum,velec);
538 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
540 /* Update vectorial force */
541 fix1 = _fjsp_madd_v2r8(dx13,fscal,fix1);
542 fiy1 = _fjsp_madd_v2r8(dy13,fscal,fiy1);
543 fiz1 = _fjsp_madd_v2r8(dz13,fscal,fiz1);
545 fjx3 = _fjsp_madd_v2r8(dx13,fscal,fjx3);
546 fjy3 = _fjsp_madd_v2r8(dy13,fscal,fjy3);
547 fjz3 = _fjsp_madd_v2r8(dz13,fscal,fjz3);
551 /**************************
552 * CALCULATE INTERACTIONS *
553 **************************/
555 if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
558 r21 = _fjsp_mul_v2r8(rsq21,rinv21);
560 /* EWALD ELECTROSTATICS */
562 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
563 ewrt = _fjsp_mul_v2r8(r21,ewtabscale);
564 itab_tmp = _fjsp_dtox_v2r8(ewrt);
565 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
566 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
568 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
569 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
570 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
571 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
572 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
573 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
574 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
575 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
576 velec = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
577 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
579 d = _fjsp_sub_v2r8(r21,rswitch);
580 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
581 d2 = _fjsp_mul_v2r8(d,d);
582 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
584 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
586 /* Evaluate switch function */
587 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
588 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
589 velec = _fjsp_mul_v2r8(velec,sw);
590 cutoff_mask = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
592 /* Update potential sum for this i atom from the interaction with this j atom. */
593 velec = _fjsp_and_v2r8(velec,cutoff_mask);
594 velecsum = _fjsp_add_v2r8(velecsum,velec);
598 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
600 /* Update vectorial force */
601 fix2 = _fjsp_madd_v2r8(dx21,fscal,fix2);
602 fiy2 = _fjsp_madd_v2r8(dy21,fscal,fiy2);
603 fiz2 = _fjsp_madd_v2r8(dz21,fscal,fiz2);
605 fjx1 = _fjsp_madd_v2r8(dx21,fscal,fjx1);
606 fjy1 = _fjsp_madd_v2r8(dy21,fscal,fjy1);
607 fjz1 = _fjsp_madd_v2r8(dz21,fscal,fjz1);
611 /**************************
612 * CALCULATE INTERACTIONS *
613 **************************/
615 if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
618 r22 = _fjsp_mul_v2r8(rsq22,rinv22);
620 /* EWALD ELECTROSTATICS */
622 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
623 ewrt = _fjsp_mul_v2r8(r22,ewtabscale);
624 itab_tmp = _fjsp_dtox_v2r8(ewrt);
625 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
626 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
628 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
629 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
630 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
631 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
632 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
633 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
634 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
635 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
636 velec = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
637 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
639 d = _fjsp_sub_v2r8(r22,rswitch);
640 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
641 d2 = _fjsp_mul_v2r8(d,d);
642 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
644 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
646 /* Evaluate switch function */
647 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
648 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
649 velec = _fjsp_mul_v2r8(velec,sw);
650 cutoff_mask = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
652 /* Update potential sum for this i atom from the interaction with this j atom. */
653 velec = _fjsp_and_v2r8(velec,cutoff_mask);
654 velecsum = _fjsp_add_v2r8(velecsum,velec);
658 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
660 /* Update vectorial force */
661 fix2 = _fjsp_madd_v2r8(dx22,fscal,fix2);
662 fiy2 = _fjsp_madd_v2r8(dy22,fscal,fiy2);
663 fiz2 = _fjsp_madd_v2r8(dz22,fscal,fiz2);
665 fjx2 = _fjsp_madd_v2r8(dx22,fscal,fjx2);
666 fjy2 = _fjsp_madd_v2r8(dy22,fscal,fjy2);
667 fjz2 = _fjsp_madd_v2r8(dz22,fscal,fjz2);
671 /**************************
672 * CALCULATE INTERACTIONS *
673 **************************/
675 if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
678 r23 = _fjsp_mul_v2r8(rsq23,rinv23);
680 /* EWALD ELECTROSTATICS */
682 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
683 ewrt = _fjsp_mul_v2r8(r23,ewtabscale);
684 itab_tmp = _fjsp_dtox_v2r8(ewrt);
685 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
686 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
688 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
689 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
690 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
691 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
692 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
693 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
694 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
695 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
696 velec = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
697 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
699 d = _fjsp_sub_v2r8(r23,rswitch);
700 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
701 d2 = _fjsp_mul_v2r8(d,d);
702 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
704 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
706 /* Evaluate switch function */
707 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
708 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv23,_fjsp_mul_v2r8(velec,dsw)) );
709 velec = _fjsp_mul_v2r8(velec,sw);
710 cutoff_mask = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
712 /* Update potential sum for this i atom from the interaction with this j atom. */
713 velec = _fjsp_and_v2r8(velec,cutoff_mask);
714 velecsum = _fjsp_add_v2r8(velecsum,velec);
718 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
720 /* Update vectorial force */
721 fix2 = _fjsp_madd_v2r8(dx23,fscal,fix2);
722 fiy2 = _fjsp_madd_v2r8(dy23,fscal,fiy2);
723 fiz2 = _fjsp_madd_v2r8(dz23,fscal,fiz2);
725 fjx3 = _fjsp_madd_v2r8(dx23,fscal,fjx3);
726 fjy3 = _fjsp_madd_v2r8(dy23,fscal,fjy3);
727 fjz3 = _fjsp_madd_v2r8(dz23,fscal,fjz3);
731 /**************************
732 * CALCULATE INTERACTIONS *
733 **************************/
735 if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
738 r31 = _fjsp_mul_v2r8(rsq31,rinv31);
740 /* EWALD ELECTROSTATICS */
742 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
743 ewrt = _fjsp_mul_v2r8(r31,ewtabscale);
744 itab_tmp = _fjsp_dtox_v2r8(ewrt);
745 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
746 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
748 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
749 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
750 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
751 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
752 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
753 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
754 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
755 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
756 velec = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
757 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
759 d = _fjsp_sub_v2r8(r31,rswitch);
760 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
761 d2 = _fjsp_mul_v2r8(d,d);
762 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
764 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
766 /* Evaluate switch function */
767 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
768 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv31,_fjsp_mul_v2r8(velec,dsw)) );
769 velec = _fjsp_mul_v2r8(velec,sw);
770 cutoff_mask = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
772 /* Update potential sum for this i atom from the interaction with this j atom. */
773 velec = _fjsp_and_v2r8(velec,cutoff_mask);
774 velecsum = _fjsp_add_v2r8(velecsum,velec);
778 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
780 /* Update vectorial force */
781 fix3 = _fjsp_madd_v2r8(dx31,fscal,fix3);
782 fiy3 = _fjsp_madd_v2r8(dy31,fscal,fiy3);
783 fiz3 = _fjsp_madd_v2r8(dz31,fscal,fiz3);
785 fjx1 = _fjsp_madd_v2r8(dx31,fscal,fjx1);
786 fjy1 = _fjsp_madd_v2r8(dy31,fscal,fjy1);
787 fjz1 = _fjsp_madd_v2r8(dz31,fscal,fjz1);
791 /**************************
792 * CALCULATE INTERACTIONS *
793 **************************/
795 if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
798 r32 = _fjsp_mul_v2r8(rsq32,rinv32);
800 /* EWALD ELECTROSTATICS */
802 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
803 ewrt = _fjsp_mul_v2r8(r32,ewtabscale);
804 itab_tmp = _fjsp_dtox_v2r8(ewrt);
805 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
806 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
808 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
809 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
810 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
811 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
812 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
813 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
814 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
815 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
816 velec = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
817 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
819 d = _fjsp_sub_v2r8(r32,rswitch);
820 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
821 d2 = _fjsp_mul_v2r8(d,d);
822 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
824 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
826 /* Evaluate switch function */
827 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
828 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv32,_fjsp_mul_v2r8(velec,dsw)) );
829 velec = _fjsp_mul_v2r8(velec,sw);
830 cutoff_mask = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
832 /* Update potential sum for this i atom from the interaction with this j atom. */
833 velec = _fjsp_and_v2r8(velec,cutoff_mask);
834 velecsum = _fjsp_add_v2r8(velecsum,velec);
838 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
840 /* Update vectorial force */
841 fix3 = _fjsp_madd_v2r8(dx32,fscal,fix3);
842 fiy3 = _fjsp_madd_v2r8(dy32,fscal,fiy3);
843 fiz3 = _fjsp_madd_v2r8(dz32,fscal,fiz3);
845 fjx2 = _fjsp_madd_v2r8(dx32,fscal,fjx2);
846 fjy2 = _fjsp_madd_v2r8(dy32,fscal,fjy2);
847 fjz2 = _fjsp_madd_v2r8(dz32,fscal,fjz2);
851 /**************************
852 * CALCULATE INTERACTIONS *
853 **************************/
855 if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
858 r33 = _fjsp_mul_v2r8(rsq33,rinv33);
860 /* EWALD ELECTROSTATICS */
862 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
863 ewrt = _fjsp_mul_v2r8(r33,ewtabscale);
864 itab_tmp = _fjsp_dtox_v2r8(ewrt);
865 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
866 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
868 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
869 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
870 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
871 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
872 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
873 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
874 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
875 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
876 velec = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
877 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
879 d = _fjsp_sub_v2r8(r33,rswitch);
880 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
881 d2 = _fjsp_mul_v2r8(d,d);
882 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
884 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
886 /* Evaluate switch function */
887 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
888 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv33,_fjsp_mul_v2r8(velec,dsw)) );
889 velec = _fjsp_mul_v2r8(velec,sw);
890 cutoff_mask = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
892 /* Update potential sum for this i atom from the interaction with this j atom. */
893 velec = _fjsp_and_v2r8(velec,cutoff_mask);
894 velecsum = _fjsp_add_v2r8(velecsum,velec);
898 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
900 /* Update vectorial force */
901 fix3 = _fjsp_madd_v2r8(dx33,fscal,fix3);
902 fiy3 = _fjsp_madd_v2r8(dy33,fscal,fiy3);
903 fiz3 = _fjsp_madd_v2r8(dz33,fscal,fiz3);
905 fjx3 = _fjsp_madd_v2r8(dx33,fscal,fjx3);
906 fjy3 = _fjsp_madd_v2r8(dy33,fscal,fjy3);
907 fjz3 = _fjsp_madd_v2r8(dz33,fscal,fjz3);
911 gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
913 /* Inner loop uses 677 flops */
920 j_coord_offsetA = DIM*jnrA;
922 /* load j atom coordinates */
923 gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
924 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
925 &jy2,&jz2,&jx3,&jy3,&jz3);
927 /* Calculate displacement vector */
928 dx00 = _fjsp_sub_v2r8(ix0,jx0);
929 dy00 = _fjsp_sub_v2r8(iy0,jy0);
930 dz00 = _fjsp_sub_v2r8(iz0,jz0);
931 dx11 = _fjsp_sub_v2r8(ix1,jx1);
932 dy11 = _fjsp_sub_v2r8(iy1,jy1);
933 dz11 = _fjsp_sub_v2r8(iz1,jz1);
934 dx12 = _fjsp_sub_v2r8(ix1,jx2);
935 dy12 = _fjsp_sub_v2r8(iy1,jy2);
936 dz12 = _fjsp_sub_v2r8(iz1,jz2);
937 dx13 = _fjsp_sub_v2r8(ix1,jx3);
938 dy13 = _fjsp_sub_v2r8(iy1,jy3);
939 dz13 = _fjsp_sub_v2r8(iz1,jz3);
940 dx21 = _fjsp_sub_v2r8(ix2,jx1);
941 dy21 = _fjsp_sub_v2r8(iy2,jy1);
942 dz21 = _fjsp_sub_v2r8(iz2,jz1);
943 dx22 = _fjsp_sub_v2r8(ix2,jx2);
944 dy22 = _fjsp_sub_v2r8(iy2,jy2);
945 dz22 = _fjsp_sub_v2r8(iz2,jz2);
946 dx23 = _fjsp_sub_v2r8(ix2,jx3);
947 dy23 = _fjsp_sub_v2r8(iy2,jy3);
948 dz23 = _fjsp_sub_v2r8(iz2,jz3);
949 dx31 = _fjsp_sub_v2r8(ix3,jx1);
950 dy31 = _fjsp_sub_v2r8(iy3,jy1);
951 dz31 = _fjsp_sub_v2r8(iz3,jz1);
952 dx32 = _fjsp_sub_v2r8(ix3,jx2);
953 dy32 = _fjsp_sub_v2r8(iy3,jy2);
954 dz32 = _fjsp_sub_v2r8(iz3,jz2);
955 dx33 = _fjsp_sub_v2r8(ix3,jx3);
956 dy33 = _fjsp_sub_v2r8(iy3,jy3);
957 dz33 = _fjsp_sub_v2r8(iz3,jz3);
959 /* Calculate squared distance and things based on it */
960 rsq00 = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
961 rsq11 = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
962 rsq12 = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
963 rsq13 = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
964 rsq21 = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
965 rsq22 = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
966 rsq23 = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
967 rsq31 = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
968 rsq32 = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
969 rsq33 = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
971 rinv00 = gmx_fjsp_invsqrt_v2r8(rsq00);
972 rinv11 = gmx_fjsp_invsqrt_v2r8(rsq11);
973 rinv12 = gmx_fjsp_invsqrt_v2r8(rsq12);
974 rinv13 = gmx_fjsp_invsqrt_v2r8(rsq13);
975 rinv21 = gmx_fjsp_invsqrt_v2r8(rsq21);
976 rinv22 = gmx_fjsp_invsqrt_v2r8(rsq22);
977 rinv23 = gmx_fjsp_invsqrt_v2r8(rsq23);
978 rinv31 = gmx_fjsp_invsqrt_v2r8(rsq31);
979 rinv32 = gmx_fjsp_invsqrt_v2r8(rsq32);
980 rinv33 = gmx_fjsp_invsqrt_v2r8(rsq33);
982 rinvsq00 = _fjsp_mul_v2r8(rinv00,rinv00);
983 rinvsq11 = _fjsp_mul_v2r8(rinv11,rinv11);
984 rinvsq12 = _fjsp_mul_v2r8(rinv12,rinv12);
985 rinvsq13 = _fjsp_mul_v2r8(rinv13,rinv13);
986 rinvsq21 = _fjsp_mul_v2r8(rinv21,rinv21);
987 rinvsq22 = _fjsp_mul_v2r8(rinv22,rinv22);
988 rinvsq23 = _fjsp_mul_v2r8(rinv23,rinv23);
989 rinvsq31 = _fjsp_mul_v2r8(rinv31,rinv31);
990 rinvsq32 = _fjsp_mul_v2r8(rinv32,rinv32);
991 rinvsq33 = _fjsp_mul_v2r8(rinv33,rinv33);
993 fjx0 = _fjsp_setzero_v2r8();
994 fjy0 = _fjsp_setzero_v2r8();
995 fjz0 = _fjsp_setzero_v2r8();
996 fjx1 = _fjsp_setzero_v2r8();
997 fjy1 = _fjsp_setzero_v2r8();
998 fjz1 = _fjsp_setzero_v2r8();
999 fjx2 = _fjsp_setzero_v2r8();
1000 fjy2 = _fjsp_setzero_v2r8();
1001 fjz2 = _fjsp_setzero_v2r8();
1002 fjx3 = _fjsp_setzero_v2r8();
1003 fjy3 = _fjsp_setzero_v2r8();
1004 fjz3 = _fjsp_setzero_v2r8();
1006 /**************************
1007 * CALCULATE INTERACTIONS *
1008 **************************/
1010 if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
1013 r00 = _fjsp_mul_v2r8(rsq00,rinv00);
1015 /* LENNARD-JONES DISPERSION/REPULSION */
1017 rinvsix = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
1018 vvdw6 = _fjsp_mul_v2r8(c6_00,rinvsix);
1019 vvdw12 = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
1020 vvdw = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
1021 fvdw = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
1023 d = _fjsp_sub_v2r8(r00,rswitch);
1024 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1025 d2 = _fjsp_mul_v2r8(d,d);
1026 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1028 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1030 /* Evaluate switch function */
1031 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1032 fvdw = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
1033 vvdw = _fjsp_mul_v2r8(vvdw,sw);
1034 cutoff_mask = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
1036 /* Update potential sum for this i atom from the interaction with this j atom. */
1037 vvdw = _fjsp_and_v2r8(vvdw,cutoff_mask);
1038 vvdw = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
1039 vvdwsum = _fjsp_add_v2r8(vvdwsum,vvdw);
1043 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1045 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1047 /* Update vectorial force */
1048 fix0 = _fjsp_madd_v2r8(dx00,fscal,fix0);
1049 fiy0 = _fjsp_madd_v2r8(dy00,fscal,fiy0);
1050 fiz0 = _fjsp_madd_v2r8(dz00,fscal,fiz0);
1052 fjx0 = _fjsp_madd_v2r8(dx00,fscal,fjx0);
1053 fjy0 = _fjsp_madd_v2r8(dy00,fscal,fjy0);
1054 fjz0 = _fjsp_madd_v2r8(dz00,fscal,fjz0);
1058 /**************************
1059 * CALCULATE INTERACTIONS *
1060 **************************/
1062 if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
1065 r11 = _fjsp_mul_v2r8(rsq11,rinv11);
1067 /* EWALD ELECTROSTATICS */
1069 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1070 ewrt = _fjsp_mul_v2r8(r11,ewtabscale);
1071 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1072 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1073 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1075 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1076 ewtabD = _fjsp_setzero_v2r8();
1077 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1078 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1079 ewtabFn = _fjsp_setzero_v2r8();
1080 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1081 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1082 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1083 velec = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
1084 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
1086 d = _fjsp_sub_v2r8(r11,rswitch);
1087 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1088 d2 = _fjsp_mul_v2r8(d,d);
1089 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1091 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1093 /* Evaluate switch function */
1094 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1095 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
1096 velec = _fjsp_mul_v2r8(velec,sw);
1097 cutoff_mask = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
1099 /* Update potential sum for this i atom from the interaction with this j atom. */
1100 velec = _fjsp_and_v2r8(velec,cutoff_mask);
1101 velec = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1102 velecsum = _fjsp_add_v2r8(velecsum,velec);
1106 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1108 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1110 /* Update vectorial force */
1111 fix1 = _fjsp_madd_v2r8(dx11,fscal,fix1);
1112 fiy1 = _fjsp_madd_v2r8(dy11,fscal,fiy1);
1113 fiz1 = _fjsp_madd_v2r8(dz11,fscal,fiz1);
1115 fjx1 = _fjsp_madd_v2r8(dx11,fscal,fjx1);
1116 fjy1 = _fjsp_madd_v2r8(dy11,fscal,fjy1);
1117 fjz1 = _fjsp_madd_v2r8(dz11,fscal,fjz1);
1121 /**************************
1122 * CALCULATE INTERACTIONS *
1123 **************************/
1125 if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
1128 r12 = _fjsp_mul_v2r8(rsq12,rinv12);
1130 /* EWALD ELECTROSTATICS */
1132 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1133 ewrt = _fjsp_mul_v2r8(r12,ewtabscale);
1134 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1135 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1136 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1138 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1139 ewtabD = _fjsp_setzero_v2r8();
1140 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1141 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1142 ewtabFn = _fjsp_setzero_v2r8();
1143 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1144 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1145 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1146 velec = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
1147 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
1149 d = _fjsp_sub_v2r8(r12,rswitch);
1150 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1151 d2 = _fjsp_mul_v2r8(d,d);
1152 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1154 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1156 /* Evaluate switch function */
1157 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1158 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
1159 velec = _fjsp_mul_v2r8(velec,sw);
1160 cutoff_mask = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
1162 /* Update potential sum for this i atom from the interaction with this j atom. */
1163 velec = _fjsp_and_v2r8(velec,cutoff_mask);
1164 velec = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1165 velecsum = _fjsp_add_v2r8(velecsum,velec);
1169 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1171 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1173 /* Update vectorial force */
1174 fix1 = _fjsp_madd_v2r8(dx12,fscal,fix1);
1175 fiy1 = _fjsp_madd_v2r8(dy12,fscal,fiy1);
1176 fiz1 = _fjsp_madd_v2r8(dz12,fscal,fiz1);
1178 fjx2 = _fjsp_madd_v2r8(dx12,fscal,fjx2);
1179 fjy2 = _fjsp_madd_v2r8(dy12,fscal,fjy2);
1180 fjz2 = _fjsp_madd_v2r8(dz12,fscal,fjz2);
1184 /**************************
1185 * CALCULATE INTERACTIONS *
1186 **************************/
1188 if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
1191 r13 = _fjsp_mul_v2r8(rsq13,rinv13);
1193 /* EWALD ELECTROSTATICS */
1195 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1196 ewrt = _fjsp_mul_v2r8(r13,ewtabscale);
1197 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1198 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1199 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1201 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1202 ewtabD = _fjsp_setzero_v2r8();
1203 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1204 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1205 ewtabFn = _fjsp_setzero_v2r8();
1206 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1207 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1208 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1209 velec = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
1210 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
1212 d = _fjsp_sub_v2r8(r13,rswitch);
1213 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1214 d2 = _fjsp_mul_v2r8(d,d);
1215 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1217 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1219 /* Evaluate switch function */
1220 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1221 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv13,_fjsp_mul_v2r8(velec,dsw)) );
1222 velec = _fjsp_mul_v2r8(velec,sw);
1223 cutoff_mask = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
1225 /* Update potential sum for this i atom from the interaction with this j atom. */
1226 velec = _fjsp_and_v2r8(velec,cutoff_mask);
1227 velec = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1228 velecsum = _fjsp_add_v2r8(velecsum,velec);
1232 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1234 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1236 /* Update vectorial force */
1237 fix1 = _fjsp_madd_v2r8(dx13,fscal,fix1);
1238 fiy1 = _fjsp_madd_v2r8(dy13,fscal,fiy1);
1239 fiz1 = _fjsp_madd_v2r8(dz13,fscal,fiz1);
1241 fjx3 = _fjsp_madd_v2r8(dx13,fscal,fjx3);
1242 fjy3 = _fjsp_madd_v2r8(dy13,fscal,fjy3);
1243 fjz3 = _fjsp_madd_v2r8(dz13,fscal,fjz3);
1247 /**************************
1248 * CALCULATE INTERACTIONS *
1249 **************************/
1251 if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
1254 r21 = _fjsp_mul_v2r8(rsq21,rinv21);
1256 /* EWALD ELECTROSTATICS */
1258 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1259 ewrt = _fjsp_mul_v2r8(r21,ewtabscale);
1260 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1261 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1262 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1264 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1265 ewtabD = _fjsp_setzero_v2r8();
1266 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1267 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1268 ewtabFn = _fjsp_setzero_v2r8();
1269 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1270 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1271 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1272 velec = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
1273 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
1275 d = _fjsp_sub_v2r8(r21,rswitch);
1276 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1277 d2 = _fjsp_mul_v2r8(d,d);
1278 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1280 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1282 /* Evaluate switch function */
1283 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1284 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
1285 velec = _fjsp_mul_v2r8(velec,sw);
1286 cutoff_mask = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
1288 /* Update potential sum for this i atom from the interaction with this j atom. */
1289 velec = _fjsp_and_v2r8(velec,cutoff_mask);
1290 velec = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1291 velecsum = _fjsp_add_v2r8(velecsum,velec);
1295 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1297 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1299 /* Update vectorial force */
1300 fix2 = _fjsp_madd_v2r8(dx21,fscal,fix2);
1301 fiy2 = _fjsp_madd_v2r8(dy21,fscal,fiy2);
1302 fiz2 = _fjsp_madd_v2r8(dz21,fscal,fiz2);
1304 fjx1 = _fjsp_madd_v2r8(dx21,fscal,fjx1);
1305 fjy1 = _fjsp_madd_v2r8(dy21,fscal,fjy1);
1306 fjz1 = _fjsp_madd_v2r8(dz21,fscal,fjz1);
1310 /**************************
1311 * CALCULATE INTERACTIONS *
1312 **************************/
1314 if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
1317 r22 = _fjsp_mul_v2r8(rsq22,rinv22);
1319 /* EWALD ELECTROSTATICS */
1321 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1322 ewrt = _fjsp_mul_v2r8(r22,ewtabscale);
1323 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1324 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1325 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1327 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1328 ewtabD = _fjsp_setzero_v2r8();
1329 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1330 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1331 ewtabFn = _fjsp_setzero_v2r8();
1332 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1333 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1334 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1335 velec = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
1336 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
1338 d = _fjsp_sub_v2r8(r22,rswitch);
1339 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1340 d2 = _fjsp_mul_v2r8(d,d);
1341 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1343 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1345 /* Evaluate switch function */
1346 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1347 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
1348 velec = _fjsp_mul_v2r8(velec,sw);
1349 cutoff_mask = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
1351 /* Update potential sum for this i atom from the interaction with this j atom. */
1352 velec = _fjsp_and_v2r8(velec,cutoff_mask);
1353 velec = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1354 velecsum = _fjsp_add_v2r8(velecsum,velec);
1358 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1360 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1362 /* Update vectorial force */
1363 fix2 = _fjsp_madd_v2r8(dx22,fscal,fix2);
1364 fiy2 = _fjsp_madd_v2r8(dy22,fscal,fiy2);
1365 fiz2 = _fjsp_madd_v2r8(dz22,fscal,fiz2);
1367 fjx2 = _fjsp_madd_v2r8(dx22,fscal,fjx2);
1368 fjy2 = _fjsp_madd_v2r8(dy22,fscal,fjy2);
1369 fjz2 = _fjsp_madd_v2r8(dz22,fscal,fjz2);
1373 /**************************
1374 * CALCULATE INTERACTIONS *
1375 **************************/
1377 if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
1380 r23 = _fjsp_mul_v2r8(rsq23,rinv23);
1382 /* EWALD ELECTROSTATICS */
1384 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1385 ewrt = _fjsp_mul_v2r8(r23,ewtabscale);
1386 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1387 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1388 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1390 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1391 ewtabD = _fjsp_setzero_v2r8();
1392 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1393 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1394 ewtabFn = _fjsp_setzero_v2r8();
1395 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1396 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1397 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1398 velec = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
1399 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
1401 d = _fjsp_sub_v2r8(r23,rswitch);
1402 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1403 d2 = _fjsp_mul_v2r8(d,d);
1404 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1406 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1408 /* Evaluate switch function */
1409 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1410 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv23,_fjsp_mul_v2r8(velec,dsw)) );
1411 velec = _fjsp_mul_v2r8(velec,sw);
1412 cutoff_mask = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
1414 /* Update potential sum for this i atom from the interaction with this j atom. */
1415 velec = _fjsp_and_v2r8(velec,cutoff_mask);
1416 velec = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1417 velecsum = _fjsp_add_v2r8(velecsum,velec);
1421 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1423 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1425 /* Update vectorial force */
1426 fix2 = _fjsp_madd_v2r8(dx23,fscal,fix2);
1427 fiy2 = _fjsp_madd_v2r8(dy23,fscal,fiy2);
1428 fiz2 = _fjsp_madd_v2r8(dz23,fscal,fiz2);
1430 fjx3 = _fjsp_madd_v2r8(dx23,fscal,fjx3);
1431 fjy3 = _fjsp_madd_v2r8(dy23,fscal,fjy3);
1432 fjz3 = _fjsp_madd_v2r8(dz23,fscal,fjz3);
1436 /**************************
1437 * CALCULATE INTERACTIONS *
1438 **************************/
1440 if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
1443 r31 = _fjsp_mul_v2r8(rsq31,rinv31);
1445 /* EWALD ELECTROSTATICS */
1447 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1448 ewrt = _fjsp_mul_v2r8(r31,ewtabscale);
1449 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1450 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1451 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1453 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1454 ewtabD = _fjsp_setzero_v2r8();
1455 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1456 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1457 ewtabFn = _fjsp_setzero_v2r8();
1458 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1459 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1460 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1461 velec = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
1462 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
1464 d = _fjsp_sub_v2r8(r31,rswitch);
1465 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1466 d2 = _fjsp_mul_v2r8(d,d);
1467 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1469 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1471 /* Evaluate switch function */
1472 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1473 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv31,_fjsp_mul_v2r8(velec,dsw)) );
1474 velec = _fjsp_mul_v2r8(velec,sw);
1475 cutoff_mask = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
1477 /* Update potential sum for this i atom from the interaction with this j atom. */
1478 velec = _fjsp_and_v2r8(velec,cutoff_mask);
1479 velec = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1480 velecsum = _fjsp_add_v2r8(velecsum,velec);
1484 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1486 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1488 /* Update vectorial force */
1489 fix3 = _fjsp_madd_v2r8(dx31,fscal,fix3);
1490 fiy3 = _fjsp_madd_v2r8(dy31,fscal,fiy3);
1491 fiz3 = _fjsp_madd_v2r8(dz31,fscal,fiz3);
1493 fjx1 = _fjsp_madd_v2r8(dx31,fscal,fjx1);
1494 fjy1 = _fjsp_madd_v2r8(dy31,fscal,fjy1);
1495 fjz1 = _fjsp_madd_v2r8(dz31,fscal,fjz1);
1499 /**************************
1500 * CALCULATE INTERACTIONS *
1501 **************************/
1503 if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
1506 r32 = _fjsp_mul_v2r8(rsq32,rinv32);
1508 /* EWALD ELECTROSTATICS */
1510 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1511 ewrt = _fjsp_mul_v2r8(r32,ewtabscale);
1512 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1513 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1514 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1516 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1517 ewtabD = _fjsp_setzero_v2r8();
1518 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1519 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1520 ewtabFn = _fjsp_setzero_v2r8();
1521 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1522 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1523 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1524 velec = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
1525 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
1527 d = _fjsp_sub_v2r8(r32,rswitch);
1528 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1529 d2 = _fjsp_mul_v2r8(d,d);
1530 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1532 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1534 /* Evaluate switch function */
1535 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1536 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv32,_fjsp_mul_v2r8(velec,dsw)) );
1537 velec = _fjsp_mul_v2r8(velec,sw);
1538 cutoff_mask = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
1540 /* Update potential sum for this i atom from the interaction with this j atom. */
1541 velec = _fjsp_and_v2r8(velec,cutoff_mask);
1542 velec = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1543 velecsum = _fjsp_add_v2r8(velecsum,velec);
1547 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1549 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1551 /* Update vectorial force */
1552 fix3 = _fjsp_madd_v2r8(dx32,fscal,fix3);
1553 fiy3 = _fjsp_madd_v2r8(dy32,fscal,fiy3);
1554 fiz3 = _fjsp_madd_v2r8(dz32,fscal,fiz3);
1556 fjx2 = _fjsp_madd_v2r8(dx32,fscal,fjx2);
1557 fjy2 = _fjsp_madd_v2r8(dy32,fscal,fjy2);
1558 fjz2 = _fjsp_madd_v2r8(dz32,fscal,fjz2);
1562 /**************************
1563 * CALCULATE INTERACTIONS *
1564 **************************/
1566 if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
1569 r33 = _fjsp_mul_v2r8(rsq33,rinv33);
1571 /* EWALD ELECTROSTATICS */
1573 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1574 ewrt = _fjsp_mul_v2r8(r33,ewtabscale);
1575 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1576 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1577 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1579 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1580 ewtabD = _fjsp_setzero_v2r8();
1581 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1582 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1583 ewtabFn = _fjsp_setzero_v2r8();
1584 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1585 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1586 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1587 velec = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
1588 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
1590 d = _fjsp_sub_v2r8(r33,rswitch);
1591 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1592 d2 = _fjsp_mul_v2r8(d,d);
1593 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1595 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1597 /* Evaluate switch function */
1598 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1599 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv33,_fjsp_mul_v2r8(velec,dsw)) );
1600 velec = _fjsp_mul_v2r8(velec,sw);
1601 cutoff_mask = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
1603 /* Update potential sum for this i atom from the interaction with this j atom. */
1604 velec = _fjsp_and_v2r8(velec,cutoff_mask);
1605 velec = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1606 velecsum = _fjsp_add_v2r8(velecsum,velec);
1610 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1612 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1614 /* Update vectorial force */
1615 fix3 = _fjsp_madd_v2r8(dx33,fscal,fix3);
1616 fiy3 = _fjsp_madd_v2r8(dy33,fscal,fiy3);
1617 fiz3 = _fjsp_madd_v2r8(dz33,fscal,fiz3);
1619 fjx3 = _fjsp_madd_v2r8(dx33,fscal,fjx3);
1620 fjy3 = _fjsp_madd_v2r8(dy33,fscal,fjy3);
1621 fjz3 = _fjsp_madd_v2r8(dz33,fscal,fjz3);
1625 gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1627 /* Inner loop uses 677 flops */
1630 /* End of innermost loop */
1632 gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1633 f+i_coord_offset,fshift+i_shift_offset);
1636 /* Update potential energies */
1637 gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
1638 gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
1640 /* Increment number of inner iterations */
1641 inneriter += j_index_end - j_index_start;
1643 /* Outer loop uses 26 flops */
1646 /* Increment number of outer iterations */
1649 /* Update outer/inner flops */
1651 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*677);
1654 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sparc64_hpc_ace_double
1655 * Electrostatics interaction: Ewald
1656 * VdW interaction: LennardJones
1657 * Geometry: Water4-Water4
1658 * Calculate force/pot: Force
1661 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sparc64_hpc_ace_double
1662 (t_nblist * gmx_restrict nlist,
1663 rvec * gmx_restrict xx,
1664 rvec * gmx_restrict ff,
1665 t_forcerec * gmx_restrict fr,
1666 t_mdatoms * gmx_restrict mdatoms,
1667 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1668 t_nrnb * gmx_restrict nrnb)
1670 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1671 * just 0 for non-waters.
1672 * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
1673 * jnr indices corresponding to data put in the two positions in the SIMD register.
1675 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1676 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1678 int j_coord_offsetA,j_coord_offsetB;
1679 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1680 real rcutoff_scalar;
1681 real *shiftvec,*fshift,*x,*f;
1682 _fjsp_v2r8 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1684 _fjsp_v2r8 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1686 _fjsp_v2r8 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1688 _fjsp_v2r8 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1690 _fjsp_v2r8 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1691 int vdwjidx0A,vdwjidx0B;
1692 _fjsp_v2r8 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1693 int vdwjidx1A,vdwjidx1B;
1694 _fjsp_v2r8 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1695 int vdwjidx2A,vdwjidx2B;
1696 _fjsp_v2r8 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1697 int vdwjidx3A,vdwjidx3B;
1698 _fjsp_v2r8 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1699 _fjsp_v2r8 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1700 _fjsp_v2r8 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1701 _fjsp_v2r8 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1702 _fjsp_v2r8 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1703 _fjsp_v2r8 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1704 _fjsp_v2r8 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1705 _fjsp_v2r8 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1706 _fjsp_v2r8 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1707 _fjsp_v2r8 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1708 _fjsp_v2r8 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1709 _fjsp_v2r8 velec,felec,velecsum,facel,crf,krf,krf2;
1712 _fjsp_v2r8 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1715 _fjsp_v2r8 one_sixth = gmx_fjsp_set1_v2r8(1.0/6.0);
1716 _fjsp_v2r8 one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
1717 _fjsp_v2r8 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1719 _fjsp_v2r8 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1720 real rswitch_scalar,d_scalar;
1721 _fjsp_v2r8 itab_tmp;
1722 _fjsp_v2r8 dummy_mask,cutoff_mask;
1723 _fjsp_v2r8 one = gmx_fjsp_set1_v2r8(1.0);
1724 _fjsp_v2r8 two = gmx_fjsp_set1_v2r8(2.0);
1725 union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
1732 jindex = nlist->jindex;
1734 shiftidx = nlist->shift;
1736 shiftvec = fr->shift_vec[0];
1737 fshift = fr->fshift[0];
1738 facel = gmx_fjsp_set1_v2r8(fr->epsfac);
1739 charge = mdatoms->chargeA;
1740 nvdwtype = fr->ntype;
1741 vdwparam = fr->nbfp;
1742 vdwtype = mdatoms->typeA;
1744 sh_ewald = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
1745 ewtab = fr->ic->tabq_coul_FDV0;
1746 ewtabscale = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
1747 ewtabhalfspace = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
1749 /* Setup water-specific parameters */
1750 inr = nlist->iinr[0];
1751 iq1 = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
1752 iq2 = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
1753 iq3 = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
1754 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1756 jq1 = gmx_fjsp_set1_v2r8(charge[inr+1]);
1757 jq2 = gmx_fjsp_set1_v2r8(charge[inr+2]);
1758 jq3 = gmx_fjsp_set1_v2r8(charge[inr+3]);
1759 vdwjidx0A = 2*vdwtype[inr+0];
1760 c6_00 = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
1761 c12_00 = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
1762 qq11 = _fjsp_mul_v2r8(iq1,jq1);
1763 qq12 = _fjsp_mul_v2r8(iq1,jq2);
1764 qq13 = _fjsp_mul_v2r8(iq1,jq3);
1765 qq21 = _fjsp_mul_v2r8(iq2,jq1);
1766 qq22 = _fjsp_mul_v2r8(iq2,jq2);
1767 qq23 = _fjsp_mul_v2r8(iq2,jq3);
1768 qq31 = _fjsp_mul_v2r8(iq3,jq1);
1769 qq32 = _fjsp_mul_v2r8(iq3,jq2);
1770 qq33 = _fjsp_mul_v2r8(iq3,jq3);
1772 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1773 rcutoff_scalar = fr->rcoulomb;
1774 rcutoff = gmx_fjsp_set1_v2r8(rcutoff_scalar);
1775 rcutoff2 = _fjsp_mul_v2r8(rcutoff,rcutoff);
1777 rswitch_scalar = fr->rcoulomb_switch;
1778 rswitch = gmx_fjsp_set1_v2r8(rswitch_scalar);
1779 /* Setup switch parameters */
1780 d_scalar = rcutoff_scalar-rswitch_scalar;
1781 d = gmx_fjsp_set1_v2r8(d_scalar);
1782 swV3 = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
1783 swV4 = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1784 swV5 = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1785 swF2 = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
1786 swF3 = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1787 swF4 = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1789 /* Avoid stupid compiler warnings */
1791 j_coord_offsetA = 0;
1792 j_coord_offsetB = 0;
1797 /* Start outer loop over neighborlists */
1798 for(iidx=0; iidx<nri; iidx++)
1800 /* Load shift vector for this list */
1801 i_shift_offset = DIM*shiftidx[iidx];
1803 /* Load limits for loop over neighbors */
1804 j_index_start = jindex[iidx];
1805 j_index_end = jindex[iidx+1];
1807 /* Get outer coordinate index */
1809 i_coord_offset = DIM*inr;
1811 /* Load i particle coords and add shift vector */
1812 gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
1813 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1815 fix0 = _fjsp_setzero_v2r8();
1816 fiy0 = _fjsp_setzero_v2r8();
1817 fiz0 = _fjsp_setzero_v2r8();
1818 fix1 = _fjsp_setzero_v2r8();
1819 fiy1 = _fjsp_setzero_v2r8();
1820 fiz1 = _fjsp_setzero_v2r8();
1821 fix2 = _fjsp_setzero_v2r8();
1822 fiy2 = _fjsp_setzero_v2r8();
1823 fiz2 = _fjsp_setzero_v2r8();
1824 fix3 = _fjsp_setzero_v2r8();
1825 fiy3 = _fjsp_setzero_v2r8();
1826 fiz3 = _fjsp_setzero_v2r8();
1828 /* Start inner kernel loop */
1829 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1832 /* Get j neighbor index, and coordinate index */
1834 jnrB = jjnr[jidx+1];
1835 j_coord_offsetA = DIM*jnrA;
1836 j_coord_offsetB = DIM*jnrB;
1838 /* load j atom coordinates */
1839 gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
1840 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1841 &jy2,&jz2,&jx3,&jy3,&jz3);
1843 /* Calculate displacement vector */
1844 dx00 = _fjsp_sub_v2r8(ix0,jx0);
1845 dy00 = _fjsp_sub_v2r8(iy0,jy0);
1846 dz00 = _fjsp_sub_v2r8(iz0,jz0);
1847 dx11 = _fjsp_sub_v2r8(ix1,jx1);
1848 dy11 = _fjsp_sub_v2r8(iy1,jy1);
1849 dz11 = _fjsp_sub_v2r8(iz1,jz1);
1850 dx12 = _fjsp_sub_v2r8(ix1,jx2);
1851 dy12 = _fjsp_sub_v2r8(iy1,jy2);
1852 dz12 = _fjsp_sub_v2r8(iz1,jz2);
1853 dx13 = _fjsp_sub_v2r8(ix1,jx3);
1854 dy13 = _fjsp_sub_v2r8(iy1,jy3);
1855 dz13 = _fjsp_sub_v2r8(iz1,jz3);
1856 dx21 = _fjsp_sub_v2r8(ix2,jx1);
1857 dy21 = _fjsp_sub_v2r8(iy2,jy1);
1858 dz21 = _fjsp_sub_v2r8(iz2,jz1);
1859 dx22 = _fjsp_sub_v2r8(ix2,jx2);
1860 dy22 = _fjsp_sub_v2r8(iy2,jy2);
1861 dz22 = _fjsp_sub_v2r8(iz2,jz2);
1862 dx23 = _fjsp_sub_v2r8(ix2,jx3);
1863 dy23 = _fjsp_sub_v2r8(iy2,jy3);
1864 dz23 = _fjsp_sub_v2r8(iz2,jz3);
1865 dx31 = _fjsp_sub_v2r8(ix3,jx1);
1866 dy31 = _fjsp_sub_v2r8(iy3,jy1);
1867 dz31 = _fjsp_sub_v2r8(iz3,jz1);
1868 dx32 = _fjsp_sub_v2r8(ix3,jx2);
1869 dy32 = _fjsp_sub_v2r8(iy3,jy2);
1870 dz32 = _fjsp_sub_v2r8(iz3,jz2);
1871 dx33 = _fjsp_sub_v2r8(ix3,jx3);
1872 dy33 = _fjsp_sub_v2r8(iy3,jy3);
1873 dz33 = _fjsp_sub_v2r8(iz3,jz3);
1875 /* Calculate squared distance and things based on it */
1876 rsq00 = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
1877 rsq11 = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
1878 rsq12 = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
1879 rsq13 = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
1880 rsq21 = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
1881 rsq22 = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
1882 rsq23 = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
1883 rsq31 = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
1884 rsq32 = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
1885 rsq33 = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
1887 rinv00 = gmx_fjsp_invsqrt_v2r8(rsq00);
1888 rinv11 = gmx_fjsp_invsqrt_v2r8(rsq11);
1889 rinv12 = gmx_fjsp_invsqrt_v2r8(rsq12);
1890 rinv13 = gmx_fjsp_invsqrt_v2r8(rsq13);
1891 rinv21 = gmx_fjsp_invsqrt_v2r8(rsq21);
1892 rinv22 = gmx_fjsp_invsqrt_v2r8(rsq22);
1893 rinv23 = gmx_fjsp_invsqrt_v2r8(rsq23);
1894 rinv31 = gmx_fjsp_invsqrt_v2r8(rsq31);
1895 rinv32 = gmx_fjsp_invsqrt_v2r8(rsq32);
1896 rinv33 = gmx_fjsp_invsqrt_v2r8(rsq33);
1898 rinvsq00 = _fjsp_mul_v2r8(rinv00,rinv00);
1899 rinvsq11 = _fjsp_mul_v2r8(rinv11,rinv11);
1900 rinvsq12 = _fjsp_mul_v2r8(rinv12,rinv12);
1901 rinvsq13 = _fjsp_mul_v2r8(rinv13,rinv13);
1902 rinvsq21 = _fjsp_mul_v2r8(rinv21,rinv21);
1903 rinvsq22 = _fjsp_mul_v2r8(rinv22,rinv22);
1904 rinvsq23 = _fjsp_mul_v2r8(rinv23,rinv23);
1905 rinvsq31 = _fjsp_mul_v2r8(rinv31,rinv31);
1906 rinvsq32 = _fjsp_mul_v2r8(rinv32,rinv32);
1907 rinvsq33 = _fjsp_mul_v2r8(rinv33,rinv33);
1909 fjx0 = _fjsp_setzero_v2r8();
1910 fjy0 = _fjsp_setzero_v2r8();
1911 fjz0 = _fjsp_setzero_v2r8();
1912 fjx1 = _fjsp_setzero_v2r8();
1913 fjy1 = _fjsp_setzero_v2r8();
1914 fjz1 = _fjsp_setzero_v2r8();
1915 fjx2 = _fjsp_setzero_v2r8();
1916 fjy2 = _fjsp_setzero_v2r8();
1917 fjz2 = _fjsp_setzero_v2r8();
1918 fjx3 = _fjsp_setzero_v2r8();
1919 fjy3 = _fjsp_setzero_v2r8();
1920 fjz3 = _fjsp_setzero_v2r8();
1922 /**************************
1923 * CALCULATE INTERACTIONS *
1924 **************************/
1926 if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
1929 r00 = _fjsp_mul_v2r8(rsq00,rinv00);
1931 /* LENNARD-JONES DISPERSION/REPULSION */
1933 rinvsix = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
1934 vvdw6 = _fjsp_mul_v2r8(c6_00,rinvsix);
1935 vvdw12 = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
1936 vvdw = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
1937 fvdw = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
1939 d = _fjsp_sub_v2r8(r00,rswitch);
1940 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1941 d2 = _fjsp_mul_v2r8(d,d);
1942 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1944 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1946 /* Evaluate switch function */
1947 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1948 fvdw = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
1949 cutoff_mask = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
1953 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1955 /* Update vectorial force */
1956 fix0 = _fjsp_madd_v2r8(dx00,fscal,fix0);
1957 fiy0 = _fjsp_madd_v2r8(dy00,fscal,fiy0);
1958 fiz0 = _fjsp_madd_v2r8(dz00,fscal,fiz0);
1960 fjx0 = _fjsp_madd_v2r8(dx00,fscal,fjx0);
1961 fjy0 = _fjsp_madd_v2r8(dy00,fscal,fjy0);
1962 fjz0 = _fjsp_madd_v2r8(dz00,fscal,fjz0);
1966 /**************************
1967 * CALCULATE INTERACTIONS *
1968 **************************/
1970 if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
1973 r11 = _fjsp_mul_v2r8(rsq11,rinv11);
1975 /* EWALD ELECTROSTATICS */
1977 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1978 ewrt = _fjsp_mul_v2r8(r11,ewtabscale);
1979 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1980 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1981 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1983 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1984 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
1985 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1986 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1987 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
1988 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1989 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1990 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1991 velec = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
1992 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
1994 d = _fjsp_sub_v2r8(r11,rswitch);
1995 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1996 d2 = _fjsp_mul_v2r8(d,d);
1997 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1999 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2001 /* Evaluate switch function */
2002 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2003 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
2004 cutoff_mask = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
2008 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2010 /* Update vectorial force */
2011 fix1 = _fjsp_madd_v2r8(dx11,fscal,fix1);
2012 fiy1 = _fjsp_madd_v2r8(dy11,fscal,fiy1);
2013 fiz1 = _fjsp_madd_v2r8(dz11,fscal,fiz1);
2015 fjx1 = _fjsp_madd_v2r8(dx11,fscal,fjx1);
2016 fjy1 = _fjsp_madd_v2r8(dy11,fscal,fjy1);
2017 fjz1 = _fjsp_madd_v2r8(dz11,fscal,fjz1);
2021 /**************************
2022 * CALCULATE INTERACTIONS *
2023 **************************/
2025 if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
2028 r12 = _fjsp_mul_v2r8(rsq12,rinv12);
2030 /* EWALD ELECTROSTATICS */
2032 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2033 ewrt = _fjsp_mul_v2r8(r12,ewtabscale);
2034 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2035 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2036 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2038 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2039 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
2040 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2041 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2042 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
2043 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2044 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2045 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2046 velec = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
2047 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
2049 d = _fjsp_sub_v2r8(r12,rswitch);
2050 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2051 d2 = _fjsp_mul_v2r8(d,d);
2052 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2054 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2056 /* Evaluate switch function */
2057 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2058 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
2059 cutoff_mask = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
2063 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2065 /* Update vectorial force */
2066 fix1 = _fjsp_madd_v2r8(dx12,fscal,fix1);
2067 fiy1 = _fjsp_madd_v2r8(dy12,fscal,fiy1);
2068 fiz1 = _fjsp_madd_v2r8(dz12,fscal,fiz1);
2070 fjx2 = _fjsp_madd_v2r8(dx12,fscal,fjx2);
2071 fjy2 = _fjsp_madd_v2r8(dy12,fscal,fjy2);
2072 fjz2 = _fjsp_madd_v2r8(dz12,fscal,fjz2);
2076 /**************************
2077 * CALCULATE INTERACTIONS *
2078 **************************/
2080 if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
2083 r13 = _fjsp_mul_v2r8(rsq13,rinv13);
2085 /* EWALD ELECTROSTATICS */
2087 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2088 ewrt = _fjsp_mul_v2r8(r13,ewtabscale);
2089 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2090 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2091 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2093 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2094 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
2095 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2096 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2097 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
2098 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2099 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2100 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2101 velec = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
2102 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
2104 d = _fjsp_sub_v2r8(r13,rswitch);
2105 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2106 d2 = _fjsp_mul_v2r8(d,d);
2107 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2109 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2111 /* Evaluate switch function */
2112 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2113 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv13,_fjsp_mul_v2r8(velec,dsw)) );
2114 cutoff_mask = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
2118 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2120 /* Update vectorial force */
2121 fix1 = _fjsp_madd_v2r8(dx13,fscal,fix1);
2122 fiy1 = _fjsp_madd_v2r8(dy13,fscal,fiy1);
2123 fiz1 = _fjsp_madd_v2r8(dz13,fscal,fiz1);
2125 fjx3 = _fjsp_madd_v2r8(dx13,fscal,fjx3);
2126 fjy3 = _fjsp_madd_v2r8(dy13,fscal,fjy3);
2127 fjz3 = _fjsp_madd_v2r8(dz13,fscal,fjz3);
2131 /**************************
2132 * CALCULATE INTERACTIONS *
2133 **************************/
2135 if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
2138 r21 = _fjsp_mul_v2r8(rsq21,rinv21);
2140 /* EWALD ELECTROSTATICS */
2142 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2143 ewrt = _fjsp_mul_v2r8(r21,ewtabscale);
2144 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2145 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2146 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2148 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2149 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
2150 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2151 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2152 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
2153 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2154 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2155 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2156 velec = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
2157 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
2159 d = _fjsp_sub_v2r8(r21,rswitch);
2160 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2161 d2 = _fjsp_mul_v2r8(d,d);
2162 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2164 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2166 /* Evaluate switch function */
2167 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2168 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
2169 cutoff_mask = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
2173 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2175 /* Update vectorial force */
2176 fix2 = _fjsp_madd_v2r8(dx21,fscal,fix2);
2177 fiy2 = _fjsp_madd_v2r8(dy21,fscal,fiy2);
2178 fiz2 = _fjsp_madd_v2r8(dz21,fscal,fiz2);
2180 fjx1 = _fjsp_madd_v2r8(dx21,fscal,fjx1);
2181 fjy1 = _fjsp_madd_v2r8(dy21,fscal,fjy1);
2182 fjz1 = _fjsp_madd_v2r8(dz21,fscal,fjz1);
2186 /**************************
2187 * CALCULATE INTERACTIONS *
2188 **************************/
2190 if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
2193 r22 = _fjsp_mul_v2r8(rsq22,rinv22);
2195 /* EWALD ELECTROSTATICS */
2197 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2198 ewrt = _fjsp_mul_v2r8(r22,ewtabscale);
2199 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2200 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2201 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2203 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2204 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
2205 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2206 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2207 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
2208 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2209 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2210 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2211 velec = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
2212 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
2214 d = _fjsp_sub_v2r8(r22,rswitch);
2215 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2216 d2 = _fjsp_mul_v2r8(d,d);
2217 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2219 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2221 /* Evaluate switch function */
2222 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2223 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
2224 cutoff_mask = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
2228 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2230 /* Update vectorial force */
2231 fix2 = _fjsp_madd_v2r8(dx22,fscal,fix2);
2232 fiy2 = _fjsp_madd_v2r8(dy22,fscal,fiy2);
2233 fiz2 = _fjsp_madd_v2r8(dz22,fscal,fiz2);
2235 fjx2 = _fjsp_madd_v2r8(dx22,fscal,fjx2);
2236 fjy2 = _fjsp_madd_v2r8(dy22,fscal,fjy2);
2237 fjz2 = _fjsp_madd_v2r8(dz22,fscal,fjz2);
2241 /**************************
2242 * CALCULATE INTERACTIONS *
2243 **************************/
2245 if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
2248 r23 = _fjsp_mul_v2r8(rsq23,rinv23);
2250 /* EWALD ELECTROSTATICS */
2252 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2253 ewrt = _fjsp_mul_v2r8(r23,ewtabscale);
2254 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2255 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2256 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2258 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2259 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
2260 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2261 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2262 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
2263 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2264 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2265 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2266 velec = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
2267 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
2269 d = _fjsp_sub_v2r8(r23,rswitch);
2270 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2271 d2 = _fjsp_mul_v2r8(d,d);
2272 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2274 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2276 /* Evaluate switch function */
2277 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2278 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv23,_fjsp_mul_v2r8(velec,dsw)) );
2279 cutoff_mask = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
2283 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2285 /* Update vectorial force */
2286 fix2 = _fjsp_madd_v2r8(dx23,fscal,fix2);
2287 fiy2 = _fjsp_madd_v2r8(dy23,fscal,fiy2);
2288 fiz2 = _fjsp_madd_v2r8(dz23,fscal,fiz2);
2290 fjx3 = _fjsp_madd_v2r8(dx23,fscal,fjx3);
2291 fjy3 = _fjsp_madd_v2r8(dy23,fscal,fjy3);
2292 fjz3 = _fjsp_madd_v2r8(dz23,fscal,fjz3);
2296 /**************************
2297 * CALCULATE INTERACTIONS *
2298 **************************/
2300 if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
2303 r31 = _fjsp_mul_v2r8(rsq31,rinv31);
2305 /* EWALD ELECTROSTATICS */
2307 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2308 ewrt = _fjsp_mul_v2r8(r31,ewtabscale);
2309 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2310 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2311 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2313 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2314 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
2315 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2316 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2317 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
2318 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2319 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2320 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2321 velec = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
2322 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
2324 d = _fjsp_sub_v2r8(r31,rswitch);
2325 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2326 d2 = _fjsp_mul_v2r8(d,d);
2327 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2329 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2331 /* Evaluate switch function */
2332 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2333 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv31,_fjsp_mul_v2r8(velec,dsw)) );
2334 cutoff_mask = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
2338 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2340 /* Update vectorial force */
2341 fix3 = _fjsp_madd_v2r8(dx31,fscal,fix3);
2342 fiy3 = _fjsp_madd_v2r8(dy31,fscal,fiy3);
2343 fiz3 = _fjsp_madd_v2r8(dz31,fscal,fiz3);
2345 fjx1 = _fjsp_madd_v2r8(dx31,fscal,fjx1);
2346 fjy1 = _fjsp_madd_v2r8(dy31,fscal,fjy1);
2347 fjz1 = _fjsp_madd_v2r8(dz31,fscal,fjz1);
2351 /**************************
2352 * CALCULATE INTERACTIONS *
2353 **************************/
2355 if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
2358 r32 = _fjsp_mul_v2r8(rsq32,rinv32);
2360 /* EWALD ELECTROSTATICS */
2362 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2363 ewrt = _fjsp_mul_v2r8(r32,ewtabscale);
2364 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2365 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2366 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2368 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2369 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
2370 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2371 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2372 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
2373 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2374 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2375 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2376 velec = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
2377 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
2379 d = _fjsp_sub_v2r8(r32,rswitch);
2380 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2381 d2 = _fjsp_mul_v2r8(d,d);
2382 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2384 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2386 /* Evaluate switch function */
2387 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2388 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv32,_fjsp_mul_v2r8(velec,dsw)) );
2389 cutoff_mask = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
2393 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2395 /* Update vectorial force */
2396 fix3 = _fjsp_madd_v2r8(dx32,fscal,fix3);
2397 fiy3 = _fjsp_madd_v2r8(dy32,fscal,fiy3);
2398 fiz3 = _fjsp_madd_v2r8(dz32,fscal,fiz3);
2400 fjx2 = _fjsp_madd_v2r8(dx32,fscal,fjx2);
2401 fjy2 = _fjsp_madd_v2r8(dy32,fscal,fjy2);
2402 fjz2 = _fjsp_madd_v2r8(dz32,fscal,fjz2);
2406 /**************************
2407 * CALCULATE INTERACTIONS *
2408 **************************/
2410 if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
2413 r33 = _fjsp_mul_v2r8(rsq33,rinv33);
2415 /* EWALD ELECTROSTATICS */
2417 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2418 ewrt = _fjsp_mul_v2r8(r33,ewtabscale);
2419 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2420 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2421 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2423 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2424 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
2425 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2426 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2427 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
2428 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2429 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2430 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2431 velec = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
2432 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
2434 d = _fjsp_sub_v2r8(r33,rswitch);
2435 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2436 d2 = _fjsp_mul_v2r8(d,d);
2437 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2439 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2441 /* Evaluate switch function */
2442 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2443 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv33,_fjsp_mul_v2r8(velec,dsw)) );
2444 cutoff_mask = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
2448 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2450 /* Update vectorial force */
2451 fix3 = _fjsp_madd_v2r8(dx33,fscal,fix3);
2452 fiy3 = _fjsp_madd_v2r8(dy33,fscal,fiy3);
2453 fiz3 = _fjsp_madd_v2r8(dz33,fscal,fiz3);
2455 fjx3 = _fjsp_madd_v2r8(dx33,fscal,fjx3);
2456 fjy3 = _fjsp_madd_v2r8(dy33,fscal,fjy3);
2457 fjz3 = _fjsp_madd_v2r8(dz33,fscal,fjz3);
2461 gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2463 /* Inner loop uses 647 flops */
2466 if(jidx<j_index_end)
2470 j_coord_offsetA = DIM*jnrA;
2472 /* load j atom coordinates */
2473 gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
2474 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2475 &jy2,&jz2,&jx3,&jy3,&jz3);
2477 /* Calculate displacement vector */
2478 dx00 = _fjsp_sub_v2r8(ix0,jx0);
2479 dy00 = _fjsp_sub_v2r8(iy0,jy0);
2480 dz00 = _fjsp_sub_v2r8(iz0,jz0);
2481 dx11 = _fjsp_sub_v2r8(ix1,jx1);
2482 dy11 = _fjsp_sub_v2r8(iy1,jy1);
2483 dz11 = _fjsp_sub_v2r8(iz1,jz1);
2484 dx12 = _fjsp_sub_v2r8(ix1,jx2);
2485 dy12 = _fjsp_sub_v2r8(iy1,jy2);
2486 dz12 = _fjsp_sub_v2r8(iz1,jz2);
2487 dx13 = _fjsp_sub_v2r8(ix1,jx3);
2488 dy13 = _fjsp_sub_v2r8(iy1,jy3);
2489 dz13 = _fjsp_sub_v2r8(iz1,jz3);
2490 dx21 = _fjsp_sub_v2r8(ix2,jx1);
2491 dy21 = _fjsp_sub_v2r8(iy2,jy1);
2492 dz21 = _fjsp_sub_v2r8(iz2,jz1);
2493 dx22 = _fjsp_sub_v2r8(ix2,jx2);
2494 dy22 = _fjsp_sub_v2r8(iy2,jy2);
2495 dz22 = _fjsp_sub_v2r8(iz2,jz2);
2496 dx23 = _fjsp_sub_v2r8(ix2,jx3);
2497 dy23 = _fjsp_sub_v2r8(iy2,jy3);
2498 dz23 = _fjsp_sub_v2r8(iz2,jz3);
2499 dx31 = _fjsp_sub_v2r8(ix3,jx1);
2500 dy31 = _fjsp_sub_v2r8(iy3,jy1);
2501 dz31 = _fjsp_sub_v2r8(iz3,jz1);
2502 dx32 = _fjsp_sub_v2r8(ix3,jx2);
2503 dy32 = _fjsp_sub_v2r8(iy3,jy2);
2504 dz32 = _fjsp_sub_v2r8(iz3,jz2);
2505 dx33 = _fjsp_sub_v2r8(ix3,jx3);
2506 dy33 = _fjsp_sub_v2r8(iy3,jy3);
2507 dz33 = _fjsp_sub_v2r8(iz3,jz3);
2509 /* Calculate squared distance and things based on it */
2510 rsq00 = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
2511 rsq11 = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
2512 rsq12 = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
2513 rsq13 = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
2514 rsq21 = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
2515 rsq22 = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
2516 rsq23 = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
2517 rsq31 = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
2518 rsq32 = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
2519 rsq33 = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
2521 rinv00 = gmx_fjsp_invsqrt_v2r8(rsq00);
2522 rinv11 = gmx_fjsp_invsqrt_v2r8(rsq11);
2523 rinv12 = gmx_fjsp_invsqrt_v2r8(rsq12);
2524 rinv13 = gmx_fjsp_invsqrt_v2r8(rsq13);
2525 rinv21 = gmx_fjsp_invsqrt_v2r8(rsq21);
2526 rinv22 = gmx_fjsp_invsqrt_v2r8(rsq22);
2527 rinv23 = gmx_fjsp_invsqrt_v2r8(rsq23);
2528 rinv31 = gmx_fjsp_invsqrt_v2r8(rsq31);
2529 rinv32 = gmx_fjsp_invsqrt_v2r8(rsq32);
2530 rinv33 = gmx_fjsp_invsqrt_v2r8(rsq33);
2532 rinvsq00 = _fjsp_mul_v2r8(rinv00,rinv00);
2533 rinvsq11 = _fjsp_mul_v2r8(rinv11,rinv11);
2534 rinvsq12 = _fjsp_mul_v2r8(rinv12,rinv12);
2535 rinvsq13 = _fjsp_mul_v2r8(rinv13,rinv13);
2536 rinvsq21 = _fjsp_mul_v2r8(rinv21,rinv21);
2537 rinvsq22 = _fjsp_mul_v2r8(rinv22,rinv22);
2538 rinvsq23 = _fjsp_mul_v2r8(rinv23,rinv23);
2539 rinvsq31 = _fjsp_mul_v2r8(rinv31,rinv31);
2540 rinvsq32 = _fjsp_mul_v2r8(rinv32,rinv32);
2541 rinvsq33 = _fjsp_mul_v2r8(rinv33,rinv33);
2543 fjx0 = _fjsp_setzero_v2r8();
2544 fjy0 = _fjsp_setzero_v2r8();
2545 fjz0 = _fjsp_setzero_v2r8();
2546 fjx1 = _fjsp_setzero_v2r8();
2547 fjy1 = _fjsp_setzero_v2r8();
2548 fjz1 = _fjsp_setzero_v2r8();
2549 fjx2 = _fjsp_setzero_v2r8();
2550 fjy2 = _fjsp_setzero_v2r8();
2551 fjz2 = _fjsp_setzero_v2r8();
2552 fjx3 = _fjsp_setzero_v2r8();
2553 fjy3 = _fjsp_setzero_v2r8();
2554 fjz3 = _fjsp_setzero_v2r8();
2556 /**************************
2557 * CALCULATE INTERACTIONS *
2558 **************************/
2560 if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
2563 r00 = _fjsp_mul_v2r8(rsq00,rinv00);
2565 /* LENNARD-JONES DISPERSION/REPULSION */
2567 rinvsix = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
2568 vvdw6 = _fjsp_mul_v2r8(c6_00,rinvsix);
2569 vvdw12 = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
2570 vvdw = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
2571 fvdw = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
2573 d = _fjsp_sub_v2r8(r00,rswitch);
2574 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2575 d2 = _fjsp_mul_v2r8(d,d);
2576 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2578 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2580 /* Evaluate switch function */
2581 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2582 fvdw = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
2583 cutoff_mask = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
2587 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2589 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2591 /* Update vectorial force */
2592 fix0 = _fjsp_madd_v2r8(dx00,fscal,fix0);
2593 fiy0 = _fjsp_madd_v2r8(dy00,fscal,fiy0);
2594 fiz0 = _fjsp_madd_v2r8(dz00,fscal,fiz0);
2596 fjx0 = _fjsp_madd_v2r8(dx00,fscal,fjx0);
2597 fjy0 = _fjsp_madd_v2r8(dy00,fscal,fjy0);
2598 fjz0 = _fjsp_madd_v2r8(dz00,fscal,fjz0);
2602 /**************************
2603 * CALCULATE INTERACTIONS *
2604 **************************/
2606 if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
2609 r11 = _fjsp_mul_v2r8(rsq11,rinv11);
2611 /* EWALD ELECTROSTATICS */
2613 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2614 ewrt = _fjsp_mul_v2r8(r11,ewtabscale);
2615 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2616 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2617 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2619 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2620 ewtabD = _fjsp_setzero_v2r8();
2621 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2622 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2623 ewtabFn = _fjsp_setzero_v2r8();
2624 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2625 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2626 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2627 velec = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
2628 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
2630 d = _fjsp_sub_v2r8(r11,rswitch);
2631 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2632 d2 = _fjsp_mul_v2r8(d,d);
2633 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2635 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2637 /* Evaluate switch function */
2638 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2639 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
2640 cutoff_mask = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
2644 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2646 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2648 /* Update vectorial force */
2649 fix1 = _fjsp_madd_v2r8(dx11,fscal,fix1);
2650 fiy1 = _fjsp_madd_v2r8(dy11,fscal,fiy1);
2651 fiz1 = _fjsp_madd_v2r8(dz11,fscal,fiz1);
2653 fjx1 = _fjsp_madd_v2r8(dx11,fscal,fjx1);
2654 fjy1 = _fjsp_madd_v2r8(dy11,fscal,fjy1);
2655 fjz1 = _fjsp_madd_v2r8(dz11,fscal,fjz1);
2659 /**************************
2660 * CALCULATE INTERACTIONS *
2661 **************************/
2663 if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
2666 r12 = _fjsp_mul_v2r8(rsq12,rinv12);
2668 /* EWALD ELECTROSTATICS */
2670 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2671 ewrt = _fjsp_mul_v2r8(r12,ewtabscale);
2672 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2673 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2674 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2676 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2677 ewtabD = _fjsp_setzero_v2r8();
2678 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2679 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2680 ewtabFn = _fjsp_setzero_v2r8();
2681 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2682 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2683 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2684 velec = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
2685 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
2687 d = _fjsp_sub_v2r8(r12,rswitch);
2688 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2689 d2 = _fjsp_mul_v2r8(d,d);
2690 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2692 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2694 /* Evaluate switch function */
2695 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2696 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
2697 cutoff_mask = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
2701 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2703 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2705 /* Update vectorial force */
2706 fix1 = _fjsp_madd_v2r8(dx12,fscal,fix1);
2707 fiy1 = _fjsp_madd_v2r8(dy12,fscal,fiy1);
2708 fiz1 = _fjsp_madd_v2r8(dz12,fscal,fiz1);
2710 fjx2 = _fjsp_madd_v2r8(dx12,fscal,fjx2);
2711 fjy2 = _fjsp_madd_v2r8(dy12,fscal,fjy2);
2712 fjz2 = _fjsp_madd_v2r8(dz12,fscal,fjz2);
2716 /**************************
2717 * CALCULATE INTERACTIONS *
2718 **************************/
2720 if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
2723 r13 = _fjsp_mul_v2r8(rsq13,rinv13);
2725 /* EWALD ELECTROSTATICS */
2727 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2728 ewrt = _fjsp_mul_v2r8(r13,ewtabscale);
2729 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2730 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2731 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2733 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2734 ewtabD = _fjsp_setzero_v2r8();
2735 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2736 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2737 ewtabFn = _fjsp_setzero_v2r8();
2738 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2739 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2740 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2741 velec = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
2742 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
2744 d = _fjsp_sub_v2r8(r13,rswitch);
2745 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2746 d2 = _fjsp_mul_v2r8(d,d);
2747 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2749 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2751 /* Evaluate switch function */
2752 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2753 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv13,_fjsp_mul_v2r8(velec,dsw)) );
2754 cutoff_mask = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
2758 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2760 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2762 /* Update vectorial force */
2763 fix1 = _fjsp_madd_v2r8(dx13,fscal,fix1);
2764 fiy1 = _fjsp_madd_v2r8(dy13,fscal,fiy1);
2765 fiz1 = _fjsp_madd_v2r8(dz13,fscal,fiz1);
2767 fjx3 = _fjsp_madd_v2r8(dx13,fscal,fjx3);
2768 fjy3 = _fjsp_madd_v2r8(dy13,fscal,fjy3);
2769 fjz3 = _fjsp_madd_v2r8(dz13,fscal,fjz3);
2773 /**************************
2774 * CALCULATE INTERACTIONS *
2775 **************************/
2777 if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
2780 r21 = _fjsp_mul_v2r8(rsq21,rinv21);
2782 /* EWALD ELECTROSTATICS */
2784 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2785 ewrt = _fjsp_mul_v2r8(r21,ewtabscale);
2786 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2787 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2788 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2790 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2791 ewtabD = _fjsp_setzero_v2r8();
2792 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2793 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2794 ewtabFn = _fjsp_setzero_v2r8();
2795 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2796 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2797 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2798 velec = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
2799 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
2801 d = _fjsp_sub_v2r8(r21,rswitch);
2802 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2803 d2 = _fjsp_mul_v2r8(d,d);
2804 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2806 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2808 /* Evaluate switch function */
2809 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2810 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
2811 cutoff_mask = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
2815 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2817 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2819 /* Update vectorial force */
2820 fix2 = _fjsp_madd_v2r8(dx21,fscal,fix2);
2821 fiy2 = _fjsp_madd_v2r8(dy21,fscal,fiy2);
2822 fiz2 = _fjsp_madd_v2r8(dz21,fscal,fiz2);
2824 fjx1 = _fjsp_madd_v2r8(dx21,fscal,fjx1);
2825 fjy1 = _fjsp_madd_v2r8(dy21,fscal,fjy1);
2826 fjz1 = _fjsp_madd_v2r8(dz21,fscal,fjz1);
2830 /**************************
2831 * CALCULATE INTERACTIONS *
2832 **************************/
2834 if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
2837 r22 = _fjsp_mul_v2r8(rsq22,rinv22);
2839 /* EWALD ELECTROSTATICS */
2841 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2842 ewrt = _fjsp_mul_v2r8(r22,ewtabscale);
2843 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2844 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2845 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2847 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2848 ewtabD = _fjsp_setzero_v2r8();
2849 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2850 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2851 ewtabFn = _fjsp_setzero_v2r8();
2852 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2853 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2854 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2855 velec = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
2856 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
2858 d = _fjsp_sub_v2r8(r22,rswitch);
2859 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2860 d2 = _fjsp_mul_v2r8(d,d);
2861 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2863 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2865 /* Evaluate switch function */
2866 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2867 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
2868 cutoff_mask = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
2872 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2874 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2876 /* Update vectorial force */
2877 fix2 = _fjsp_madd_v2r8(dx22,fscal,fix2);
2878 fiy2 = _fjsp_madd_v2r8(dy22,fscal,fiy2);
2879 fiz2 = _fjsp_madd_v2r8(dz22,fscal,fiz2);
2881 fjx2 = _fjsp_madd_v2r8(dx22,fscal,fjx2);
2882 fjy2 = _fjsp_madd_v2r8(dy22,fscal,fjy2);
2883 fjz2 = _fjsp_madd_v2r8(dz22,fscal,fjz2);
2887 /**************************
2888 * CALCULATE INTERACTIONS *
2889 **************************/
2891 if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
2894 r23 = _fjsp_mul_v2r8(rsq23,rinv23);
2896 /* EWALD ELECTROSTATICS */
2898 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2899 ewrt = _fjsp_mul_v2r8(r23,ewtabscale);
2900 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2901 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2902 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2904 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2905 ewtabD = _fjsp_setzero_v2r8();
2906 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2907 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2908 ewtabFn = _fjsp_setzero_v2r8();
2909 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2910 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2911 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2912 velec = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
2913 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
2915 d = _fjsp_sub_v2r8(r23,rswitch);
2916 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2917 d2 = _fjsp_mul_v2r8(d,d);
2918 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2920 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2922 /* Evaluate switch function */
2923 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2924 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv23,_fjsp_mul_v2r8(velec,dsw)) );
2925 cutoff_mask = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
2929 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2931 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2933 /* Update vectorial force */
2934 fix2 = _fjsp_madd_v2r8(dx23,fscal,fix2);
2935 fiy2 = _fjsp_madd_v2r8(dy23,fscal,fiy2);
2936 fiz2 = _fjsp_madd_v2r8(dz23,fscal,fiz2);
2938 fjx3 = _fjsp_madd_v2r8(dx23,fscal,fjx3);
2939 fjy3 = _fjsp_madd_v2r8(dy23,fscal,fjy3);
2940 fjz3 = _fjsp_madd_v2r8(dz23,fscal,fjz3);
2944 /**************************
2945 * CALCULATE INTERACTIONS *
2946 **************************/
2948 if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
2951 r31 = _fjsp_mul_v2r8(rsq31,rinv31);
2953 /* EWALD ELECTROSTATICS */
2955 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2956 ewrt = _fjsp_mul_v2r8(r31,ewtabscale);
2957 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2958 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2959 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2961 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2962 ewtabD = _fjsp_setzero_v2r8();
2963 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2964 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2965 ewtabFn = _fjsp_setzero_v2r8();
2966 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2967 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2968 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2969 velec = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
2970 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
2972 d = _fjsp_sub_v2r8(r31,rswitch);
2973 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2974 d2 = _fjsp_mul_v2r8(d,d);
2975 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2977 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2979 /* Evaluate switch function */
2980 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2981 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv31,_fjsp_mul_v2r8(velec,dsw)) );
2982 cutoff_mask = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
2986 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2988 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2990 /* Update vectorial force */
2991 fix3 = _fjsp_madd_v2r8(dx31,fscal,fix3);
2992 fiy3 = _fjsp_madd_v2r8(dy31,fscal,fiy3);
2993 fiz3 = _fjsp_madd_v2r8(dz31,fscal,fiz3);
2995 fjx1 = _fjsp_madd_v2r8(dx31,fscal,fjx1);
2996 fjy1 = _fjsp_madd_v2r8(dy31,fscal,fjy1);
2997 fjz1 = _fjsp_madd_v2r8(dz31,fscal,fjz1);
3001 /**************************
3002 * CALCULATE INTERACTIONS *
3003 **************************/
3005 if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
3008 r32 = _fjsp_mul_v2r8(rsq32,rinv32);
3010 /* EWALD ELECTROSTATICS */
3012 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3013 ewrt = _fjsp_mul_v2r8(r32,ewtabscale);
3014 itab_tmp = _fjsp_dtox_v2r8(ewrt);
3015 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
3016 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
3018 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
3019 ewtabD = _fjsp_setzero_v2r8();
3020 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
3021 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
3022 ewtabFn = _fjsp_setzero_v2r8();
3023 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
3024 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
3025 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
3026 velec = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
3027 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
3029 d = _fjsp_sub_v2r8(r32,rswitch);
3030 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
3031 d2 = _fjsp_mul_v2r8(d,d);
3032 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
3034 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
3036 /* Evaluate switch function */
3037 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3038 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv32,_fjsp_mul_v2r8(velec,dsw)) );
3039 cutoff_mask = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
3043 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
3045 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
3047 /* Update vectorial force */
3048 fix3 = _fjsp_madd_v2r8(dx32,fscal,fix3);
3049 fiy3 = _fjsp_madd_v2r8(dy32,fscal,fiy3);
3050 fiz3 = _fjsp_madd_v2r8(dz32,fscal,fiz3);
3052 fjx2 = _fjsp_madd_v2r8(dx32,fscal,fjx2);
3053 fjy2 = _fjsp_madd_v2r8(dy32,fscal,fjy2);
3054 fjz2 = _fjsp_madd_v2r8(dz32,fscal,fjz2);
3058 /**************************
3059 * CALCULATE INTERACTIONS *
3060 **************************/
3062 if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
3065 r33 = _fjsp_mul_v2r8(rsq33,rinv33);
3067 /* EWALD ELECTROSTATICS */
3069 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3070 ewrt = _fjsp_mul_v2r8(r33,ewtabscale);
3071 itab_tmp = _fjsp_dtox_v2r8(ewrt);
3072 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
3073 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
3075 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
3076 ewtabD = _fjsp_setzero_v2r8();
3077 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
3078 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
3079 ewtabFn = _fjsp_setzero_v2r8();
3080 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
3081 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
3082 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
3083 velec = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
3084 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
3086 d = _fjsp_sub_v2r8(r33,rswitch);
3087 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
3088 d2 = _fjsp_mul_v2r8(d,d);
3089 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
3091 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
3093 /* Evaluate switch function */
3094 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3095 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv33,_fjsp_mul_v2r8(velec,dsw)) );
3096 cutoff_mask = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
3100 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
3102 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
3104 /* Update vectorial force */
3105 fix3 = _fjsp_madd_v2r8(dx33,fscal,fix3);
3106 fiy3 = _fjsp_madd_v2r8(dy33,fscal,fiy3);
3107 fiz3 = _fjsp_madd_v2r8(dz33,fscal,fiz3);
3109 fjx3 = _fjsp_madd_v2r8(dx33,fscal,fjx3);
3110 fjy3 = _fjsp_madd_v2r8(dy33,fscal,fjy3);
3111 fjz3 = _fjsp_madd_v2r8(dz33,fscal,fjz3);
3115 gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3117 /* Inner loop uses 647 flops */
3120 /* End of innermost loop */
3122 gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3123 f+i_coord_offset,fshift+i_shift_offset);
3125 /* Increment number of inner iterations */
3126 inneriter += j_index_end - j_index_start;
3128 /* Outer loop uses 24 flops */
3131 /* Increment number of outer iterations */
3134 /* Update outer/inner flops */
3136 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*647);