8976491f42ff0010b65d24059554fb2abd2a3cfa
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sparc64_hpc_ace_double / nb_kernel_ElecEwSw_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
37  */
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "vec.h"
47 #include "nrnb.h"
48
49 #include "kernelutil_sparc64_hpc_ace_double.h"
50
51 /*
52  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
53  * Electrostatics interaction: Ewald
54  * VdW interaction:            None
55  * Geometry:                   Water3-Water3
56  * Calculate force/pot:        PotentialAndForce
57  */
58 void
59 nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
60                     (t_nblist                    * gmx_restrict       nlist,
61                      rvec                        * gmx_restrict          xx,
62                      rvec                        * gmx_restrict          ff,
63                      t_forcerec                  * gmx_restrict          fr,
64                      t_mdatoms                   * gmx_restrict     mdatoms,
65                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
66                      t_nrnb                      * gmx_restrict        nrnb)
67 {
68     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69      * just 0 for non-waters.
70      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
71      * jnr indices corresponding to data put in the two positions in the SIMD register.
72      */
73     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
74     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
75     int              jnrA,jnrB;
76     int              j_coord_offsetA,j_coord_offsetB;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
81     int              vdwioffset0;
82     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
83     int              vdwioffset1;
84     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
85     int              vdwioffset2;
86     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
87     int              vdwjidx0A,vdwjidx0B;
88     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
89     int              vdwjidx1A,vdwjidx1B;
90     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
91     int              vdwjidx2A,vdwjidx2B;
92     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
93     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
94     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
95     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
96     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
97     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
98     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
99     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
100     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
101     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
102     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
103     real             *charge;
104     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
105     real             *ewtab;
106     _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
107     real             rswitch_scalar,d_scalar;
108     _fjsp_v2r8       itab_tmp;
109     _fjsp_v2r8       dummy_mask,cutoff_mask;
110     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
111     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
112     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
113
114     x                = xx[0];
115     f                = ff[0];
116
117     nri              = nlist->nri;
118     iinr             = nlist->iinr;
119     jindex           = nlist->jindex;
120     jjnr             = nlist->jjnr;
121     shiftidx         = nlist->shift;
122     gid              = nlist->gid;
123     shiftvec         = fr->shift_vec[0];
124     fshift           = fr->fshift[0];
125     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
126     charge           = mdatoms->chargeA;
127
128     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
129     ewtab            = fr->ic->tabq_coul_FDV0;
130     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
131     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
132
133     /* Setup water-specific parameters */
134     inr              = nlist->iinr[0];
135     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
136     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
137     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
138
139     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
140     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
141     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
142     qq00             = _fjsp_mul_v2r8(iq0,jq0);
143     qq01             = _fjsp_mul_v2r8(iq0,jq1);
144     qq02             = _fjsp_mul_v2r8(iq0,jq2);
145     qq10             = _fjsp_mul_v2r8(iq1,jq0);
146     qq11             = _fjsp_mul_v2r8(iq1,jq1);
147     qq12             = _fjsp_mul_v2r8(iq1,jq2);
148     qq20             = _fjsp_mul_v2r8(iq2,jq0);
149     qq21             = _fjsp_mul_v2r8(iq2,jq1);
150     qq22             = _fjsp_mul_v2r8(iq2,jq2);
151
152     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
153     rcutoff_scalar   = fr->rcoulomb;
154     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
155     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
156
157     rswitch_scalar   = fr->rcoulomb_switch;
158     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
159     /* Setup switch parameters */
160     d_scalar         = rcutoff_scalar-rswitch_scalar;
161     d                = gmx_fjsp_set1_v2r8(d_scalar);
162     swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
163     swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
164     swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
165     swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
166     swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
167     swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
168
169     /* Avoid stupid compiler warnings */
170     jnrA = jnrB = 0;
171     j_coord_offsetA = 0;
172     j_coord_offsetB = 0;
173
174     outeriter        = 0;
175     inneriter        = 0;
176
177     /* Start outer loop over neighborlists */
178     for(iidx=0; iidx<nri; iidx++)
179     {
180         /* Load shift vector for this list */
181         i_shift_offset   = DIM*shiftidx[iidx];
182
183         /* Load limits for loop over neighbors */
184         j_index_start    = jindex[iidx];
185         j_index_end      = jindex[iidx+1];
186
187         /* Get outer coordinate index */
188         inr              = iinr[iidx];
189         i_coord_offset   = DIM*inr;
190
191         /* Load i particle coords and add shift vector */
192         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
193                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
194
195         fix0             = _fjsp_setzero_v2r8();
196         fiy0             = _fjsp_setzero_v2r8();
197         fiz0             = _fjsp_setzero_v2r8();
198         fix1             = _fjsp_setzero_v2r8();
199         fiy1             = _fjsp_setzero_v2r8();
200         fiz1             = _fjsp_setzero_v2r8();
201         fix2             = _fjsp_setzero_v2r8();
202         fiy2             = _fjsp_setzero_v2r8();
203         fiz2             = _fjsp_setzero_v2r8();
204
205         /* Reset potential sums */
206         velecsum         = _fjsp_setzero_v2r8();
207
208         /* Start inner kernel loop */
209         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
210         {
211
212             /* Get j neighbor index, and coordinate index */
213             jnrA             = jjnr[jidx];
214             jnrB             = jjnr[jidx+1];
215             j_coord_offsetA  = DIM*jnrA;
216             j_coord_offsetB  = DIM*jnrB;
217
218             /* load j atom coordinates */
219             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
220                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
221
222             /* Calculate displacement vector */
223             dx00             = _fjsp_sub_v2r8(ix0,jx0);
224             dy00             = _fjsp_sub_v2r8(iy0,jy0);
225             dz00             = _fjsp_sub_v2r8(iz0,jz0);
226             dx01             = _fjsp_sub_v2r8(ix0,jx1);
227             dy01             = _fjsp_sub_v2r8(iy0,jy1);
228             dz01             = _fjsp_sub_v2r8(iz0,jz1);
229             dx02             = _fjsp_sub_v2r8(ix0,jx2);
230             dy02             = _fjsp_sub_v2r8(iy0,jy2);
231             dz02             = _fjsp_sub_v2r8(iz0,jz2);
232             dx10             = _fjsp_sub_v2r8(ix1,jx0);
233             dy10             = _fjsp_sub_v2r8(iy1,jy0);
234             dz10             = _fjsp_sub_v2r8(iz1,jz0);
235             dx11             = _fjsp_sub_v2r8(ix1,jx1);
236             dy11             = _fjsp_sub_v2r8(iy1,jy1);
237             dz11             = _fjsp_sub_v2r8(iz1,jz1);
238             dx12             = _fjsp_sub_v2r8(ix1,jx2);
239             dy12             = _fjsp_sub_v2r8(iy1,jy2);
240             dz12             = _fjsp_sub_v2r8(iz1,jz2);
241             dx20             = _fjsp_sub_v2r8(ix2,jx0);
242             dy20             = _fjsp_sub_v2r8(iy2,jy0);
243             dz20             = _fjsp_sub_v2r8(iz2,jz0);
244             dx21             = _fjsp_sub_v2r8(ix2,jx1);
245             dy21             = _fjsp_sub_v2r8(iy2,jy1);
246             dz21             = _fjsp_sub_v2r8(iz2,jz1);
247             dx22             = _fjsp_sub_v2r8(ix2,jx2);
248             dy22             = _fjsp_sub_v2r8(iy2,jy2);
249             dz22             = _fjsp_sub_v2r8(iz2,jz2);
250
251             /* Calculate squared distance and things based on it */
252             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
253             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
254             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
255             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
256             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
257             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
258             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
259             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
260             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
261
262             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
263             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
264             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
265             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
266             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
267             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
268             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
269             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
270             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
271
272             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
273             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
274             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
275             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
276             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
277             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
278             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
279             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
280             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
281
282             fjx0             = _fjsp_setzero_v2r8();
283             fjy0             = _fjsp_setzero_v2r8();
284             fjz0             = _fjsp_setzero_v2r8();
285             fjx1             = _fjsp_setzero_v2r8();
286             fjy1             = _fjsp_setzero_v2r8();
287             fjz1             = _fjsp_setzero_v2r8();
288             fjx2             = _fjsp_setzero_v2r8();
289             fjy2             = _fjsp_setzero_v2r8();
290             fjz2             = _fjsp_setzero_v2r8();
291
292             /**************************
293              * CALCULATE INTERACTIONS *
294              **************************/
295
296             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
297             {
298
299             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
300
301             /* EWALD ELECTROSTATICS */
302
303             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
304             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
305             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
306             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
307             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
308
309             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
310             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
311             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
312             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
313             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
314             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
315             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
316             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
317             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
318             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
319
320             d                = _fjsp_sub_v2r8(r00,rswitch);
321             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
322             d2               = _fjsp_mul_v2r8(d,d);
323             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
324
325             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
326
327             /* Evaluate switch function */
328             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
329             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
330             velec            = _fjsp_mul_v2r8(velec,sw);
331             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
332
333             /* Update potential sum for this i atom from the interaction with this j atom. */
334             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
335             velecsum         = _fjsp_add_v2r8(velecsum,velec);
336
337             fscal            = felec;
338
339             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
340
341             /* Update vectorial force */
342             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
343             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
344             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
345             
346             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
347             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
348             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
349
350             }
351
352             /**************************
353              * CALCULATE INTERACTIONS *
354              **************************/
355
356             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
357             {
358
359             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
360
361             /* EWALD ELECTROSTATICS */
362
363             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
364             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
365             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
366             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
367             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
368
369             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
370             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
371             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
372             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
373             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
374             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
375             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
376             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
377             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
378             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
379
380             d                = _fjsp_sub_v2r8(r01,rswitch);
381             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
382             d2               = _fjsp_mul_v2r8(d,d);
383             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
384
385             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
386
387             /* Evaluate switch function */
388             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
389             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv01,_fjsp_mul_v2r8(velec,dsw)) );
390             velec            = _fjsp_mul_v2r8(velec,sw);
391             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
392
393             /* Update potential sum for this i atom from the interaction with this j atom. */
394             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
395             velecsum         = _fjsp_add_v2r8(velecsum,velec);
396
397             fscal            = felec;
398
399             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
400
401             /* Update vectorial force */
402             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
403             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
404             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
405             
406             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
407             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
408             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
409
410             }
411
412             /**************************
413              * CALCULATE INTERACTIONS *
414              **************************/
415
416             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
417             {
418
419             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
420
421             /* EWALD ELECTROSTATICS */
422
423             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
424             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
425             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
426             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
427             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
428
429             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
430             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
431             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
432             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
433             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
434             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
435             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
436             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
437             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
438             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
439
440             d                = _fjsp_sub_v2r8(r02,rswitch);
441             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
442             d2               = _fjsp_mul_v2r8(d,d);
443             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
444
445             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
446
447             /* Evaluate switch function */
448             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
449             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv02,_fjsp_mul_v2r8(velec,dsw)) );
450             velec            = _fjsp_mul_v2r8(velec,sw);
451             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
452
453             /* Update potential sum for this i atom from the interaction with this j atom. */
454             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
455             velecsum         = _fjsp_add_v2r8(velecsum,velec);
456
457             fscal            = felec;
458
459             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
460
461             /* Update vectorial force */
462             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
463             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
464             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
465             
466             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
467             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
468             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
469
470             }
471
472             /**************************
473              * CALCULATE INTERACTIONS *
474              **************************/
475
476             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
477             {
478
479             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
480
481             /* EWALD ELECTROSTATICS */
482
483             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
484             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
485             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
486             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
487             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
488
489             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
490             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
491             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
492             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
493             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
494             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
495             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
496             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
497             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
498             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
499
500             d                = _fjsp_sub_v2r8(r10,rswitch);
501             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
502             d2               = _fjsp_mul_v2r8(d,d);
503             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
504
505             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
506
507             /* Evaluate switch function */
508             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
509             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
510             velec            = _fjsp_mul_v2r8(velec,sw);
511             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
512
513             /* Update potential sum for this i atom from the interaction with this j atom. */
514             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
515             velecsum         = _fjsp_add_v2r8(velecsum,velec);
516
517             fscal            = felec;
518
519             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
520
521             /* Update vectorial force */
522             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
523             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
524             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
525             
526             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
527             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
528             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
529
530             }
531
532             /**************************
533              * CALCULATE INTERACTIONS *
534              **************************/
535
536             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
537             {
538
539             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
540
541             /* EWALD ELECTROSTATICS */
542
543             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
544             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
545             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
546             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
547             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
548
549             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
550             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
551             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
552             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
553             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
554             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
555             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
556             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
557             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
558             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
559
560             d                = _fjsp_sub_v2r8(r11,rswitch);
561             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
562             d2               = _fjsp_mul_v2r8(d,d);
563             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
564
565             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
566
567             /* Evaluate switch function */
568             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
569             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
570             velec            = _fjsp_mul_v2r8(velec,sw);
571             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
572
573             /* Update potential sum for this i atom from the interaction with this j atom. */
574             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
575             velecsum         = _fjsp_add_v2r8(velecsum,velec);
576
577             fscal            = felec;
578
579             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
580
581             /* Update vectorial force */
582             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
583             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
584             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
585             
586             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
587             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
588             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
589
590             }
591
592             /**************************
593              * CALCULATE INTERACTIONS *
594              **************************/
595
596             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
597             {
598
599             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
600
601             /* EWALD ELECTROSTATICS */
602
603             /* Calculate the Ewald table index by multiplying r by the table scale and truncating to an integer */
604             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
605             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
606             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
607             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
608
609             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
610             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
611             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
612             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
613             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
614             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
615             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
616             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
617             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
618             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
619
620             d                = _fjsp_sub_v2r8(r12,rswitch);
621             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
622             d2               = _fjsp_mul_v2r8(d,d);
623             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
624
625             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
626
627             /* Evaluate switch function */
628             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
629             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
630             velec            = _fjsp_mul_v2r8(velec,sw);
631             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
632
633             /* Update potential sum for this i atom from the interaction with this j atom. */
634             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
635             velecsum         = _fjsp_add_v2r8(velecsum,velec);
636
637             fscal            = felec;
638
639             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
640
641             /* Update vectorial force */
642             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
643             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
644             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
645             
646             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
647             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
648             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
649
650             }
651
652             /**************************
653              * CALCULATE INTERACTIONS *
654              **************************/
655
656             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
657             {
658
659             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
660
661             /* EWALD ELECTROSTATICS */
662
663             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
664             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
665             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
666             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
667             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
668
669             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
670             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
671             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
672             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
673             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
674             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
675             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
676             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
677             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
678             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
679
680             d                = _fjsp_sub_v2r8(r20,rswitch);
681             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
682             d2               = _fjsp_mul_v2r8(d,d);
683             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
684
685             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
686
687             /* Evaluate switch function */
688             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
689             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
690             velec            = _fjsp_mul_v2r8(velec,sw);
691             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
692
693             /* Update potential sum for this i atom from the interaction with this j atom. */
694             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
695             velecsum         = _fjsp_add_v2r8(velecsum,velec);
696
697             fscal            = felec;
698
699             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
700
701             /* Update vectorial force */
702             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
703             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
704             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
705             
706             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
707             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
708             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
709
710             }
711
712             /**************************
713              * CALCULATE INTERACTIONS *
714              **************************/
715
716             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
717             {
718
719             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
720
721             /* EWALD ELECTROSTATICS */
722
723             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
724             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
725             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
726             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
727             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
728
729             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
730             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
731             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
732             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
733             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
734             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
735             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
736             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
737             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
738             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
739
740             d                = _fjsp_sub_v2r8(r21,rswitch);
741             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
742             d2               = _fjsp_mul_v2r8(d,d);
743             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
744
745             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
746
747             /* Evaluate switch function */
748             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
749             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
750             velec            = _fjsp_mul_v2r8(velec,sw);
751             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
752
753             /* Update potential sum for this i atom from the interaction with this j atom. */
754             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
755             velecsum         = _fjsp_add_v2r8(velecsum,velec);
756
757             fscal            = felec;
758
759             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
760
761             /* Update vectorial force */
762             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
763             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
764             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
765             
766             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
767             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
768             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
769
770             }
771
772             /**************************
773              * CALCULATE INTERACTIONS *
774              **************************/
775
776             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
777             {
778
779             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
780
781             /* EWALD ELECTROSTATICS */
782
783             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
784             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
785             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
786             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
787             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
788
789             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
790             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
791             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
792             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
793             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
794             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
795             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
796             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
797             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
798             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
799
800             d                = _fjsp_sub_v2r8(r22,rswitch);
801             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
802             d2               = _fjsp_mul_v2r8(d,d);
803             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
804
805             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
806
807             /* Evaluate switch function */
808             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
809             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
810             velec            = _fjsp_mul_v2r8(velec,sw);
811             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
812
813             /* Update potential sum for this i atom from the interaction with this j atom. */
814             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
815             velecsum         = _fjsp_add_v2r8(velecsum,velec);
816
817             fscal            = felec;
818
819             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
820
821             /* Update vectorial force */
822             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
823             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
824             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
825             
826             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
827             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
828             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
829
830             }
831
832             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
833
834             /* Inner loop uses 612 flops */
835         }
836
837         if(jidx<j_index_end)
838         {
839
840             jnrA             = jjnr[jidx];
841             j_coord_offsetA  = DIM*jnrA;
842
843             /* load j atom coordinates */
844             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
845                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
846
847             /* Calculate displacement vector */
848             dx00             = _fjsp_sub_v2r8(ix0,jx0);
849             dy00             = _fjsp_sub_v2r8(iy0,jy0);
850             dz00             = _fjsp_sub_v2r8(iz0,jz0);
851             dx01             = _fjsp_sub_v2r8(ix0,jx1);
852             dy01             = _fjsp_sub_v2r8(iy0,jy1);
853             dz01             = _fjsp_sub_v2r8(iz0,jz1);
854             dx02             = _fjsp_sub_v2r8(ix0,jx2);
855             dy02             = _fjsp_sub_v2r8(iy0,jy2);
856             dz02             = _fjsp_sub_v2r8(iz0,jz2);
857             dx10             = _fjsp_sub_v2r8(ix1,jx0);
858             dy10             = _fjsp_sub_v2r8(iy1,jy0);
859             dz10             = _fjsp_sub_v2r8(iz1,jz0);
860             dx11             = _fjsp_sub_v2r8(ix1,jx1);
861             dy11             = _fjsp_sub_v2r8(iy1,jy1);
862             dz11             = _fjsp_sub_v2r8(iz1,jz1);
863             dx12             = _fjsp_sub_v2r8(ix1,jx2);
864             dy12             = _fjsp_sub_v2r8(iy1,jy2);
865             dz12             = _fjsp_sub_v2r8(iz1,jz2);
866             dx20             = _fjsp_sub_v2r8(ix2,jx0);
867             dy20             = _fjsp_sub_v2r8(iy2,jy0);
868             dz20             = _fjsp_sub_v2r8(iz2,jz0);
869             dx21             = _fjsp_sub_v2r8(ix2,jx1);
870             dy21             = _fjsp_sub_v2r8(iy2,jy1);
871             dz21             = _fjsp_sub_v2r8(iz2,jz1);
872             dx22             = _fjsp_sub_v2r8(ix2,jx2);
873             dy22             = _fjsp_sub_v2r8(iy2,jy2);
874             dz22             = _fjsp_sub_v2r8(iz2,jz2);
875
876             /* Calculate squared distance and things based on it */
877             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
878             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
879             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
880             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
881             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
882             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
883             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
884             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
885             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
886
887             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
888             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
889             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
890             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
891             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
892             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
893             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
894             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
895             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
896
897             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
898             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
899             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
900             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
901             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
902             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
903             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
904             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
905             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
906
907             fjx0             = _fjsp_setzero_v2r8();
908             fjy0             = _fjsp_setzero_v2r8();
909             fjz0             = _fjsp_setzero_v2r8();
910             fjx1             = _fjsp_setzero_v2r8();
911             fjy1             = _fjsp_setzero_v2r8();
912             fjz1             = _fjsp_setzero_v2r8();
913             fjx2             = _fjsp_setzero_v2r8();
914             fjy2             = _fjsp_setzero_v2r8();
915             fjz2             = _fjsp_setzero_v2r8();
916
917             /**************************
918              * CALCULATE INTERACTIONS *
919              **************************/
920
921             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
922             {
923
924             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
925
926             /* EWALD ELECTROSTATICS */
927
928             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
929             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
930             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
931             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
932             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
933
934             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
935             ewtabD           = _fjsp_setzero_v2r8();
936             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
937             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
938             ewtabFn          = _fjsp_setzero_v2r8();
939             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
940             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
941             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
942             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
943             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
944
945             d                = _fjsp_sub_v2r8(r00,rswitch);
946             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
947             d2               = _fjsp_mul_v2r8(d,d);
948             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
949
950             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
951
952             /* Evaluate switch function */
953             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
954             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
955             velec            = _fjsp_mul_v2r8(velec,sw);
956             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
957
958             /* Update potential sum for this i atom from the interaction with this j atom. */
959             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
960             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
961             velecsum         = _fjsp_add_v2r8(velecsum,velec);
962
963             fscal            = felec;
964
965             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
966
967             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
968
969             /* Update vectorial force */
970             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
971             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
972             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
973             
974             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
975             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
976             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
977
978             }
979
980             /**************************
981              * CALCULATE INTERACTIONS *
982              **************************/
983
984             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
985             {
986
987             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
988
989             /* EWALD ELECTROSTATICS */
990
991             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
992             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
993             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
994             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
995             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
996
997             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
998             ewtabD           = _fjsp_setzero_v2r8();
999             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1000             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1001             ewtabFn          = _fjsp_setzero_v2r8();
1002             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1003             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1004             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1005             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
1006             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
1007
1008             d                = _fjsp_sub_v2r8(r01,rswitch);
1009             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1010             d2               = _fjsp_mul_v2r8(d,d);
1011             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1012
1013             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1014
1015             /* Evaluate switch function */
1016             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1017             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv01,_fjsp_mul_v2r8(velec,dsw)) );
1018             velec            = _fjsp_mul_v2r8(velec,sw);
1019             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
1020
1021             /* Update potential sum for this i atom from the interaction with this j atom. */
1022             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
1023             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1024             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1025
1026             fscal            = felec;
1027
1028             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1029
1030             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1031
1032             /* Update vectorial force */
1033             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
1034             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
1035             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
1036             
1037             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
1038             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
1039             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
1040
1041             }
1042
1043             /**************************
1044              * CALCULATE INTERACTIONS *
1045              **************************/
1046
1047             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
1048             {
1049
1050             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
1051
1052             /* EWALD ELECTROSTATICS */
1053
1054             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1055             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
1056             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1057             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1058             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1059
1060             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1061             ewtabD           = _fjsp_setzero_v2r8();
1062             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1063             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1064             ewtabFn          = _fjsp_setzero_v2r8();
1065             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1066             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1067             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1068             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
1069             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
1070
1071             d                = _fjsp_sub_v2r8(r02,rswitch);
1072             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1073             d2               = _fjsp_mul_v2r8(d,d);
1074             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1075
1076             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1077
1078             /* Evaluate switch function */
1079             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1080             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv02,_fjsp_mul_v2r8(velec,dsw)) );
1081             velec            = _fjsp_mul_v2r8(velec,sw);
1082             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
1083
1084             /* Update potential sum for this i atom from the interaction with this j atom. */
1085             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
1086             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1087             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1088
1089             fscal            = felec;
1090
1091             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1092
1093             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1094
1095             /* Update vectorial force */
1096             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
1097             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
1098             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
1099             
1100             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
1101             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
1102             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
1103
1104             }
1105
1106             /**************************
1107              * CALCULATE INTERACTIONS *
1108              **************************/
1109
1110             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
1111             {
1112
1113             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
1114
1115             /* EWALD ELECTROSTATICS */
1116
1117             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1118             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
1119             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1120             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1121             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1122
1123             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1124             ewtabD           = _fjsp_setzero_v2r8();
1125             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1126             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1127             ewtabFn          = _fjsp_setzero_v2r8();
1128             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1129             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1130             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1131             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
1132             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
1133
1134             d                = _fjsp_sub_v2r8(r10,rswitch);
1135             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1136             d2               = _fjsp_mul_v2r8(d,d);
1137             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1138
1139             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1140
1141             /* Evaluate switch function */
1142             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1143             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
1144             velec            = _fjsp_mul_v2r8(velec,sw);
1145             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
1146
1147             /* Update potential sum for this i atom from the interaction with this j atom. */
1148             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
1149             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1150             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1151
1152             fscal            = felec;
1153
1154             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1155
1156             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1157
1158             /* Update vectorial force */
1159             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
1160             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
1161             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
1162             
1163             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
1164             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
1165             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
1166
1167             }
1168
1169             /**************************
1170              * CALCULATE INTERACTIONS *
1171              **************************/
1172
1173             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
1174             {
1175
1176             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
1177
1178             /* EWALD ELECTROSTATICS */
1179
1180             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1181             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
1182             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1183             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1184             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1185
1186             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1187             ewtabD           = _fjsp_setzero_v2r8();
1188             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1189             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1190             ewtabFn          = _fjsp_setzero_v2r8();
1191             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1192             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1193             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1194             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
1195             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
1196
1197             d                = _fjsp_sub_v2r8(r11,rswitch);
1198             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1199             d2               = _fjsp_mul_v2r8(d,d);
1200             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1201
1202             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1203
1204             /* Evaluate switch function */
1205             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1206             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
1207             velec            = _fjsp_mul_v2r8(velec,sw);
1208             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
1209
1210             /* Update potential sum for this i atom from the interaction with this j atom. */
1211             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
1212             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1213             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1214
1215             fscal            = felec;
1216
1217             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1218
1219             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1220
1221             /* Update vectorial force */
1222             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
1223             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
1224             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
1225             
1226             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
1227             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
1228             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
1229
1230             }
1231
1232             /**************************
1233              * CALCULATE INTERACTIONS *
1234              **************************/
1235
1236             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
1237             {
1238
1239             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
1240
1241             /* EWALD ELECTROSTATICS */
1242
1243             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1244             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
1245             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1246             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1247             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1248
1249             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1250             ewtabD           = _fjsp_setzero_v2r8();
1251             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1252             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1253             ewtabFn          = _fjsp_setzero_v2r8();
1254             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1255             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1256             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1257             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
1258             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
1259
1260             d                = _fjsp_sub_v2r8(r12,rswitch);
1261             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1262             d2               = _fjsp_mul_v2r8(d,d);
1263             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1264
1265             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1266
1267             /* Evaluate switch function */
1268             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1269             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
1270             velec            = _fjsp_mul_v2r8(velec,sw);
1271             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
1272
1273             /* Update potential sum for this i atom from the interaction with this j atom. */
1274             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
1275             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1276             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1277
1278             fscal            = felec;
1279
1280             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1281
1282             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1283
1284             /* Update vectorial force */
1285             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
1286             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
1287             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
1288             
1289             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
1290             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
1291             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
1292
1293             }
1294
1295             /**************************
1296              * CALCULATE INTERACTIONS *
1297              **************************/
1298
1299             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
1300             {
1301
1302             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
1303
1304             /* EWALD ELECTROSTATICS */
1305
1306             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1307             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
1308             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1309             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1310             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1311
1312             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1313             ewtabD           = _fjsp_setzero_v2r8();
1314             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1315             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1316             ewtabFn          = _fjsp_setzero_v2r8();
1317             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1318             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1319             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1320             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
1321             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
1322
1323             d                = _fjsp_sub_v2r8(r20,rswitch);
1324             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1325             d2               = _fjsp_mul_v2r8(d,d);
1326             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1327
1328             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1329
1330             /* Evaluate switch function */
1331             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1332             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
1333             velec            = _fjsp_mul_v2r8(velec,sw);
1334             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
1335
1336             /* Update potential sum for this i atom from the interaction with this j atom. */
1337             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
1338             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1339             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1340
1341             fscal            = felec;
1342
1343             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1344
1345             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1346
1347             /* Update vectorial force */
1348             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
1349             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
1350             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
1351             
1352             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
1353             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
1354             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
1355
1356             }
1357
1358             /**************************
1359              * CALCULATE INTERACTIONS *
1360              **************************/
1361
1362             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
1363             {
1364
1365             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
1366
1367             /* EWALD ELECTROSTATICS */
1368
1369             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1370             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
1371             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1372             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1373             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1374
1375             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1376             ewtabD           = _fjsp_setzero_v2r8();
1377             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1378             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1379             ewtabFn          = _fjsp_setzero_v2r8();
1380             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1381             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1382             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1383             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
1384             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
1385
1386             d                = _fjsp_sub_v2r8(r21,rswitch);
1387             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1388             d2               = _fjsp_mul_v2r8(d,d);
1389             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1390
1391             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1392
1393             /* Evaluate switch function */
1394             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1395             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
1396             velec            = _fjsp_mul_v2r8(velec,sw);
1397             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
1398
1399             /* Update potential sum for this i atom from the interaction with this j atom. */
1400             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
1401             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1402             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1403
1404             fscal            = felec;
1405
1406             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1407
1408             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1409
1410             /* Update vectorial force */
1411             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
1412             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
1413             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
1414             
1415             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
1416             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
1417             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
1418
1419             }
1420
1421             /**************************
1422              * CALCULATE INTERACTIONS *
1423              **************************/
1424
1425             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
1426             {
1427
1428             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
1429
1430             /* EWALD ELECTROSTATICS */
1431
1432             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1433             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
1434             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1435             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1436             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1437
1438             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1439             ewtabD           = _fjsp_setzero_v2r8();
1440             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1441             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1442             ewtabFn          = _fjsp_setzero_v2r8();
1443             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1444             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1445             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1446             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
1447             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
1448
1449             d                = _fjsp_sub_v2r8(r22,rswitch);
1450             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1451             d2               = _fjsp_mul_v2r8(d,d);
1452             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1453
1454             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1455
1456             /* Evaluate switch function */
1457             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1458             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
1459             velec            = _fjsp_mul_v2r8(velec,sw);
1460             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
1461
1462             /* Update potential sum for this i atom from the interaction with this j atom. */
1463             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
1464             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1465             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1466
1467             fscal            = felec;
1468
1469             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1470
1471             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1472
1473             /* Update vectorial force */
1474             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
1475             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
1476             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
1477             
1478             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
1479             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
1480             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
1481
1482             }
1483
1484             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1485
1486             /* Inner loop uses 612 flops */
1487         }
1488
1489         /* End of innermost loop */
1490
1491         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1492                                               f+i_coord_offset,fshift+i_shift_offset);
1493
1494         ggid                        = gid[iidx];
1495         /* Update potential energies */
1496         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
1497
1498         /* Increment number of inner iterations */
1499         inneriter                  += j_index_end - j_index_start;
1500
1501         /* Outer loop uses 19 flops */
1502     }
1503
1504     /* Increment number of outer iterations */
1505     outeriter        += nri;
1506
1507     /* Update outer/inner flops */
1508
1509     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*612);
1510 }
1511 /*
1512  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
1513  * Electrostatics interaction: Ewald
1514  * VdW interaction:            None
1515  * Geometry:                   Water3-Water3
1516  * Calculate force/pot:        Force
1517  */
1518 void
1519 nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
1520                     (t_nblist                    * gmx_restrict       nlist,
1521                      rvec                        * gmx_restrict          xx,
1522                      rvec                        * gmx_restrict          ff,
1523                      t_forcerec                  * gmx_restrict          fr,
1524                      t_mdatoms                   * gmx_restrict     mdatoms,
1525                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1526                      t_nrnb                      * gmx_restrict        nrnb)
1527 {
1528     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1529      * just 0 for non-waters.
1530      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
1531      * jnr indices corresponding to data put in the two positions in the SIMD register.
1532      */
1533     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1534     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1535     int              jnrA,jnrB;
1536     int              j_coord_offsetA,j_coord_offsetB;
1537     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1538     real             rcutoff_scalar;
1539     real             *shiftvec,*fshift,*x,*f;
1540     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1541     int              vdwioffset0;
1542     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1543     int              vdwioffset1;
1544     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1545     int              vdwioffset2;
1546     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1547     int              vdwjidx0A,vdwjidx0B;
1548     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1549     int              vdwjidx1A,vdwjidx1B;
1550     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1551     int              vdwjidx2A,vdwjidx2B;
1552     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1553     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1554     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1555     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1556     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1557     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1558     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1559     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1560     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1561     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1562     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
1563     real             *charge;
1564     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1565     real             *ewtab;
1566     _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1567     real             rswitch_scalar,d_scalar;
1568     _fjsp_v2r8       itab_tmp;
1569     _fjsp_v2r8       dummy_mask,cutoff_mask;
1570     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
1571     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
1572     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
1573
1574     x                = xx[0];
1575     f                = ff[0];
1576
1577     nri              = nlist->nri;
1578     iinr             = nlist->iinr;
1579     jindex           = nlist->jindex;
1580     jjnr             = nlist->jjnr;
1581     shiftidx         = nlist->shift;
1582     gid              = nlist->gid;
1583     shiftvec         = fr->shift_vec[0];
1584     fshift           = fr->fshift[0];
1585     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
1586     charge           = mdatoms->chargeA;
1587
1588     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
1589     ewtab            = fr->ic->tabq_coul_FDV0;
1590     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
1591     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
1592
1593     /* Setup water-specific parameters */
1594     inr              = nlist->iinr[0];
1595     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
1596     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
1597     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
1598
1599     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
1600     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
1601     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
1602     qq00             = _fjsp_mul_v2r8(iq0,jq0);
1603     qq01             = _fjsp_mul_v2r8(iq0,jq1);
1604     qq02             = _fjsp_mul_v2r8(iq0,jq2);
1605     qq10             = _fjsp_mul_v2r8(iq1,jq0);
1606     qq11             = _fjsp_mul_v2r8(iq1,jq1);
1607     qq12             = _fjsp_mul_v2r8(iq1,jq2);
1608     qq20             = _fjsp_mul_v2r8(iq2,jq0);
1609     qq21             = _fjsp_mul_v2r8(iq2,jq1);
1610     qq22             = _fjsp_mul_v2r8(iq2,jq2);
1611
1612     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1613     rcutoff_scalar   = fr->rcoulomb;
1614     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
1615     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
1616
1617     rswitch_scalar   = fr->rcoulomb_switch;
1618     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
1619     /* Setup switch parameters */
1620     d_scalar         = rcutoff_scalar-rswitch_scalar;
1621     d                = gmx_fjsp_set1_v2r8(d_scalar);
1622     swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
1623     swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1624     swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1625     swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
1626     swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1627     swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1628
1629     /* Initialize the j indices and coordinate offsets up front to avoid compiler warnings */
1630     jnrA = jnrB = 0;
1631     j_coord_offsetA = 0;
1632     j_coord_offsetB = 0;
1633
1634     outeriter        = 0;
1635     inneriter        = 0;
1636
1637     /* Start outer loop over neighborlists */
1638     for(iidx=0; iidx<nri; iidx++)
1639     {
1640         /* Load shift vector for this list */
1641         i_shift_offset   = DIM*shiftidx[iidx];
1642
1643         /* Load limits for loop over neighbors */
1644         j_index_start    = jindex[iidx];
1645         j_index_end      = jindex[iidx+1];
1646
1647         /* Get outer coordinate index */
1648         inr              = iinr[iidx];
1649         i_coord_offset   = DIM*inr;
1650
1651         /* Load i particle coords and add shift vector */
1652         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
1653                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1654
1655         fix0             = _fjsp_setzero_v2r8();
1656         fiy0             = _fjsp_setzero_v2r8();
1657         fiz0             = _fjsp_setzero_v2r8();
1658         fix1             = _fjsp_setzero_v2r8();
1659         fiy1             = _fjsp_setzero_v2r8();
1660         fiz1             = _fjsp_setzero_v2r8();
1661         fix2             = _fjsp_setzero_v2r8();
1662         fiy2             = _fjsp_setzero_v2r8();
1663         fiz2             = _fjsp_setzero_v2r8();
1664
1665         /* Start inner kernel loop */
1666         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1667         {
1668
1669             /* Get j neighbor index, and coordinate index */
1670             jnrA             = jjnr[jidx];
1671             jnrB             = jjnr[jidx+1];
1672             j_coord_offsetA  = DIM*jnrA;
1673             j_coord_offsetB  = DIM*jnrB;
1674
1675             /* load j atom coordinates */
1676             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
1677                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1678
1679             /* Calculate displacement vector */
1680             dx00             = _fjsp_sub_v2r8(ix0,jx0);
1681             dy00             = _fjsp_sub_v2r8(iy0,jy0);
1682             dz00             = _fjsp_sub_v2r8(iz0,jz0);
1683             dx01             = _fjsp_sub_v2r8(ix0,jx1);
1684             dy01             = _fjsp_sub_v2r8(iy0,jy1);
1685             dz01             = _fjsp_sub_v2r8(iz0,jz1);
1686             dx02             = _fjsp_sub_v2r8(ix0,jx2);
1687             dy02             = _fjsp_sub_v2r8(iy0,jy2);
1688             dz02             = _fjsp_sub_v2r8(iz0,jz2);
1689             dx10             = _fjsp_sub_v2r8(ix1,jx0);
1690             dy10             = _fjsp_sub_v2r8(iy1,jy0);
1691             dz10             = _fjsp_sub_v2r8(iz1,jz0);
1692             dx11             = _fjsp_sub_v2r8(ix1,jx1);
1693             dy11             = _fjsp_sub_v2r8(iy1,jy1);
1694             dz11             = _fjsp_sub_v2r8(iz1,jz1);
1695             dx12             = _fjsp_sub_v2r8(ix1,jx2);
1696             dy12             = _fjsp_sub_v2r8(iy1,jy2);
1697             dz12             = _fjsp_sub_v2r8(iz1,jz2);
1698             dx20             = _fjsp_sub_v2r8(ix2,jx0);
1699             dy20             = _fjsp_sub_v2r8(iy2,jy0);
1700             dz20             = _fjsp_sub_v2r8(iz2,jz0);
1701             dx21             = _fjsp_sub_v2r8(ix2,jx1);
1702             dy21             = _fjsp_sub_v2r8(iy2,jy1);
1703             dz21             = _fjsp_sub_v2r8(iz2,jz1);
1704             dx22             = _fjsp_sub_v2r8(ix2,jx2);
1705             dy22             = _fjsp_sub_v2r8(iy2,jy2);
1706             dz22             = _fjsp_sub_v2r8(iz2,jz2);
1707
1708             /* Calculate squared distance and things based on it */
1709             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
1710             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
1711             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
1712             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
1713             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
1714             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
1715             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
1716             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
1717             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
1718
1719             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
1720             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
1721             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
1722             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
1723             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
1724             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
1725             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
1726             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
1727             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
1728
1729             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
1730             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
1731             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
1732             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
1733             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
1734             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
1735             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
1736             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
1737             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
1738
1739             fjx0             = _fjsp_setzero_v2r8();
1740             fjy0             = _fjsp_setzero_v2r8();
1741             fjz0             = _fjsp_setzero_v2r8();
1742             fjx1             = _fjsp_setzero_v2r8();
1743             fjy1             = _fjsp_setzero_v2r8();
1744             fjz1             = _fjsp_setzero_v2r8();
1745             fjx2             = _fjsp_setzero_v2r8();
1746             fjy2             = _fjsp_setzero_v2r8();
1747             fjz2             = _fjsp_setzero_v2r8();
1748
1749             /**************************
1750              * CALCULATE INTERACTIONS *
1751              **************************/
1752
1753             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
1754             {
1755
1756             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
1757
1758             /* EWALD ELECTROSTATICS */
1759
1760             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1761             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
1762             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1763             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1764             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1765
1766             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1767             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
1768             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1769             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1770             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
1771             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1772             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1773             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1774             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
1775             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
1776
1777             d                = _fjsp_sub_v2r8(r00,rswitch);
1778             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1779             d2               = _fjsp_mul_v2r8(d,d);
1780             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1781
1782             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1783
1784             /* Evaluate switch function */
1785             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1786             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
1787             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
1788
1789             fscal            = felec;
1790
1791             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1792
1793             /* Update vectorial force */
1794             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
1795             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
1796             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
1797             
1798             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
1799             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
1800             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
1801
1802             }
1803
1804             /**************************
1805              * CALCULATE INTERACTIONS *
1806              **************************/
1807
1808             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
1809             {
1810
1811             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
1812
1813             /* EWALD ELECTROSTATICS */
1814
1815             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1816             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
1817             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1818             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1819             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1820
1821             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1822             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
1823             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1824             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1825             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
1826             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1827             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1828             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1829             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
1830             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
1831
1832             d                = _fjsp_sub_v2r8(r01,rswitch);
1833             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1834             d2               = _fjsp_mul_v2r8(d,d);
1835             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1836
1837             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1838
1839             /* Evaluate switch function */
1840             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1841             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv01,_fjsp_mul_v2r8(velec,dsw)) );
1842             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
1843
1844             fscal            = felec;
1845
1846             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1847
1848             /* Update vectorial force */
1849             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
1850             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
1851             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
1852             
1853             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
1854             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
1855             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
1856
1857             }
1858
1859             /**************************
1860              * CALCULATE INTERACTIONS *
1861              **************************/
1862
1863             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
1864             {
1865
1866             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
1867
1868             /* EWALD ELECTROSTATICS */
1869
1870             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1871             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
1872             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1873             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1874             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1875
1876             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1877             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
1878             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1879             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1880             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
1881             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1882             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1883             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1884             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
1885             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
1886
1887             d                = _fjsp_sub_v2r8(r02,rswitch);
1888             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1889             d2               = _fjsp_mul_v2r8(d,d);
1890             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1891
1892             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1893
1894             /* Evaluate switch function */
1895             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1896             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv02,_fjsp_mul_v2r8(velec,dsw)) );
1897             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
1898
1899             fscal            = felec;
1900
1901             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1902
1903             /* Update vectorial force */
1904             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
1905             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
1906             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
1907             
1908             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
1909             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
1910             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
1911
1912             }
1913
1914             /**************************
1915              * CALCULATE INTERACTIONS *
1916              **************************/
1917
1918             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
1919             {
1920
1921             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
1922
1923             /* EWALD ELECTROSTATICS */
1924
1925             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1926             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
1927             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1928             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1929             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1930
1931             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1932             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
1933             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1934             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1935             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
1936             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1937             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1938             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1939             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
1940             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
1941
1942             d                = _fjsp_sub_v2r8(r10,rswitch);
1943             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1944             d2               = _fjsp_mul_v2r8(d,d);
1945             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1946
1947             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1948
1949             /* Evaluate switch function */
1950             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1951             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
1952             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
1953
1954             fscal            = felec;
1955
1956             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1957
1958             /* Update vectorial force */
1959             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
1960             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
1961             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
1962             
1963             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
1964             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
1965             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
1966
1967             }
1968
1969             /**************************
1970              * CALCULATE INTERACTIONS *
1971              **************************/
1972
1973             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
1974             {
1975
1976             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
1977
1978             /* EWALD ELECTROSTATICS */
1979
1980             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1981             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
1982             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1983             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1984             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1985
1986             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1987             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
1988             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1989             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1990             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
1991             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1992             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1993             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1994             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
1995             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
1996
1997             d                = _fjsp_sub_v2r8(r11,rswitch);
1998             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1999             d2               = _fjsp_mul_v2r8(d,d);
2000             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2001
2002             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2003
2004             /* Evaluate switch function */
2005             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2006             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
2007             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
2008
2009             fscal            = felec;
2010
2011             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2012
2013             /* Update vectorial force */
2014             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
2015             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
2016             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
2017             
2018             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
2019             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
2020             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
2021
2022             }
2023
2024             /**************************
2025              * CALCULATE INTERACTIONS *
2026              **************************/
2027
2028             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
2029             {
2030
2031             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
2032
2033             /* EWALD ELECTROSTATICS */
2034
2035             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2036             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
2037             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2038             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2039             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2040
2041             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2042             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
2043             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2044             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2045             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
2046             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2047             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2048             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2049             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
2050             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
2051
2052             d                = _fjsp_sub_v2r8(r12,rswitch);
2053             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2054             d2               = _fjsp_mul_v2r8(d,d);
2055             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2056
2057             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2058
2059             /* Evaluate switch function */
2060             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2061             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
2062             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
2063
2064             fscal            = felec;
2065
2066             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2067
2068             /* Update vectorial force */
2069             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
2070             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
2071             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
2072             
2073             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
2074             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
2075             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
2076
2077             }
2078
2079             /**************************
2080              * CALCULATE INTERACTIONS *
2081              **************************/
2082
2083             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
2084             {
2085
2086             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
2087
2088             /* EWALD ELECTROSTATICS */
2089
2090             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2091             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
2092             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2093             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2094             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2095
2096             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2097             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
2098             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2099             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2100             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
2101             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2102             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2103             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2104             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
2105             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
2106
2107             d                = _fjsp_sub_v2r8(r20,rswitch);
2108             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2109             d2               = _fjsp_mul_v2r8(d,d);
2110             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2111
2112             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2113
2114             /* Evaluate switch function */
2115             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2116             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
2117             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
2118
2119             fscal            = felec;
2120
2121             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2122
2123             /* Update vectorial force */
2124             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
2125             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
2126             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
2127             
2128             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
2129             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
2130             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
2131
2132             }
2133
2134             /**************************
2135              * CALCULATE INTERACTIONS *
2136              **************************/
2137
2138             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
2139             {
2140
2141             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
2142
2143             /* EWALD ELECTROSTATICS */
2144
2145             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2146             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
2147             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2148             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2149             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2150
2151             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2152             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
2153             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2154             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2155             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
2156             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2157             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2158             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2159             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
2160             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
2161
2162             d                = _fjsp_sub_v2r8(r21,rswitch);
2163             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2164             d2               = _fjsp_mul_v2r8(d,d);
2165             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2166
2167             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2168
2169             /* Evaluate switch function */
2170             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2171             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
2172             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
2173
2174             fscal            = felec;
2175
2176             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2177
2178             /* Update vectorial force */
2179             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
2180             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
2181             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
2182             
2183             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
2184             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
2185             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
2186
2187             }
2188
2189             /**************************
2190              * CALCULATE INTERACTIONS *
2191              **************************/
2192
2193             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
2194             {
2195
2196             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
2197
2198             /* EWALD ELECTROSTATICS */
2199
2200             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2201             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
2202             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2203             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2204             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2205
2206             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2207             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
2208             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2209             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2210             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
2211             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2212             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2213             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2214             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
2215             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
2216
2217             d                = _fjsp_sub_v2r8(r22,rswitch);
2218             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2219             d2               = _fjsp_mul_v2r8(d,d);
2220             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2221
2222             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2223
2224             /* Evaluate switch function */
2225             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2226             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
2227             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
2228
2229             fscal            = felec;
2230
2231             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2232
2233             /* Update vectorial force */
2234             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
2235             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
2236             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
2237             
2238             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
2239             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
2240             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
2241
2242             }
2243
2244             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2245
2246             /* Inner loop uses 585 flops */
2247         }
2248
2249         if(jidx<j_index_end)
2250         {
2251
2252             jnrA             = jjnr[jidx];
2253             j_coord_offsetA  = DIM*jnrA;
2254
2255             /* load j atom coordinates */
2256             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
2257                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2258
2259             /* Calculate displacement vector */
2260             dx00             = _fjsp_sub_v2r8(ix0,jx0);
2261             dy00             = _fjsp_sub_v2r8(iy0,jy0);
2262             dz00             = _fjsp_sub_v2r8(iz0,jz0);
2263             dx01             = _fjsp_sub_v2r8(ix0,jx1);
2264             dy01             = _fjsp_sub_v2r8(iy0,jy1);
2265             dz01             = _fjsp_sub_v2r8(iz0,jz1);
2266             dx02             = _fjsp_sub_v2r8(ix0,jx2);
2267             dy02             = _fjsp_sub_v2r8(iy0,jy2);
2268             dz02             = _fjsp_sub_v2r8(iz0,jz2);
2269             dx10             = _fjsp_sub_v2r8(ix1,jx0);
2270             dy10             = _fjsp_sub_v2r8(iy1,jy0);
2271             dz10             = _fjsp_sub_v2r8(iz1,jz0);
2272             dx11             = _fjsp_sub_v2r8(ix1,jx1);
2273             dy11             = _fjsp_sub_v2r8(iy1,jy1);
2274             dz11             = _fjsp_sub_v2r8(iz1,jz1);
2275             dx12             = _fjsp_sub_v2r8(ix1,jx2);
2276             dy12             = _fjsp_sub_v2r8(iy1,jy2);
2277             dz12             = _fjsp_sub_v2r8(iz1,jz2);
2278             dx20             = _fjsp_sub_v2r8(ix2,jx0);
2279             dy20             = _fjsp_sub_v2r8(iy2,jy0);
2280             dz20             = _fjsp_sub_v2r8(iz2,jz0);
2281             dx21             = _fjsp_sub_v2r8(ix2,jx1);
2282             dy21             = _fjsp_sub_v2r8(iy2,jy1);
2283             dz21             = _fjsp_sub_v2r8(iz2,jz1);
2284             dx22             = _fjsp_sub_v2r8(ix2,jx2);
2285             dy22             = _fjsp_sub_v2r8(iy2,jy2);
2286             dz22             = _fjsp_sub_v2r8(iz2,jz2);
2287
2288             /* Calculate squared distance and things based on it */
2289             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
2290             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
2291             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
2292             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
2293             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
2294             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
2295             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
2296             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
2297             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
2298
2299             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
2300             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
2301             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
2302             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
2303             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
2304             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
2305             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
2306             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
2307             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
2308
2309             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
2310             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
2311             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
2312             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
2313             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
2314             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
2315             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
2316             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
2317             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
2318
2319             fjx0             = _fjsp_setzero_v2r8();
2320             fjy0             = _fjsp_setzero_v2r8();
2321             fjz0             = _fjsp_setzero_v2r8();
2322             fjx1             = _fjsp_setzero_v2r8();
2323             fjy1             = _fjsp_setzero_v2r8();
2324             fjz1             = _fjsp_setzero_v2r8();
2325             fjx2             = _fjsp_setzero_v2r8();
2326             fjy2             = _fjsp_setzero_v2r8();
2327             fjz2             = _fjsp_setzero_v2r8();
2328
2329             /**************************
2330              * CALCULATE INTERACTIONS *
2331              **************************/
2332
2333             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
2334             {
2335
2336             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
2337
2338             /* EWALD ELECTROSTATICS */
2339
2340             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2341             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
2342             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2343             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2344             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2345
2346             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2347             ewtabD           = _fjsp_setzero_v2r8();
2348             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2349             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2350             ewtabFn          = _fjsp_setzero_v2r8();
2351             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2352             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2353             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2354             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
2355             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
2356
2357             d                = _fjsp_sub_v2r8(r00,rswitch);
2358             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2359             d2               = _fjsp_mul_v2r8(d,d);
2360             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2361
2362             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2363
2364             /* Evaluate switch function */
2365             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2366             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
2367             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
2368
2369             fscal            = felec;
2370
2371             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2372
2373             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2374
2375             /* Update vectorial force */
2376             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
2377             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
2378             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
2379             
2380             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
2381             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
2382             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
2383
2384             }
2385
2386             /**************************
2387              * CALCULATE INTERACTIONS *
2388              **************************/
2389
2390             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
2391             {
2392
2393             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
2394
2395             /* EWALD ELECTROSTATICS */
2396
2397             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2398             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
2399             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2400             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2401             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2402
2403             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2404             ewtabD           = _fjsp_setzero_v2r8();
2405             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2406             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2407             ewtabFn          = _fjsp_setzero_v2r8();
2408             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2409             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2410             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2411             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
2412             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
2413
2414             d                = _fjsp_sub_v2r8(r01,rswitch);
2415             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2416             d2               = _fjsp_mul_v2r8(d,d);
2417             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2418
2419             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2420
2421             /* Evaluate switch function */
2422             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2423             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv01,_fjsp_mul_v2r8(velec,dsw)) );
2424             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
2425
2426             fscal            = felec;
2427
2428             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2429
2430             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2431
2432             /* Update vectorial force */
2433             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
2434             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
2435             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
2436             
2437             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
2438             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
2439             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
2440
2441             }
2442
2443             /**************************
2444              * CALCULATE INTERACTIONS *
2445              **************************/
2446
2447             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
2448             {
2449
2450             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
2451
2452             /* EWALD ELECTROSTATICS */
2453
2454             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2455             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
2456             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2457             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2458             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2459
2460             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2461             ewtabD           = _fjsp_setzero_v2r8();
2462             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2463             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2464             ewtabFn          = _fjsp_setzero_v2r8();
2465             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2466             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2467             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2468             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
2469             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
2470
2471             d                = _fjsp_sub_v2r8(r02,rswitch);
2472             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2473             d2               = _fjsp_mul_v2r8(d,d);
2474             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2475
2476             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2477
2478             /* Evaluate switch function */
2479             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2480             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv02,_fjsp_mul_v2r8(velec,dsw)) );
2481             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
2482
2483             fscal            = felec;
2484
2485             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2486
2487             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2488
2489             /* Update vectorial force */
2490             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
2491             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
2492             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
2493             
2494             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
2495             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
2496             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
2497
2498             }
2499
2500             /**************************
2501              * CALCULATE INTERACTIONS *
2502              **************************/
2503
2504             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
2505             {
2506
2507             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
2508
2509             /* EWALD ELECTROSTATICS */
2510
2511             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2512             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
2513             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2514             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2515             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2516
2517             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2518             ewtabD           = _fjsp_setzero_v2r8();
2519             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2520             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2521             ewtabFn          = _fjsp_setzero_v2r8();
2522             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2523             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2524             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2525             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
2526             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
2527
2528             d                = _fjsp_sub_v2r8(r10,rswitch);
2529             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2530             d2               = _fjsp_mul_v2r8(d,d);
2531             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2532
2533             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2534
2535             /* Evaluate switch function */
2536             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2537             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
2538             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
2539
2540             fscal            = felec;
2541
2542             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2543
2544             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2545
2546             /* Update vectorial force */
2547             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
2548             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
2549             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
2550             
2551             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
2552             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
2553             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
2554
2555             }
2556
2557             /**************************
2558              * CALCULATE INTERACTIONS *
2559              **************************/
2560
2561             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
2562             {
2563
2564             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
2565
2566             /* EWALD ELECTROSTATICS */
2567
2568             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2569             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
2570             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2571             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2572             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2573
2574             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2575             ewtabD           = _fjsp_setzero_v2r8();
2576             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2577             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2578             ewtabFn          = _fjsp_setzero_v2r8();
2579             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2580             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2581             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2582             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
2583             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
2584
2585             d                = _fjsp_sub_v2r8(r11,rswitch);
2586             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2587             d2               = _fjsp_mul_v2r8(d,d);
2588             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2589
2590             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2591
2592             /* Evaluate switch function */
2593             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2594             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
2595             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
2596
2597             fscal            = felec;
2598
2599             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2600
2601             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2602
2603             /* Update vectorial force */
2604             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
2605             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
2606             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
2607             
2608             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
2609             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
2610             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
2611
2612             }
2613
2614             /**************************
2615              * CALCULATE INTERACTIONS *
2616              **************************/
2617
2618             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
2619             {
2620
2621             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
2622
2623             /* EWALD ELECTROSTATICS */
2624
2625             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2626             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
2627             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2628             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2629             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2630
2631             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2632             ewtabD           = _fjsp_setzero_v2r8();
2633             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2634             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2635             ewtabFn          = _fjsp_setzero_v2r8();
2636             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2637             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2638             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2639             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
2640             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
2641
2642             d                = _fjsp_sub_v2r8(r12,rswitch);
2643             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2644             d2               = _fjsp_mul_v2r8(d,d);
2645             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2646
2647             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2648
2649             /* Evaluate switch function */
2650             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2651             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
2652             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
2653
2654             fscal            = felec;
2655
2656             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2657
2658             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2659
2660             /* Update vectorial force */
2661             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
2662             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
2663             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
2664             
2665             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
2666             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
2667             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
2668
2669             }
2670
2671             /**************************
2672              * CALCULATE INTERACTIONS *
2673              **************************/
2674
2675             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
2676             {
2677
2678             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
2679
2680             /* EWALD ELECTROSTATICS */
2681
2682             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2683             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
2684             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2685             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2686             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2687
2688             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2689             ewtabD           = _fjsp_setzero_v2r8();
2690             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2691             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2692             ewtabFn          = _fjsp_setzero_v2r8();
2693             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2694             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2695             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2696             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
2697             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
2698
2699             d                = _fjsp_sub_v2r8(r20,rswitch);
2700             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2701             d2               = _fjsp_mul_v2r8(d,d);
2702             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2703
2704             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2705
2706             /* Evaluate switch function */
2707             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2708             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
2709             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
2710
2711             fscal            = felec;
2712
2713             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2714
2715             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2716
2717             /* Update vectorial force */
2718             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
2719             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
2720             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
2721             
2722             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
2723             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
2724             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
2725
2726             }
2727
2728             /**************************
2729              * CALCULATE INTERACTIONS *
2730              **************************/
2731
2732             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
2733             {
2734
2735             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
2736
2737             /* EWALD ELECTROSTATICS */
2738
2739             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2740             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
2741             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2742             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2743             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2744
2745             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2746             ewtabD           = _fjsp_setzero_v2r8();
2747             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2748             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2749             ewtabFn          = _fjsp_setzero_v2r8();
2750             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2751             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2752             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2753             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
2754             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
2755
2756             d                = _fjsp_sub_v2r8(r21,rswitch);
2757             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2758             d2               = _fjsp_mul_v2r8(d,d);
2759             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2760
2761             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2762
2763             /* Evaluate switch function */
2764             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2765             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
2766             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
2767
2768             fscal            = felec;
2769
2770             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2771
2772             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2773
2774             /* Update vectorial force */
2775             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
2776             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
2777             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
2778             
2779             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
2780             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
2781             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
2782
2783             }
2784
2785             /**************************
2786              * CALCULATE INTERACTIONS *
2787              **************************/
2788
2789             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
2790             {
2791
2792             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
2793
2794             /* EWALD ELECTROSTATICS */
2795
2796             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2797             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
2798             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2799             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2800             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2801
2802             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2803             ewtabD           = _fjsp_setzero_v2r8();
2804             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2805             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2806             ewtabFn          = _fjsp_setzero_v2r8();
2807             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2808             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2809             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2810             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
2811             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
2812
2813             d                = _fjsp_sub_v2r8(r22,rswitch);
2814             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2815             d2               = _fjsp_mul_v2r8(d,d);
2816             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2817
2818             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2819
2820             /* Evaluate switch function */
2821             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2822             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
2823             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
2824
2825             fscal            = felec;
2826
2827             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2828
2829             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2830
2831             /* Update vectorial force */
2832             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
2833             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
2834             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
2835             
2836             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
2837             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
2838             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
2839
2840             }
2841
2842             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2843
2844             /* Inner loop uses 585 flops */
2845         }
2846
2847         /* End of innermost loop */
2848
2849         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2850                                               f+i_coord_offset,fshift+i_shift_offset);
2851
2852         /* Increment number of inner iterations */
2853         inneriter                  += j_index_end - j_index_start;
2854
2855         /* Outer loop uses 18 flops */
2856     }
2857
2858     /* Increment number of outer iterations */
2859     outeriter        += nri;
2860
2861     /* Update outer/inner flops */
2862
2863     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*585);
2864 }