/*
 *                 This source code is part of
 *
 *                        G   R   O   M   A   C   S
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2009, The GROMACS Development Team
 *
 * Gromacs is a library for molecular simulation and trajectory analysis,
 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 * a full list of developers and information, check out http://www.gromacs.org
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option) any
 * later version.
 *
 * As a special exception, you may use this file as part of a free software
 * library without restriction. Specifically, if other files instantiate
 * templates or use macros or inline functions from this file, or you compile
 * this file and link it with other files to produce an executable, this
 * file does not by itself cause the resulting executable to be covered by
 * the GNU Lesser General Public License.
 *
 * In plain-speak: do not worry about classes/macros/templates either - only
 * changes to the library have to be LGPL, not an application linking with it.
 *
 * To help fund GROMACS development, we humbly ask that you cite
 * the papers people have written on it - you can find them on the website!
 */
#include "nb_kernel_allvsall_sse2_single.h"
#include "gmx_x86_sse2.h"

#include <emmintrin.h>
#define SIMD_WIDTH 4
#define UNROLLI    4
#define UNROLLJ    4

typedef struct
{
    real *     x_align;
    real *     y_align;
    real *     z_align;
    real *     q_align;
    real *     fx_align;
    real *     fy_align;
    real *     fz_align;
    real **    pvdwaram_align;
    real **    ppvdw;
    int *      jindex;
    int *      imask;
    int **     prologue_mask;
    int **     epilogue_mask;
} gmx_allvsall_data_t;
static int
calc_maxoffset(int i,int natoms)
{
    int maxoffset;

    if ((natoms % 2) == 1)
    {
        /* Odd number of atoms, easy */
        maxoffset = natoms/2;
    }
    else if ((natoms % 4) == 0)
    {
        /* Multiple of four is hard: alternate with the parity of i, and
         * invert the pattern in the upper half, so every pair is counted once.
         */
        if (i < natoms/2)
        {
            maxoffset = ((i % 2) == 0) ? natoms/2 : natoms/2-1;
        }
        else
        {
            maxoffset = ((i % 2) == 1) ? natoms/2 : natoms/2-1;
        }
    }
    else
    {
        /* natoms even, but natoms/2 odd: alternate with the parity of i */
        maxoffset = ((i % 2) == 0) ? natoms/2 : natoms/2-1;
    }

    return maxoffset;
}
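/* Worked example (illustrative): for natoms==9 (odd), every atom gets
 * maxoffset = 9/2 = 4, so atom i interacts with j = i+1..i+4 in the
 * duplicated coordinate array. Summed over all i that is 9*4 = 36 pairs,
 * exactly the 9*8/2 unique pairs, each counted once. For even natoms the
 * parity rules above split the extra half-shell between the atoms so the
 * total is again natoms*(natoms-1)/2.
 */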
static void
setup_exclusions_and_indices_float(gmx_allvsall_data_t *   aadata,
                                   t_blocka *              excl,
                                   int                     start,
                                   int                     end,
                                   int                     natoms)
{
    int  i,j,k,ibase;
    int  ni0,ni1,nj0,nj1,nj;
    int  imin,imax,max_offset,max_excl_offset;
    int  firstinteraction[UNROLLI];
    int *pi;
    /* This routine can appear to be a bit complex, but it is mostly book-keeping.
     * To enable the fast all-vs-all kernel we need to be able to stream through all coordinates
     * whether they should interact or not.
     *
     * Since we have already made an extra copy of the coordinates from natoms to 2*natoms-1,
     * atom i should interact with j-atoms i+1 <= j < i+1+maxoffset, unless they are excluded.
     * maxoffset is typically natoms/2, or natoms/2-1 when natoms is even and i>=natoms/2
     * (the last is to make sure we only include i-j, and not also j-i interactions).
     *
     * The exclusions/inclusions are handled by a mask, set to 0xFFFFFFFF when the interaction is
     * included, and 0 otherwise.
     *
     * Thus, we will have exclusion masks for:
     *
     * 1) j-atoms with j <= i
     * 2) Explicitly excluded atoms
     * 3) j-atoms with j > i+maxoffset
     *
     * For any normal/sane molecule, there will only be explicit exclusions for j=i+n, with n quite small (1..10).
     * By calculating this range, we can split the kernel into three parts:
     *
     * A) A prologue, where we have exclusion masks to check for all three types of exclusions
     *    (for very small systems this is the only executed part)
     * B) A main part, where no atoms are excluded
     * C) An epilogue, where we only take the last type of exclusion into account
     *
     * So, this means we need two exclusion mask lists:
     *
     * - One for the first few atoms above i (up to the max explicit exclusion range)
     * - One for the last few atoms around i+maxoffset
     *
     * Since the SIMD code needs to load aligned coordinates, the masks too will be aligned, i.e. the mask for atom
     * 5 will really start on atom 4. In addition, the kernel will be unrolled both in I and J (think 4*4 tiles),
     * so we should have groups of 4 i-atoms whose exclusion masks start at the same offset and have the same length.
     */
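    /* For orientation, a scalar sketch of the loop structure these indices
     * and masks implement (illustrative only; interact() is a placeholder,
     * and jindex holds the four phase boundaries per i atom, as in the
     * SIMD kernel below):
     *
     *   for (j = jindex[4*i]; j < jindex[4*i+1]; j++)    // A: prologue, masked
     *       if (prologue_mask[i][j - jindex[4*i]])
     *           interact(i, j);
     *   for (j = jindex[4*i+1]; j < jindex[4*i+2]; j++)  // B: main, no masks
     *       interact(i, j);
     *   for (j = jindex[4*i+2]; j < jindex[4*i+3]; j++)  // C: epilogue, masked
     *       if (epilogue_mask[i][j - jindex[4*i+2]])
     *           interact(i, j);
     */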
    /* First create a simple mask to say which i atoms should be included. This is useful when the start/end positions
     * are not multiples of UNROLLI.
     */

    /* Example: if start=5 and end=17, we will get ni0=4 and ni1=20.
     * The i loop will thus also go over atoms 4, 17, 18, and 19 in addition to the ones we want to include.
     */
    ni0 = (start/UNROLLI)*UNROLLI;
    ni1 = ((end+UNROLLI-1)/UNROLLI)*UNROLLI;

    /* Set the interaction mask to only enable the i atoms we want to include */
    snew(pi,natoms+UNROLLI+2*SIMD_WIDTH);
    aadata->imask = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
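    /* snew() gives no alignment guarantee, so we over-allocate and round the
     * pointer up: adding 16 and masking off the low four bits yields an
     * address p' with pi < p' <= pi+16 and p' % 16 == 0, as required by the
     * aligned SSE loads/stores. The same idiom is used for every aligned
     * buffer in this file.
     */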
    for(i=0;i<natoms+UNROLLI;i++)
    {
        aadata->imask[i] = (i>=start && i<end) ? 0xFFFFFFFF : 0;
    }
    /* Allocate memory for our modified jindex array */
    snew(aadata->jindex,4*(natoms+UNROLLI));
    for(i=0;i<4*(natoms+UNROLLI);i++)
    {
        aadata->jindex[i] = 0;
    }
    /* Create the exclusion masks for the prologue part */
    snew(aadata->prologue_mask,natoms+UNROLLI); /* list of pointers */

    /* First zero everything to avoid uninitialized data */
    for(i=0;i<natoms+UNROLLI;i++)
    {
        aadata->prologue_mask[i] = NULL;
    }
    /* Calculate the largest exclusion range we need for each UNROLLI-tuplet of i atoms. */
    for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
    {
        max_excl_offset = -1;

        /* First find maxoffset for the next 4 atoms (or fewer if we are close to the end) */
        imax = ((ibase+UNROLLI) < end) ? (ibase+UNROLLI) : end;

        for(i=ibase;i<imax;i++)
        {
            /* Before exclusions, which atom is the first we (might) interact with? */
            firstinteraction[i-ibase] = i+1;
            max_offset = calc_maxoffset(i,natoms);

            nj0 = excl->index[i];
            nj1 = excl->index[i+1];
            for(j=nj0; j<nj1; j++)
            {
                /* Exclusions beyond the interaction range never enter the kernel */
                if(excl->a[j]>i+max_offset)
                {
                    continue;
                }

                k = excl->a[j] - ibase;

                /* If the excluded atom sits below the window, use its wrapped copy */
                if( k+natoms <= max_offset )
                {
                    k += natoms;
                }

                max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;

                /* Exclusions are sorted, so this can be done iteratively */
                if(excl->a[j] == firstinteraction[i-ibase])
                {
                    firstinteraction[i-ibase]++;
                }
            }
        }
        /* Round up to the j unrolling factor */
        max_excl_offset = (max_excl_offset/UNROLLJ+1)*UNROLLJ;
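        /* Example: with UNROLLJ==4 an exclusion offset of 5 rounds up to 8,
         * and an already-aligned 8 rounds up to 12. Rounding up too far is
         * harmless: it only extends the masked prologue region, and the mask
         * contents for those extra j atoms are computed exactly below.
         */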
        imin = firstinteraction[0];
        for(i=ibase;i<imax;i++)
        {
            imin = (imin < firstinteraction[i-ibase]) ? imin : firstinteraction[i-ibase];
        }
        imin = (imin/UNROLLJ)*UNROLLJ;
        /* Set all the prologue mask lengths to this value (even for i>end) */
        for(i=ibase;i<ibase+UNROLLI;i++)
        {
            aadata->jindex[4*i]   = imin;
            aadata->jindex[4*i+1] = ibase+max_excl_offset;
        }
    }
    /* Now the hard part: loop over it all again to calculate the actual contents of the prologue masks */
    for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
    {
        for(i=ibase;i<ibase+UNROLLI;i++)
        {
            nj   = aadata->jindex[4*i+1] - aadata->jindex[4*i];
            imin = aadata->jindex[4*i];

            /* Allocate aligned memory */
            snew(pi,nj+2*SIMD_WIDTH);
            aadata->prologue_mask[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
            /* If natoms is odd, maxoffset=natoms/2.
             * If natoms is even, maxoffset=natoms/2 for even atoms, natoms/2-1 for odd atoms.
             */
            max_offset = calc_maxoffset(i,natoms);

            /* Include interactions i+1 <= j <= i+maxoffset */
            for(k=0;k<nj;k++)
            {
                j = imin + k;

                if( (j>i) && (j<=i+max_offset) )
                {
                    aadata->prologue_mask[i][k] = 0xFFFFFFFF;
                }
                else
                {
                    aadata->prologue_mask[i][k] = 0;
                }
            }
            /* Clear out the explicit exclusions */
            if(i < natoms)
            {
                nj0 = excl->index[i];
                nj1 = excl->index[i+1];
                for(j=nj0; j<nj1; j++)
                {
                    if(excl->a[j]>i+max_offset)
                    {
                        continue;
                    }

                    /* Mask entry k corresponds to atom imin+k */
                    k = excl->a[j] - imin;

                    if( k+natoms <= max_offset )
                    {
                        k += natoms;
                    }

                    if( k >= 0 && k < nj )
                    {
                        aadata->prologue_mask[i][k] = 0;
                    }
                }
            }
        }
    }
    /* Construct the epilogue mask - this just contains the check for maxoffset */
    snew(aadata->epilogue_mask,natoms+UNROLLI);

    /* First zero everything to avoid uninitialized data */
    for(i=0;i<natoms+UNROLLI;i++)
    {
        aadata->jindex[4*i+2]    = aadata->jindex[4*i+1];
        aadata->jindex[4*i+3]    = aadata->jindex[4*i+1];
        aadata->epilogue_mask[i] = NULL;
    }
    for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
    {
        /* Find the lowest index for which we need to use the epilogue */
        imin       = ibase;
        max_offset = calc_maxoffset(imin,natoms);
        imin       = imin + 1 + max_offset;

        /* Find the highest index for which we need to use the epilogue */
        imax = ibase + UNROLLI-1;
        imax = (imax < end) ? imax : end;

        /* imax can be either odd or even */
        max_offset = calc_maxoffset(imax,natoms);
        imax       = imax + 1 + max_offset + UNROLLJ - 1;
        for(i=ibase;i<ibase+UNROLLI;i++)
        {
            /* Start of epilogue - round down to j tile limit */
            aadata->jindex[4*i+2] = (imin/UNROLLJ)*UNROLLJ;
            /* Make sure we don't overlap - for small systems everything is done in the prologue */
            aadata->jindex[4*i+2] = (aadata->jindex[4*i+1] > aadata->jindex[4*i+2]) ? aadata->jindex[4*i+1] : aadata->jindex[4*i+2];
            /* End of epilogue - imax already includes the +UNROLLJ-1, so this rounds up to a full j tile */
            aadata->jindex[4*i+3] = (imax/UNROLLJ)*UNROLLJ;
            /* Make sure we don't have a negative range for the epilogue */
            aadata->jindex[4*i+3] = (aadata->jindex[4*i+2] > aadata->jindex[4*i+3]) ? aadata->jindex[4*i+2] : aadata->jindex[4*i+3];
        }
    }
    /* And fill it with data... */
    for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
    {
        for(i=ibase;i<ibase+UNROLLI;i++)
        {
            nj = aadata->jindex[4*i+3] - aadata->jindex[4*i+2];

            /* Allocate aligned memory */
            snew(pi,nj+2*SIMD_WIDTH);
            aadata->epilogue_mask[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));

            max_offset = calc_maxoffset(i,natoms);

            for(k=0;k<nj;k++)
            {
                j = aadata->jindex[4*i+2] + k;
                aadata->epilogue_mask[i][k] = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
            }
        }
    }
}
static void
setup_aadata(gmx_allvsall_data_t **   p_aadata,
             t_blocka *               excl,
             int                      start,
             int                      end,
             int                      natoms,
             int *                    type,
             int                      ntype,
             real *                   pvdwaram)
{
    int                  i,j,k,idx;
    gmx_allvsall_data_t *aadata;
    real                *pr;
    real                *p;
    real                *c6tmp,*c12tmp;

    snew(aadata,1);
    *p_aadata = aadata;
    snew(pr,2*natoms+2*SIMD_WIDTH);
    aadata->x_align = (real *) (((size_t) pr + 16) & (~((size_t) 15)));
    snew(pr,2*natoms+2*SIMD_WIDTH);
    aadata->y_align = (real *) (((size_t) pr + 16) & (~((size_t) 15)));
    snew(pr,2*natoms+2*SIMD_WIDTH);
    aadata->z_align = (real *) (((size_t) pr + 16) & (~((size_t) 15)));
    snew(pr,2*natoms+2*SIMD_WIDTH);
    aadata->fx_align = (real *) (((size_t) pr + 16) & (~((size_t) 15)));
    snew(pr,2*natoms+2*SIMD_WIDTH);
    aadata->fy_align = (real *) (((size_t) pr + 16) & (~((size_t) 15)));
    snew(pr,2*natoms+2*SIMD_WIDTH);
    aadata->fz_align = (real *) (((size_t) pr + 16) & (~((size_t) 15)));
    snew(pr,2*natoms+2*SIMD_WIDTH);
    aadata->q_align = (real *) (((size_t) pr + 16) & (~((size_t) 15)));
    for(i=0;i<2*natoms+SIMD_WIDTH;i++)
    {
        aadata->x_align[i]  = 0.0;
        aadata->y_align[i]  = 0.0;
        aadata->z_align[i]  = 0.0;
        aadata->q_align[i]  = 0.0;
        aadata->fx_align[i] = 0.0;
        aadata->fy_align[i] = 0.0;
        aadata->fz_align[i] = 0.0;
    }
    /* Generate vdw params */
    snew(aadata->pvdwaram_align,ntype);
    snew(c6tmp,2*natoms+SIMD_WIDTH);
    snew(c12tmp,2*natoms+SIMD_WIDTH);

    for(i=0;i<ntype;i++)
    {
        /* Note that this 4 does NOT refer to SIMD_WIDTH, but to c6 & c12 params for 2*natoms! */
        snew(pr,4*natoms+4*SIMD_WIDTH);
        aadata->pvdwaram_align[i] = (real *) (((size_t) pr + 16) & (~((size_t) 15)));
        p = aadata->pvdwaram_align[i];
        /* Let's keep it simple and use multiple steps - first create temporary c6/c12 arrays */
        for(j=0;j<natoms;j++)
        {
            idx              = i*ntype+type[j];
            c6tmp[j]         = pvdwaram[2*idx];
            c12tmp[j]        = pvdwaram[2*idx+1];
            c6tmp[natoms+j]  = c6tmp[j];
            c12tmp[natoms+j] = c12tmp[j];
        }
        /* Zero the padding so the unused SIMD lanes cannot contribute */
        for(j=2*natoms;j<2*natoms+SIMD_WIDTH;j++)
        {
            c6tmp[j]  = 0.0;
            c12tmp[j] = 0.0;
        }
        /* Merge into a single array: c6,c6,c6,c6,c12,c12,c12,c12,c6,c6,c6,c6,c12,c12,c12,c12,etc. */
        for(j=0;j<2*natoms;j+=UNROLLJ)
        {
            for(k=0;k<UNROLLJ;k++)
            {
                idx = j+k;
                p[2*j+k]         = c6tmp[idx];
                p[2*j+UNROLLJ+k] = c12tmp[idx];
            }
        }
    }
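    /* Resulting layout of each pvdwaram_align[i] (with UNROLLJ==4):
     *
     *   p[0..3]  = c6  for j atoms 0..3     p[4..7]   = c12 for j atoms 0..3
     *   p[8..11] = c6  for j atoms 4..7     p[12..15] = c12 for j atoms 4..7
     *   ...
     *
     * so for a tile starting at atom j, the four c6 values sit at p+2*j and
     * the four c12 values at p+2*j+UNROLLJ - exactly the two aligned loads
     * the kernel issues per i atom below.
     */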
    snew(aadata->ppvdw,natoms+UNROLLI);
    for(i=0;i<natoms;i++)
    {
        aadata->ppvdw[i] = aadata->pvdwaram_align[type[i]];
    }
    for(i=natoms;i<natoms+UNROLLI;i++)
    {
        aadata->ppvdw[i] = aadata->pvdwaram_align[0];
    }

    sfree(c6tmp);
    sfree(c12tmp);

    setup_exclusions_and_indices_float(aadata,excl,start,end,natoms);
}
void
nb_kernel_allvsall_sse2_single(t_forcerec *           fr,
                               t_mdatoms *            mdatoms,
                               t_blocka *             excl,
                               real *                 x,
                               real *                 f,
                               real *                 Vc,
                               real *                 Vvdw,
                               int *                  outeriter,
                               int *                  inneriter,
                               void *                 work)
{
    int                  i,j,k;
    int                  natoms,ni0,ni1;
    int                  nj0,nj1,nj2,nj3;
    int                  ggid;
    real                *charge;
    int                 *type;
    real                 facel;
    real                *x_align,*y_align,*z_align,*q_align;
    real                *fx_align,*fy_align,*fz_align;
    real               **pvdwaram_align;
    real               **ppvdw;
    real                *pvdw0,*pvdw1,*pvdw2,*pvdw3;
    real                 tmpsum[4];
    gmx_allvsall_data_t *aadata;
    int                 *jindex;
    int                 *imask;
    int                **prologue_mask;
    int                **epilogue_mask;
    int                 *pmask0,*pmask1,*pmask2,*pmask3;
    int                 *emask0,*emask1,*emask2,*emask3;
    __m128               ix_SSE0,iy_SSE0,iz_SSE0;
    __m128               ix_SSE1,iy_SSE1,iz_SSE1;
    __m128               ix_SSE2,iy_SSE2,iz_SSE2;
    __m128               ix_SSE3,iy_SSE3,iz_SSE3;
    __m128               fix_SSE0,fiy_SSE0,fiz_SSE0;
    __m128               fix_SSE1,fiy_SSE1,fiz_SSE1;
    __m128               fix_SSE2,fiy_SSE2,fiz_SSE2;
    __m128               fix_SSE3,fiy_SSE3,fiz_SSE3;
    __m128               fjxSSE,fjySSE,fjzSSE;
    __m128               jxSSE,jySSE,jzSSE,jqSSE;
    __m128               dx_SSE0,dy_SSE0,dz_SSE0;
    __m128               dx_SSE1,dy_SSE1,dz_SSE1;
    __m128               dx_SSE2,dy_SSE2,dz_SSE2;
    __m128               dx_SSE3,dy_SSE3,dz_SSE3;
    __m128               tx_SSE0,ty_SSE0,tz_SSE0;
    __m128               tx_SSE1,ty_SSE1,tz_SSE1;
    __m128               tx_SSE2,ty_SSE2,tz_SSE2;
    __m128               tx_SSE3,ty_SSE3,tz_SSE3;
    __m128               rsq_SSE0,rinv_SSE0,rinvsq_SSE0,rinvsix_SSE0;
    __m128               rsq_SSE1,rinv_SSE1,rinvsq_SSE1,rinvsix_SSE1;
    __m128               rsq_SSE2,rinv_SSE2,rinvsq_SSE2,rinvsix_SSE2;
    __m128               rsq_SSE3,rinv_SSE3,rinvsq_SSE3,rinvsix_SSE3;
    __m128               qq_SSE0,iq_SSE0;
    __m128               qq_SSE1,iq_SSE1;
    __m128               qq_SSE2,iq_SSE2;
    __m128               qq_SSE3,iq_SSE3;
    __m128               vcoul_SSE0,Vvdw6_SSE0,Vvdw12_SSE0,fscal_SSE0;
    __m128               vcoul_SSE1,Vvdw6_SSE1,Vvdw12_SSE1,fscal_SSE1;
    __m128               vcoul_SSE2,Vvdw6_SSE2,Vvdw12_SSE2,fscal_SSE2;
    __m128               vcoul_SSE3,Vvdw6_SSE3,Vvdw12_SSE3,fscal_SSE3;
    __m128               c6_SSE0,c12_SSE0;
    __m128               c6_SSE1,c12_SSE1;
    __m128               c6_SSE2,c12_SSE2;
    __m128               c6_SSE3,c12_SSE3;
    __m128               vctotSSE,VvdwtotSSE;
    __m128               sixSSE,twelveSSE;
    __m128i              ikSSE,imSSE,ifourSSE,ioneSSE;
    __m128               imask_SSE0,imask_SSE1,imask_SSE2,imask_SSE3;
    __m128               jmask_SSE0,jmask_SSE1,jmask_SSE2,jmask_SSE3;
    charge = mdatoms->chargeA;
    type   = mdatoms->typeA;
    facel  = fr->epsfac;

    natoms = mdatoms->nr;
    ni0    = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
    ni1    = mdatoms->start+mdatoms->homenr;
    sixSSE    = _mm_set1_ps(6.0);
    twelveSSE = _mm_set1_ps(12.0);
    ifourSSE  = _mm_set1_epi32(4);
    ioneSSE   = _mm_set1_epi32(1);
    aadata = *((gmx_allvsall_data_t **)work);

    /* The first time we come here, allocate and set up the work data */
    if(aadata == NULL)
    {
        setup_aadata(&aadata,excl,mdatoms->start,mdatoms->start+mdatoms->homenr,natoms,type,fr->ntype,fr->nbfp);
        *((gmx_allvsall_data_t **)work) = aadata;
    }
    x_align  = aadata->x_align;
    y_align  = aadata->y_align;
    z_align  = aadata->z_align;
    fx_align = aadata->fx_align;
    fy_align = aadata->fy_align;
    fz_align = aadata->fz_align;
    q_align  = aadata->q_align;
    pvdwaram_align = aadata->pvdwaram_align;
    ppvdw    = aadata->ppvdw;

    prologue_mask = aadata->prologue_mask;
    epilogue_mask = aadata->epilogue_mask;
    jindex        = aadata->jindex;
    imask         = aadata->imask;
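    /* Fill the aligned work arrays. Coordinates and charges are duplicated
     * (k wraps back to the start below) so the j range i+1..i+maxoffset is
     * always contiguous and never wraps around, which lets every j tile be
     * read with plain aligned loads.
     */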
    for(i=ni0;i<ni1+1+natoms/2;i++)
    {
        k = i % natoms;
        x_align[i]  = x[3*k];
        y_align[i]  = x[3*k+1];
        z_align[i]  = x[3*k+2];
        q_align[i]  = charge[k];
        fx_align[i] = 0;
        fy_align[i] = 0;
        fz_align[i] = 0;
    }
    for(i=ni0; i<ni1; i+=UNROLLI)
    {
        /* We assume shifts are NOT used for all-vs-all interactions */

        /* Load i atom data */
        ix_SSE0 = _mm_load1_ps(x_align+i);
        ix_SSE1 = _mm_load1_ps(x_align+i+1);
        ix_SSE2 = _mm_load1_ps(x_align+i+2);
        ix_SSE3 = _mm_load1_ps(x_align+i+3);
        iy_SSE0 = _mm_load1_ps(y_align+i);
        iy_SSE1 = _mm_load1_ps(y_align+i+1);
        iy_SSE2 = _mm_load1_ps(y_align+i+2);
        iy_SSE3 = _mm_load1_ps(y_align+i+3);
        iz_SSE0 = _mm_load1_ps(z_align+i);
        iz_SSE1 = _mm_load1_ps(z_align+i+1);
        iz_SSE2 = _mm_load1_ps(z_align+i+2);
        iz_SSE3 = _mm_load1_ps(z_align+i+3);
        iq_SSE0 = _mm_set1_ps(facel*q_align[i]);
        iq_SSE1 = _mm_set1_ps(facel*q_align[i+1]);
        iq_SSE2 = _mm_set1_ps(facel*q_align[i+2]);
        iq_SSE3 = _mm_set1_ps(facel*q_align[i+3]);

        pvdw0 = ppvdw[i];
        pvdw1 = ppvdw[i+1];
        pvdw2 = ppvdw[i+2];
        pvdw3 = ppvdw[i+3];
        /* Zero the potential energy for this list */
        VvdwtotSSE = _mm_setzero_ps();
        vctotSSE   = _mm_setzero_ps();

        /* Clear i atom forces */
        fix_SSE0 = _mm_setzero_ps();
        fix_SSE1 = _mm_setzero_ps();
        fix_SSE2 = _mm_setzero_ps();
        fix_SSE3 = _mm_setzero_ps();
        fiy_SSE0 = _mm_setzero_ps();
        fiy_SSE1 = _mm_setzero_ps();
        fiy_SSE2 = _mm_setzero_ps();
        fiy_SSE3 = _mm_setzero_ps();
        fiz_SSE0 = _mm_setzero_ps();
        fiz_SSE1 = _mm_setzero_ps();
        fiz_SSE2 = _mm_setzero_ps();
        fiz_SSE3 = _mm_setzero_ps();
        /* Load limits for the loop over neighbors */
        nj0 = jindex[4*i];
        nj1 = jindex[4*i+1];
        nj2 = jindex[4*i+2];
        nj3 = jindex[4*i+3];

        pmask0 = prologue_mask[i];
        pmask1 = prologue_mask[i+1];
        pmask2 = prologue_mask[i+2];
        pmask3 = prologue_mask[i+3];
        emask0 = epilogue_mask[i];
        emask1 = epilogue_mask[i+1];
        emask2 = epilogue_mask[i+2];
        emask3 = epilogue_mask[i+3];
        imask_SSE0 = _mm_load1_ps((real *)(imask+i));
        imask_SSE1 = _mm_load1_ps((real *)(imask+i+1));
        imask_SSE2 = _mm_load1_ps((real *)(imask+i+2));
        imask_SSE3 = _mm_load1_ps((real *)(imask+i+3));
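        /* The interaction masks are integer bit patterns (all bits set or all
         * bits zero), so loading them with single-precision loads is safe:
         * only the bit pattern matters, and they are applied exclusively with
         * _mm_and_ps to zero out excluded lanes.
         */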
        for(j=nj0; j<nj1; j+=UNROLLJ)
        {
            jmask_SSE0 = _mm_load_ps((real *)pmask0);
            jmask_SSE1 = _mm_load_ps((real *)pmask1);
            jmask_SSE2 = _mm_load_ps((real *)pmask2);
            jmask_SSE3 = _mm_load_ps((real *)pmask3);
            pmask0 += UNROLLJ;
            pmask1 += UNROLLJ;
            pmask2 += UNROLLJ;
            pmask3 += UNROLLJ;
            /* load j atom coordinates */
            jxSSE = _mm_load_ps(x_align+j);
            jySSE = _mm_load_ps(y_align+j);
            jzSSE = _mm_load_ps(z_align+j);

            /* Calculate distance */
            dx_SSE0 = _mm_sub_ps(ix_SSE0,jxSSE);
            dy_SSE0 = _mm_sub_ps(iy_SSE0,jySSE);
            dz_SSE0 = _mm_sub_ps(iz_SSE0,jzSSE);
            dx_SSE1 = _mm_sub_ps(ix_SSE1,jxSSE);
            dy_SSE1 = _mm_sub_ps(iy_SSE1,jySSE);
            dz_SSE1 = _mm_sub_ps(iz_SSE1,jzSSE);
            dx_SSE2 = _mm_sub_ps(ix_SSE2,jxSSE);
            dy_SSE2 = _mm_sub_ps(iy_SSE2,jySSE);
            dz_SSE2 = _mm_sub_ps(iz_SSE2,jzSSE);
            dx_SSE3 = _mm_sub_ps(ix_SSE3,jxSSE);
            dy_SSE3 = _mm_sub_ps(iy_SSE3,jySSE);
            dz_SSE3 = _mm_sub_ps(iz_SSE3,jzSSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
            rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
            rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
            rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);

            jmask_SSE0 = _mm_and_ps(jmask_SSE0,imask_SSE0);
            jmask_SSE1 = _mm_and_ps(jmask_SSE1,imask_SSE1);
            jmask_SSE2 = _mm_and_ps(jmask_SSE2,imask_SSE2);
            jmask_SSE3 = _mm_and_ps(jmask_SSE3,imask_SSE3);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0);
            rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1);
            rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2);
            rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3);

            rinv_SSE0 = _mm_and_ps(rinv_SSE0,jmask_SSE0);
            rinv_SSE1 = _mm_and_ps(rinv_SSE1,jmask_SSE1);
            rinv_SSE2 = _mm_and_ps(rinv_SSE2,jmask_SSE2);
            rinv_SSE3 = _mm_and_ps(rinv_SSE3,jmask_SSE3);
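            /* Masking 1/r (rather than the final forces) zeroes every energy
             * and force term derived from it below. This also kills the
             * self-interaction lane, where rsq==0 would otherwise give an
             * infinite 1/r from the reciprocal square root.
             */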
            /* Load parameters for j atom */
            jqSSE   = _mm_load_ps(q_align+j);
            qq_SSE0 = _mm_mul_ps(iq_SSE0,jqSSE);
            qq_SSE1 = _mm_mul_ps(iq_SSE1,jqSSE);
            qq_SSE2 = _mm_mul_ps(iq_SSE2,jqSSE);
            qq_SSE3 = _mm_mul_ps(iq_SSE3,jqSSE);

            c6_SSE0  = _mm_load_ps(pvdw0+2*j);
            c6_SSE1  = _mm_load_ps(pvdw1+2*j);
            c6_SSE2  = _mm_load_ps(pvdw2+2*j);
            c6_SSE3  = _mm_load_ps(pvdw3+2*j);
            c12_SSE0 = _mm_load_ps(pvdw0+2*j+UNROLLJ);
            c12_SSE1 = _mm_load_ps(pvdw1+2*j+UNROLLJ);
            c12_SSE2 = _mm_load_ps(pvdw2+2*j+UNROLLJ);
            c12_SSE3 = _mm_load_ps(pvdw3+2*j+UNROLLJ);
            rinvsq_SSE0 = _mm_mul_ps(rinv_SSE0,rinv_SSE0);
            rinvsq_SSE1 = _mm_mul_ps(rinv_SSE1,rinv_SSE1);
            rinvsq_SSE2 = _mm_mul_ps(rinv_SSE2,rinv_SSE2);
            rinvsq_SSE3 = _mm_mul_ps(rinv_SSE3,rinv_SSE3);

            /* Coulomb interaction */
            vcoul_SSE0 = _mm_mul_ps(qq_SSE0,rinv_SSE0);
            vcoul_SSE1 = _mm_mul_ps(qq_SSE1,rinv_SSE1);
            vcoul_SSE2 = _mm_mul_ps(qq_SSE2,rinv_SSE2);
            vcoul_SSE3 = _mm_mul_ps(qq_SSE3,rinv_SSE3);

            vctotSSE = _mm_add_ps(vctotSSE, gmx_mm_sum4_ps(vcoul_SSE0,vcoul_SSE1,vcoul_SSE2,vcoul_SSE3));

            /* Lennard-Jones interaction */
            rinvsix_SSE0 = _mm_mul_ps(rinvsq_SSE0,_mm_mul_ps(rinvsq_SSE0,rinvsq_SSE0));
            rinvsix_SSE1 = _mm_mul_ps(rinvsq_SSE1,_mm_mul_ps(rinvsq_SSE1,rinvsq_SSE1));
            rinvsix_SSE2 = _mm_mul_ps(rinvsq_SSE2,_mm_mul_ps(rinvsq_SSE2,rinvsq_SSE2));
            rinvsix_SSE3 = _mm_mul_ps(rinvsq_SSE3,_mm_mul_ps(rinvsq_SSE3,rinvsq_SSE3));
            Vvdw6_SSE0   = _mm_mul_ps(c6_SSE0,rinvsix_SSE0);
            Vvdw6_SSE1   = _mm_mul_ps(c6_SSE1,rinvsix_SSE1);
            Vvdw6_SSE2   = _mm_mul_ps(c6_SSE2,rinvsix_SSE2);
            Vvdw6_SSE3   = _mm_mul_ps(c6_SSE3,rinvsix_SSE3);
            Vvdw12_SSE0  = _mm_mul_ps(c12_SSE0,_mm_mul_ps(rinvsix_SSE0,rinvsix_SSE0));
            Vvdw12_SSE1  = _mm_mul_ps(c12_SSE1,_mm_mul_ps(rinvsix_SSE1,rinvsix_SSE1));
            Vvdw12_SSE2  = _mm_mul_ps(c12_SSE2,_mm_mul_ps(rinvsix_SSE2,rinvsix_SSE2));
            Vvdw12_SSE3  = _mm_mul_ps(c12_SSE3,_mm_mul_ps(rinvsix_SSE3,rinvsix_SSE3));

            VvdwtotSSE = _mm_add_ps(VvdwtotSSE, gmx_mm_sum4_ps(_mm_sub_ps(Vvdw12_SSE0,Vvdw6_SSE0),
                                                               _mm_sub_ps(Vvdw12_SSE1,Vvdw6_SSE1),
                                                               _mm_sub_ps(Vvdw12_SSE2,Vvdw6_SSE2),
                                                               _mm_sub_ps(Vvdw12_SSE3,Vvdw6_SSE3)));

            fscal_SSE0 = _mm_mul_ps(rinvsq_SSE0,
                                    _mm_add_ps(vcoul_SSE0,
                                               _mm_sub_ps(_mm_mul_ps(twelveSSE,Vvdw12_SSE0),
                                                          _mm_mul_ps(sixSSE,Vvdw6_SSE0))));
            fscal_SSE1 = _mm_mul_ps(rinvsq_SSE1,
                                    _mm_add_ps(vcoul_SSE1,
                                               _mm_sub_ps(_mm_mul_ps(twelveSSE,Vvdw12_SSE1),
                                                          _mm_mul_ps(sixSSE,Vvdw6_SSE1))));
            fscal_SSE2 = _mm_mul_ps(rinvsq_SSE2,
                                    _mm_add_ps(vcoul_SSE2,
                                               _mm_sub_ps(_mm_mul_ps(twelveSSE,Vvdw12_SSE2),
                                                          _mm_mul_ps(sixSSE,Vvdw6_SSE2))));
            fscal_SSE3 = _mm_mul_ps(rinvsq_SSE3,
                                    _mm_add_ps(vcoul_SSE3,
                                               _mm_sub_ps(_mm_mul_ps(twelveSSE,Vvdw12_SSE3),
                                                          _mm_mul_ps(sixSSE,Vvdw6_SSE3))));
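            /* fscal = (vcoul + 12*Vvdw12 - 6*Vvdw6)/r^2 equals -(dV/dr)/r,
             * so multiplying by the distance vector components below yields
             * the Cartesian force contributions directly.
             */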
            /* Calculate temporary vectorial force */
            tx_SSE0 = _mm_mul_ps(fscal_SSE0,dx_SSE0);
            tx_SSE1 = _mm_mul_ps(fscal_SSE1,dx_SSE1);
            tx_SSE2 = _mm_mul_ps(fscal_SSE2,dx_SSE2);
            tx_SSE3 = _mm_mul_ps(fscal_SSE3,dx_SSE3);
            ty_SSE0 = _mm_mul_ps(fscal_SSE0,dy_SSE0);
            ty_SSE1 = _mm_mul_ps(fscal_SSE1,dy_SSE1);
            ty_SSE2 = _mm_mul_ps(fscal_SSE2,dy_SSE2);
            ty_SSE3 = _mm_mul_ps(fscal_SSE3,dy_SSE3);
            tz_SSE0 = _mm_mul_ps(fscal_SSE0,dz_SSE0);
            tz_SSE1 = _mm_mul_ps(fscal_SSE1,dz_SSE1);
            tz_SSE2 = _mm_mul_ps(fscal_SSE2,dz_SSE2);
            tz_SSE3 = _mm_mul_ps(fscal_SSE3,dz_SSE3);

            /* Increment i atom force */
            fix_SSE0 = _mm_add_ps(fix_SSE0,tx_SSE0);
            fix_SSE1 = _mm_add_ps(fix_SSE1,tx_SSE1);
            fix_SSE2 = _mm_add_ps(fix_SSE2,tx_SSE2);
            fix_SSE3 = _mm_add_ps(fix_SSE3,tx_SSE3);
            fiy_SSE0 = _mm_add_ps(fiy_SSE0,ty_SSE0);
            fiy_SSE1 = _mm_add_ps(fiy_SSE1,ty_SSE1);
            fiy_SSE2 = _mm_add_ps(fiy_SSE2,ty_SSE2);
            fiy_SSE3 = _mm_add_ps(fiy_SSE3,ty_SSE3);
            fiz_SSE0 = _mm_add_ps(fiz_SSE0,tz_SSE0);
            fiz_SSE1 = _mm_add_ps(fiz_SSE1,tz_SSE1);
            fiz_SSE2 = _mm_add_ps(fiz_SSE2,tz_SSE2);
            fiz_SSE3 = _mm_add_ps(fiz_SSE3,tz_SSE3);
            /* Decrement j atom force */
            _mm_store_ps(fx_align+j,
                         _mm_sub_ps( _mm_load_ps(fx_align+j) , gmx_mm_sum4_ps(tx_SSE0,tx_SSE1,tx_SSE2,tx_SSE3) ));
            _mm_store_ps(fy_align+j,
                         _mm_sub_ps( _mm_load_ps(fy_align+j) , gmx_mm_sum4_ps(ty_SSE0,ty_SSE1,ty_SSE2,ty_SSE3) ));
            _mm_store_ps(fz_align+j,
                         _mm_sub_ps( _mm_load_ps(fz_align+j) , gmx_mm_sum4_ps(tz_SSE0,tz_SSE1,tz_SSE2,tz_SSE3) ));
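            /* Newton's third law: each j atom receives minus the sum of the
             * four i-atom contributions in its lane, so one gmx_mm_sum4_ps
             * per dimension updates all four j atoms at once.
             */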
        }
        /* Inner loop uses 38 flops/iteration */
        for(j=nj1; j<nj2; j+=UNROLLJ)
        {
            /* load j atom coordinates */
            jxSSE = _mm_load_ps(x_align+j);
            jySSE = _mm_load_ps(y_align+j);
            jzSSE = _mm_load_ps(z_align+j);

            /* Calculate distance */
            dx_SSE0 = _mm_sub_ps(ix_SSE0,jxSSE);
            dy_SSE0 = _mm_sub_ps(iy_SSE0,jySSE);
            dz_SSE0 = _mm_sub_ps(iz_SSE0,jzSSE);
            dx_SSE1 = _mm_sub_ps(ix_SSE1,jxSSE);
            dy_SSE1 = _mm_sub_ps(iy_SSE1,jySSE);
            dz_SSE1 = _mm_sub_ps(iz_SSE1,jzSSE);
            dx_SSE2 = _mm_sub_ps(ix_SSE2,jxSSE);
            dy_SSE2 = _mm_sub_ps(iy_SSE2,jySSE);
            dz_SSE2 = _mm_sub_ps(iz_SSE2,jzSSE);
            dx_SSE3 = _mm_sub_ps(ix_SSE3,jxSSE);
            dy_SSE3 = _mm_sub_ps(iy_SSE3,jySSE);
            dz_SSE3 = _mm_sub_ps(iz_SSE3,jzSSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
            rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
            rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
            rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0);
            rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1);
            rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2);
            rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3);

            rinv_SSE0 = _mm_and_ps(rinv_SSE0,imask_SSE0);
            rinv_SSE1 = _mm_and_ps(rinv_SSE1,imask_SSE1);
            rinv_SSE2 = _mm_and_ps(rinv_SSE2,imask_SSE2);
            rinv_SSE3 = _mm_and_ps(rinv_SSE3,imask_SSE3);

            /* Load parameters for j atom */
            jqSSE   = _mm_load_ps(q_align+j);
            qq_SSE0 = _mm_mul_ps(iq_SSE0,jqSSE);
            qq_SSE1 = _mm_mul_ps(iq_SSE1,jqSSE);
            qq_SSE2 = _mm_mul_ps(iq_SSE2,jqSSE);
            qq_SSE3 = _mm_mul_ps(iq_SSE3,jqSSE);

            c6_SSE0  = _mm_load_ps(pvdw0+2*j);
            c6_SSE1  = _mm_load_ps(pvdw1+2*j);
            c6_SSE2  = _mm_load_ps(pvdw2+2*j);
            c6_SSE3  = _mm_load_ps(pvdw3+2*j);
            c12_SSE0 = _mm_load_ps(pvdw0+2*j+UNROLLJ);
            c12_SSE1 = _mm_load_ps(pvdw1+2*j+UNROLLJ);
            c12_SSE2 = _mm_load_ps(pvdw2+2*j+UNROLLJ);
            c12_SSE3 = _mm_load_ps(pvdw3+2*j+UNROLLJ);

            rinvsq_SSE0 = _mm_mul_ps(rinv_SSE0,rinv_SSE0);
            rinvsq_SSE1 = _mm_mul_ps(rinv_SSE1,rinv_SSE1);
            rinvsq_SSE2 = _mm_mul_ps(rinv_SSE2,rinv_SSE2);
            rinvsq_SSE3 = _mm_mul_ps(rinv_SSE3,rinv_SSE3);

            /* Coulomb interaction */
            vcoul_SSE0 = _mm_mul_ps(qq_SSE0,rinv_SSE0);
            vcoul_SSE1 = _mm_mul_ps(qq_SSE1,rinv_SSE1);
            vcoul_SSE2 = _mm_mul_ps(qq_SSE2,rinv_SSE2);
            vcoul_SSE3 = _mm_mul_ps(qq_SSE3,rinv_SSE3);

            /* Lennard-Jones interaction */
            rinvsix_SSE0 = _mm_mul_ps(rinvsq_SSE0,_mm_mul_ps(rinvsq_SSE0,rinvsq_SSE0));
            rinvsix_SSE1 = _mm_mul_ps(rinvsq_SSE1,_mm_mul_ps(rinvsq_SSE1,rinvsq_SSE1));
            rinvsix_SSE2 = _mm_mul_ps(rinvsq_SSE2,_mm_mul_ps(rinvsq_SSE2,rinvsq_SSE2));
            rinvsix_SSE3 = _mm_mul_ps(rinvsq_SSE3,_mm_mul_ps(rinvsq_SSE3,rinvsq_SSE3));
            Vvdw6_SSE0   = _mm_mul_ps(c6_SSE0,rinvsix_SSE0);
            Vvdw6_SSE1   = _mm_mul_ps(c6_SSE1,rinvsix_SSE1);
            Vvdw6_SSE2   = _mm_mul_ps(c6_SSE2,rinvsix_SSE2);
            Vvdw6_SSE3   = _mm_mul_ps(c6_SSE3,rinvsix_SSE3);
            Vvdw12_SSE0  = _mm_mul_ps(c12_SSE0,_mm_mul_ps(rinvsix_SSE0,rinvsix_SSE0));
            Vvdw12_SSE1  = _mm_mul_ps(c12_SSE1,_mm_mul_ps(rinvsix_SSE1,rinvsix_SSE1));
            Vvdw12_SSE2  = _mm_mul_ps(c12_SSE2,_mm_mul_ps(rinvsix_SSE2,rinvsix_SSE2));
            Vvdw12_SSE3  = _mm_mul_ps(c12_SSE3,_mm_mul_ps(rinvsix_SSE3,rinvsix_SSE3));

            vctotSSE   = _mm_add_ps(vctotSSE, gmx_mm_sum4_ps(vcoul_SSE0,vcoul_SSE1,vcoul_SSE2,vcoul_SSE3));
            VvdwtotSSE = _mm_add_ps(VvdwtotSSE, gmx_mm_sum4_ps(_mm_sub_ps(Vvdw12_SSE0,Vvdw6_SSE0),
                                                               _mm_sub_ps(Vvdw12_SSE1,Vvdw6_SSE1),
                                                               _mm_sub_ps(Vvdw12_SSE2,Vvdw6_SSE2),
                                                               _mm_sub_ps(Vvdw12_SSE3,Vvdw6_SSE3)));

            fscal_SSE0 = _mm_mul_ps(rinvsq_SSE0,
                                    _mm_add_ps(vcoul_SSE0,
                                               _mm_sub_ps(_mm_mul_ps(twelveSSE,Vvdw12_SSE0),
                                                          _mm_mul_ps(sixSSE,Vvdw6_SSE0))));
            fscal_SSE1 = _mm_mul_ps(rinvsq_SSE1,
                                    _mm_add_ps(vcoul_SSE1,
                                               _mm_sub_ps(_mm_mul_ps(twelveSSE,Vvdw12_SSE1),
                                                          _mm_mul_ps(sixSSE,Vvdw6_SSE1))));
            fscal_SSE2 = _mm_mul_ps(rinvsq_SSE2,
                                    _mm_add_ps(vcoul_SSE2,
                                               _mm_sub_ps(_mm_mul_ps(twelveSSE,Vvdw12_SSE2),
                                                          _mm_mul_ps(sixSSE,Vvdw6_SSE2))));
            fscal_SSE3 = _mm_mul_ps(rinvsq_SSE3,
                                    _mm_add_ps(vcoul_SSE3,
                                               _mm_sub_ps(_mm_mul_ps(twelveSSE,Vvdw12_SSE3),
                                                          _mm_mul_ps(sixSSE,Vvdw6_SSE3))));

            /* Calculate temporary vectorial force */
            tx_SSE0 = _mm_mul_ps(fscal_SSE0,dx_SSE0);
            tx_SSE1 = _mm_mul_ps(fscal_SSE1,dx_SSE1);
            tx_SSE2 = _mm_mul_ps(fscal_SSE2,dx_SSE2);
            tx_SSE3 = _mm_mul_ps(fscal_SSE3,dx_SSE3);
            ty_SSE0 = _mm_mul_ps(fscal_SSE0,dy_SSE0);
            ty_SSE1 = _mm_mul_ps(fscal_SSE1,dy_SSE1);
            ty_SSE2 = _mm_mul_ps(fscal_SSE2,dy_SSE2);
            ty_SSE3 = _mm_mul_ps(fscal_SSE3,dy_SSE3);
            tz_SSE0 = _mm_mul_ps(fscal_SSE0,dz_SSE0);
            tz_SSE1 = _mm_mul_ps(fscal_SSE1,dz_SSE1);
            tz_SSE2 = _mm_mul_ps(fscal_SSE2,dz_SSE2);
            tz_SSE3 = _mm_mul_ps(fscal_SSE3,dz_SSE3);

            /* Increment i atom force */
            fix_SSE0 = _mm_add_ps(fix_SSE0,tx_SSE0);
            fix_SSE1 = _mm_add_ps(fix_SSE1,tx_SSE1);
            fix_SSE2 = _mm_add_ps(fix_SSE2,tx_SSE2);
            fix_SSE3 = _mm_add_ps(fix_SSE3,tx_SSE3);
            fiy_SSE0 = _mm_add_ps(fiy_SSE0,ty_SSE0);
            fiy_SSE1 = _mm_add_ps(fiy_SSE1,ty_SSE1);
            fiy_SSE2 = _mm_add_ps(fiy_SSE2,ty_SSE2);
            fiy_SSE3 = _mm_add_ps(fiy_SSE3,ty_SSE3);
            fiz_SSE0 = _mm_add_ps(fiz_SSE0,tz_SSE0);
            fiz_SSE1 = _mm_add_ps(fiz_SSE1,tz_SSE1);
            fiz_SSE2 = _mm_add_ps(fiz_SSE2,tz_SSE2);
            fiz_SSE3 = _mm_add_ps(fiz_SSE3,tz_SSE3);

            /* Decrement j atom force */
            _mm_store_ps(fx_align+j,
                         _mm_sub_ps( _mm_load_ps(fx_align+j) , gmx_mm_sum4_ps(tx_SSE0,tx_SSE1,tx_SSE2,tx_SSE3) ));
            _mm_store_ps(fy_align+j,
                         _mm_sub_ps( _mm_load_ps(fy_align+j) , gmx_mm_sum4_ps(ty_SSE0,ty_SSE1,ty_SSE2,ty_SSE3) ));
            _mm_store_ps(fz_align+j,
                         _mm_sub_ps( _mm_load_ps(fz_align+j) , gmx_mm_sum4_ps(tz_SSE0,tz_SSE1,tz_SSE2,tz_SSE3) ));
        }
        /* Inner loop uses 38 flops/iteration */
        for(j=nj2; j<nj3; j+=UNROLLJ)
        {
            jmask_SSE0 = _mm_load_ps((real *)emask0);
            jmask_SSE1 = _mm_load_ps((real *)emask1);
            jmask_SSE2 = _mm_load_ps((real *)emask2);
            jmask_SSE3 = _mm_load_ps((real *)emask3);
            emask0 += UNROLLJ;
            emask1 += UNROLLJ;
            emask2 += UNROLLJ;
            emask3 += UNROLLJ;
            /* load j atom coordinates */
            jxSSE = _mm_load_ps(x_align+j);
            jySSE = _mm_load_ps(y_align+j);
            jzSSE = _mm_load_ps(z_align+j);

            /* Calculate distance */
            dx_SSE0 = _mm_sub_ps(ix_SSE0,jxSSE);
            dy_SSE0 = _mm_sub_ps(iy_SSE0,jySSE);
            dz_SSE0 = _mm_sub_ps(iz_SSE0,jzSSE);
            dx_SSE1 = _mm_sub_ps(ix_SSE1,jxSSE);
            dy_SSE1 = _mm_sub_ps(iy_SSE1,jySSE);
            dz_SSE1 = _mm_sub_ps(iz_SSE1,jzSSE);
            dx_SSE2 = _mm_sub_ps(ix_SSE2,jxSSE);
            dy_SSE2 = _mm_sub_ps(iy_SSE2,jySSE);
            dz_SSE2 = _mm_sub_ps(iz_SSE2,jzSSE);
            dx_SSE3 = _mm_sub_ps(ix_SSE3,jxSSE);
            dy_SSE3 = _mm_sub_ps(iy_SSE3,jySSE);
            dz_SSE3 = _mm_sub_ps(iz_SSE3,jzSSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
            rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
            rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
            rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);

            jmask_SSE0 = _mm_and_ps(jmask_SSE0,imask_SSE0);
            jmask_SSE1 = _mm_and_ps(jmask_SSE1,imask_SSE1);
            jmask_SSE2 = _mm_and_ps(jmask_SSE2,imask_SSE2);
            jmask_SSE3 = _mm_and_ps(jmask_SSE3,imask_SSE3);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0);
            rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1);
            rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2);
            rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3);
            rinv_SSE0 = _mm_and_ps(rinv_SSE0,jmask_SSE0);
            rinv_SSE1 = _mm_and_ps(rinv_SSE1,jmask_SSE1);
            rinv_SSE2 = _mm_and_ps(rinv_SSE2,jmask_SSE2);
            rinv_SSE3 = _mm_and_ps(rinv_SSE3,jmask_SSE3);

            /* Load parameters for j atom */
            jqSSE   = _mm_load_ps(q_align+j);
            qq_SSE0 = _mm_mul_ps(iq_SSE0,jqSSE);
            qq_SSE1 = _mm_mul_ps(iq_SSE1,jqSSE);
            qq_SSE2 = _mm_mul_ps(iq_SSE2,jqSSE);
            qq_SSE3 = _mm_mul_ps(iq_SSE3,jqSSE);

            c6_SSE0  = _mm_load_ps(pvdw0+2*j);
            c6_SSE1  = _mm_load_ps(pvdw1+2*j);
            c6_SSE2  = _mm_load_ps(pvdw2+2*j);
            c6_SSE3  = _mm_load_ps(pvdw3+2*j);
            c12_SSE0 = _mm_load_ps(pvdw0+2*j+UNROLLJ);
            c12_SSE1 = _mm_load_ps(pvdw1+2*j+UNROLLJ);
            c12_SSE2 = _mm_load_ps(pvdw2+2*j+UNROLLJ);
            c12_SSE3 = _mm_load_ps(pvdw3+2*j+UNROLLJ);

            rinvsq_SSE0 = _mm_mul_ps(rinv_SSE0,rinv_SSE0);
            rinvsq_SSE1 = _mm_mul_ps(rinv_SSE1,rinv_SSE1);
            rinvsq_SSE2 = _mm_mul_ps(rinv_SSE2,rinv_SSE2);
            rinvsq_SSE3 = _mm_mul_ps(rinv_SSE3,rinv_SSE3);

            /* Coulomb interaction */
            vcoul_SSE0 = _mm_mul_ps(qq_SSE0,rinv_SSE0);
            vcoul_SSE1 = _mm_mul_ps(qq_SSE1,rinv_SSE1);
            vcoul_SSE2 = _mm_mul_ps(qq_SSE2,rinv_SSE2);
            vcoul_SSE3 = _mm_mul_ps(qq_SSE3,rinv_SSE3);

            vctotSSE = _mm_add_ps(vctotSSE, gmx_mm_sum4_ps(vcoul_SSE0,vcoul_SSE1,vcoul_SSE2,vcoul_SSE3));

            /* Lennard-Jones interaction */
            rinvsix_SSE0 = _mm_mul_ps(rinvsq_SSE0,_mm_mul_ps(rinvsq_SSE0,rinvsq_SSE0));
            rinvsix_SSE1 = _mm_mul_ps(rinvsq_SSE1,_mm_mul_ps(rinvsq_SSE1,rinvsq_SSE1));
            rinvsix_SSE2 = _mm_mul_ps(rinvsq_SSE2,_mm_mul_ps(rinvsq_SSE2,rinvsq_SSE2));
            rinvsix_SSE3 = _mm_mul_ps(rinvsq_SSE3,_mm_mul_ps(rinvsq_SSE3,rinvsq_SSE3));
            Vvdw6_SSE0   = _mm_mul_ps(c6_SSE0,rinvsix_SSE0);
            Vvdw6_SSE1   = _mm_mul_ps(c6_SSE1,rinvsix_SSE1);
            Vvdw6_SSE2   = _mm_mul_ps(c6_SSE2,rinvsix_SSE2);
            Vvdw6_SSE3   = _mm_mul_ps(c6_SSE3,rinvsix_SSE3);
            Vvdw12_SSE0  = _mm_mul_ps(c12_SSE0,_mm_mul_ps(rinvsix_SSE0,rinvsix_SSE0));
            Vvdw12_SSE1  = _mm_mul_ps(c12_SSE1,_mm_mul_ps(rinvsix_SSE1,rinvsix_SSE1));
            Vvdw12_SSE2  = _mm_mul_ps(c12_SSE2,_mm_mul_ps(rinvsix_SSE2,rinvsix_SSE2));
            Vvdw12_SSE3  = _mm_mul_ps(c12_SSE3,_mm_mul_ps(rinvsix_SSE3,rinvsix_SSE3));

            VvdwtotSSE = _mm_add_ps(VvdwtotSSE, gmx_mm_sum4_ps(_mm_sub_ps(Vvdw12_SSE0,Vvdw6_SSE0),
                                                               _mm_sub_ps(Vvdw12_SSE1,Vvdw6_SSE1),
                                                               _mm_sub_ps(Vvdw12_SSE2,Vvdw6_SSE2),
                                                               _mm_sub_ps(Vvdw12_SSE3,Vvdw6_SSE3)));

            fscal_SSE0 = _mm_mul_ps(rinvsq_SSE0,
                                    _mm_add_ps(vcoul_SSE0,
                                               _mm_sub_ps(_mm_mul_ps(twelveSSE,Vvdw12_SSE0),
                                                          _mm_mul_ps(sixSSE,Vvdw6_SSE0))));
            fscal_SSE1 = _mm_mul_ps(rinvsq_SSE1,
                                    _mm_add_ps(vcoul_SSE1,
                                               _mm_sub_ps(_mm_mul_ps(twelveSSE,Vvdw12_SSE1),
                                                          _mm_mul_ps(sixSSE,Vvdw6_SSE1))));
            fscal_SSE2 = _mm_mul_ps(rinvsq_SSE2,
                                    _mm_add_ps(vcoul_SSE2,
                                               _mm_sub_ps(_mm_mul_ps(twelveSSE,Vvdw12_SSE2),
                                                          _mm_mul_ps(sixSSE,Vvdw6_SSE2))));
            fscal_SSE3 = _mm_mul_ps(rinvsq_SSE3,
                                    _mm_add_ps(vcoul_SSE3,
                                               _mm_sub_ps(_mm_mul_ps(twelveSSE,Vvdw12_SSE3),
                                                          _mm_mul_ps(sixSSE,Vvdw6_SSE3))));

            /* Calculate temporary vectorial force */
            tx_SSE0 = _mm_mul_ps(fscal_SSE0,dx_SSE0);
            tx_SSE1 = _mm_mul_ps(fscal_SSE1,dx_SSE1);
            tx_SSE2 = _mm_mul_ps(fscal_SSE2,dx_SSE2);
            tx_SSE3 = _mm_mul_ps(fscal_SSE3,dx_SSE3);
            ty_SSE0 = _mm_mul_ps(fscal_SSE0,dy_SSE0);
            ty_SSE1 = _mm_mul_ps(fscal_SSE1,dy_SSE1);
            ty_SSE2 = _mm_mul_ps(fscal_SSE2,dy_SSE2);
            ty_SSE3 = _mm_mul_ps(fscal_SSE3,dy_SSE3);
            tz_SSE0 = _mm_mul_ps(fscal_SSE0,dz_SSE0);
            tz_SSE1 = _mm_mul_ps(fscal_SSE1,dz_SSE1);
            tz_SSE2 = _mm_mul_ps(fscal_SSE2,dz_SSE2);
            tz_SSE3 = _mm_mul_ps(fscal_SSE3,dz_SSE3);

            /* Increment i atom force */
            fix_SSE0 = _mm_add_ps(fix_SSE0,tx_SSE0);
            fix_SSE1 = _mm_add_ps(fix_SSE1,tx_SSE1);
            fix_SSE2 = _mm_add_ps(fix_SSE2,tx_SSE2);
            fix_SSE3 = _mm_add_ps(fix_SSE3,tx_SSE3);
            fiy_SSE0 = _mm_add_ps(fiy_SSE0,ty_SSE0);
            fiy_SSE1 = _mm_add_ps(fiy_SSE1,ty_SSE1);
            fiy_SSE2 = _mm_add_ps(fiy_SSE2,ty_SSE2);
            fiy_SSE3 = _mm_add_ps(fiy_SSE3,ty_SSE3);
            fiz_SSE0 = _mm_add_ps(fiz_SSE0,tz_SSE0);
            fiz_SSE1 = _mm_add_ps(fiz_SSE1,tz_SSE1);
            fiz_SSE2 = _mm_add_ps(fiz_SSE2,tz_SSE2);
            fiz_SSE3 = _mm_add_ps(fiz_SSE3,tz_SSE3);

            /* Decrement j atom force */
            _mm_store_ps(fx_align+j,
                         _mm_sub_ps( _mm_load_ps(fx_align+j) , gmx_mm_sum4_ps(tx_SSE0,tx_SSE1,tx_SSE2,tx_SSE3) ));
            _mm_store_ps(fy_align+j,
                         _mm_sub_ps( _mm_load_ps(fy_align+j) , gmx_mm_sum4_ps(ty_SSE0,ty_SSE1,ty_SSE2,ty_SSE3) ));
            _mm_store_ps(fz_align+j,
                         _mm_sub_ps( _mm_load_ps(fz_align+j) , gmx_mm_sum4_ps(tz_SSE0,tz_SSE1,tz_SSE2,tz_SSE3) ));
        }
        /* Inner loop uses 38 flops/iteration */
        /* Add i forces back to memory */
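        /* Each fix/fiy/fiz accumulator holds four j-lane partial sums for one
         * i atom. _MM_TRANSPOSE4_PS regroups them so each register holds one
         * lane for atoms i..i+3; the adds then complete the reduction, and a
         * single aligned store updates four force entries at once.
         */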
        _MM_TRANSPOSE4_PS(fix_SSE0,fix_SSE1,fix_SSE2,fix_SSE3);
        fix_SSE0 = _mm_add_ps(fix_SSE0,fix_SSE1);
        fix_SSE2 = _mm_add_ps(fix_SSE2,fix_SSE3);
        fix_SSE0 = _mm_add_ps(fix_SSE0,fix_SSE2);
        _mm_store_ps(fx_align+i, _mm_add_ps(fix_SSE0, _mm_load_ps(fx_align+i)));

        _MM_TRANSPOSE4_PS(fiy_SSE0,fiy_SSE1,fiy_SSE2,fiy_SSE3);
        fiy_SSE0 = _mm_add_ps(fiy_SSE0,fiy_SSE1);
        fiy_SSE2 = _mm_add_ps(fiy_SSE2,fiy_SSE3);
        fiy_SSE0 = _mm_add_ps(fiy_SSE0,fiy_SSE2);
        _mm_store_ps(fy_align+i, _mm_add_ps(fiy_SSE0, _mm_load_ps(fy_align+i)));

        _MM_TRANSPOSE4_PS(fiz_SSE0,fiz_SSE1,fiz_SSE2,fiz_SSE3);
        fiz_SSE0 = _mm_add_ps(fiz_SSE0,fiz_SSE1);
        fiz_SSE2 = _mm_add_ps(fiz_SSE2,fiz_SSE3);
        fiz_SSE0 = _mm_add_ps(fiz_SSE0,fiz_SSE2);
        _mm_store_ps(fz_align+i, _mm_add_ps(fiz_SSE0, _mm_load_ps(fz_align+i)));
        /* Add potential energies to the group for this list */
        ggid = 0; /* the all-vs-all kernel assumes a single energy group */

        _mm_storeu_ps(tmpsum,vctotSSE);
        Vc[ggid] = Vc[ggid] + tmpsum[0]+tmpsum[1]+tmpsum[2]+tmpsum[3];

        _mm_storeu_ps(tmpsum,VvdwtotSSE);
        Vvdw[ggid] = Vvdw[ggid] + tmpsum[0]+tmpsum[1]+tmpsum[2]+tmpsum[3];
        /* Outer loop uses 6 flops/iteration */
    }
    for(i=ni0;i<ni1+1+natoms/2;i++)
    {
        k = i % natoms;
        f[3*k]   += fx_align[i];
        f[3*k+1] += fy_align[i];
        f[3*k+2] += fz_align[i];
    }
    /* Write outer/inner iteration count to pointers */
    *outeriter = ni1-ni0;
    *inneriter = (ni1-ni0)*natoms/2;
}