2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2009, The GROMACS Development Team.
6 * Copyright (c) 2012,2014, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
40 #include "gromacs/legacyheaders/types/simple.h"
42 #include "gromacs/math/vec.h"
43 #include "gromacs/utility/smalloc.h"
45 #include "gromacs/legacyheaders/network.h"
46 #include "gromacs/math/units.h"
47 #include "gromacs/legacyheaders/genborn.h"
48 #include "genborn_allvsall.h"
51 #if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
53 #include <gmx_sse2_double.h>
71 int ** prologue_mask_gb;
84 gmx_allvsallgb2_data_t;
/* Return the largest j-offset that atom i should loop over in the
 * all-vs-all kernel, chosen so that every i,j pair is computed exactly
 * once across the whole triangular interaction matrix.
 * For an odd atom count every atom simply handles natoms/2 neighbors;
 * for even counts half the atoms take natoms/2 and the other half
 * natoms/2-1 so the "opposite" pair is not double-counted.
 * NOTE(review): this excerpt elides part of the branch structure (the
 * parity tests selecting natoms/2 vs natoms/2-1 per i are not all
 * visible) — confirm against the full source before editing.
 */
88 calc_maxoffset(int i, int natoms)
92 if ((natoms % 2) == 1)
94 /* Odd number of atoms, easy */
97 else if ((natoms % 4) == 0)
99 /* Multiple of four is hard */
104 maxoffset = natoms/2;
108 maxoffset = natoms/2-1;
115 maxoffset = natoms/2;
119 maxoffset = natoms/2-1;
128 maxoffset = natoms/2;
132 maxoffset = natoms/2-1;
/* Build the per-atom interaction masks and j-index ranges consumed by the
 * all-vs-all GB kernels below.
 * For each i atom the kernels walk three j-ranges stored in jindex_gb[4*i..4*i+3]:
 *   [4*i,   4*i+1) : "prologue"  - uses prologue_mask_gb to apply exclusions,
 *   [4*i+1, 4*i+2) : "main"      - no exclusions, full-speed inner loop,
 *   [4*i+2, 4*i+3) : "epilogue"  - uses epilogue_mask for the maxoffset cutoff.
 * Exclusions come from the bonded GB lists F_GB12/F_GB13/F_GB14 in ilist.
 * Masks are stored as pairs of 32-bit words per j atom (two words per double
 * SIMD lane) and all mask/index arrays are 16-byte aligned for _mm_load_pd.
 * NOTE(review): this excerpt elides interior lines (braces, the bodies that
 * derive 'k' from a1/a2, and some loop limits) — the retained lines are
 * book-keeping landmarks only; consult the full source before editing.
 */
140 setup_gb_exclusions_and_indices(gmx_allvsallgb2_data_t * aadata,
151 int ni0, ni1, nj0, nj1, nj;
152 int imin, imax, iexcl;
155 int firstinteraction;
159 /* This routine can appear to be a bit complex, but it is mostly book-keeping.
160 * To enable the fast all-vs-all kernel we need to be able to stream through all coordinates
161 * whether they should interact or not.
163 * To avoid looping over the exclusions, we create a simple mask that is 1 if the interaction
164 * should be present, otherwise 0. Since exclusions typically only occur when i & j are close,
165 * we create a jindex array with three elements per i atom: the starting point, the point to
166 * which we need to check exclusions, and the end point.
167 * This way we only have to allocate a short exclusion mask per i atom.
/* Round the local atom range [start,end) out to whole UNROLLI tiles */
170 ni0 = (start/UNROLLI)*UNROLLI;
171 ni1 = ((end+UNROLLI-1)/UNROLLI)*UNROLLI;
173 /* Set the interaction mask to only enable the i atoms we want to include */
174 snew(pi, 2*(natoms+UNROLLI+2*SIMD_WIDTH));
/* Align the snew'ed buffer up to a 16-byte boundary for SSE loads */
175 aadata->imask = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
176 for (i = 0; i < natoms+UNROLLI; i++)
178 aadata->imask[2*i] = (i >= start && i < end) ? 0xFFFFFFFF : 0;
179 aadata->imask[2*i+1] = (i >= start && i < end) ? 0xFFFFFFFF : 0;
182 /* Allocate memory for our modified jindex array */
183 snew(aadata->jindex_gb, 4*(natoms+UNROLLI));
184 for (i = 0; i < 4*(natoms+UNROLLI); i++)
186 aadata->jindex_gb[i] = 0;
189 /* Create the exclusion masks for the prologue part */
190 snew(aadata->prologue_mask_gb, natoms+UNROLLI); /* list of pointers */
192 /* First zero everything to avoid uninitialized data */
193 for (i = 0; i < natoms+UNROLLI; i++)
195 aadata->prologue_mask_gb[i] = NULL;
198 /* Calculate the largest exclusion range we need for each UNROLLI-tuplet of i atoms. */
199 for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
201 max_excl_offset = -1;
203 /* First find maxoffset for the next 4 atoms (or fewer if we are close to end) */
204 imax = ((ibase+UNROLLI) < end) ? (ibase+UNROLLI) : end;
206 /* Which atom is the first we (might) interact with? */
207 imin = natoms; /* Guaranteed to be overwritten by one of 'firstinteraction' */
208 for (i = ibase; i < imax; i++)
210 /* Before exclusions, which atom is the first we (might) interact with? */
211 firstinteraction = i+1;
212 max_offset = calc_maxoffset(i, natoms);
/* Scan GB 1-2, 1-3 and 1-4 bonded lists: if the default first neighbor is
 * excluded, advance 'firstinteraction' past it (elided bodies compute k
 * from a1/a2 — the partner of atom i in each bonded entry).
 */
216 for (j = 0; j < ilist[F_GB12].nr; j += 3)
218 a1 = ilist[F_GB12].iatoms[j+1];
219 a2 = ilist[F_GB12].iatoms[j+2];
234 if (k == firstinteraction)
242 for (j = 0; j < ilist[F_GB13].nr; j += 3)
244 a1 = ilist[F_GB13].iatoms[j+1];
245 a2 = ilist[F_GB13].iatoms[j+2];
260 if (k == firstinteraction)
268 for (j = 0; j < ilist[F_GB14].nr; j += 3)
270 a1 = ilist[F_GB14].iatoms[j+1];
271 a2 = ilist[F_GB14].iatoms[j+2];
285 if (k == firstinteraction)
291 imin = (firstinteraction < imin) ? firstinteraction : imin;
293 /* round down to j unrolling factor */
294 imin = (imin/UNROLLJ)*UNROLLJ;
/* Second pass: find the largest excluded j (max_excl_offset) for this tile */
296 for (i = ibase; i < imax; i++)
298 max_offset = calc_maxoffset(i, natoms);
302 for (j = 0; j < ilist[F_GB12].nr; j += 3)
304 a1 = ilist[F_GB12].iatoms[j+1];
305 a2 = ilist[F_GB12].iatoms[j+2];
325 if (k > i+max_offset)
332 if (k+natoms <= max_offset)
336 max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
341 for (j = 0; j < ilist[F_GB13].nr; j += 3)
343 a1 = ilist[F_GB13].iatoms[j+1];
344 a2 = ilist[F_GB13].iatoms[j+2];
364 if (k > i+max_offset)
371 if (k+natoms <= max_offset)
375 max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
380 for (j = 0; j < ilist[F_GB14].nr; j += 3)
382 a1 = ilist[F_GB14].iatoms[j+1];
383 a2 = ilist[F_GB14].iatoms[j+2];
403 if (k > i+max_offset)
410 if (k+natoms <= max_offset)
414 max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
419 /* The offset specifies the last atom to be excluded, so add one unit to get an upper loop limit */
421 /* round up to j unrolling factor */
422 max_excl_offset = (max_excl_offset/UNROLLJ+1)*UNROLLJ;
424 /* Set all the prologue masks length to this value (even for i>end) */
425 for (i = ibase; i < ibase+UNROLLI; i++)
427 aadata->jindex_gb[4*i] = imin;
428 aadata->jindex_gb[4*i+1] = imin+max_excl_offset;
432 /* Now the hard part, loop over it all again to calculate the actual contents of the prologue masks */
433 for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
435 for (i = ibase; i < ibase+UNROLLI; i++)
437 nj = aadata->jindex_gb[4*i+1] - aadata->jindex_gb[4*i];
438 imin = aadata->jindex_gb[4*i];
440 /* Allocate aligned memory */
441 snew(pi, 2*(nj+2*SIMD_WIDTH));
442 aadata->prologue_mask_gb[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
444 max_offset = calc_maxoffset(i, natoms);
446 /* Include interactions i+1 <= j < i+maxoffset */
447 for (k = 0; k < nj; k++)
451 if ( (j > i) && (j <= i+max_offset) )
453 aadata->prologue_mask_gb[i][2*k] = 0xFFFFFFFF;
454 aadata->prologue_mask_gb[i][2*k+1] = 0xFFFFFFFF;
458 aadata->prologue_mask_gb[i][2*k] = 0;
459 aadata->prologue_mask_gb[i][2*k+1] = 0;
463 /* Clear out the explicit exclusions */
468 for (j = 0; j < ilist[F_GB12].nr; j += 3)
470 a1 = ilist[F_GB12].iatoms[j+1];
471 a2 = ilist[F_GB12].iatoms[j+2];
486 if (k > i+max_offset)
492 if (k+natoms <= max_offset)
500 aadata->prologue_mask_gb[i][2*k] = 0;
501 aadata->prologue_mask_gb[i][2*k+1] = 0;
507 for (j = 0; j < ilist[F_GB13].nr; j += 3)
509 a1 = ilist[F_GB13].iatoms[j+1];
510 a2 = ilist[F_GB13].iatoms[j+2];
525 if (k > i+max_offset)
531 if (k+natoms <= max_offset)
539 aadata->prologue_mask_gb[i][2*k] = 0;
540 aadata->prologue_mask_gb[i][2*k+1] = 0;
546 for (j = 0; j < ilist[F_GB14].nr; j += 3)
548 a1 = ilist[F_GB14].iatoms[j+1];
549 a2 = ilist[F_GB14].iatoms[j+2];
564 if (k > i+max_offset)
570 if (k+natoms <= max_offset)
578 aadata->prologue_mask_gb[i][2*k] = 0;
579 aadata->prologue_mask_gb[i][2*k+1] = 0;
587 /* Construct the epilogue mask - this just contains the check for maxoffset */
588 snew(aadata->epilogue_mask, natoms+UNROLLI);
590 /* First zero everything to avoid uninitialized data */
591 for (i = 0; i < natoms+UNROLLI; i++)
593 aadata->jindex_gb[4*i+2] = aadata->jindex_gb[4*i+1];
594 aadata->jindex_gb[4*i+3] = aadata->jindex_gb[4*i+1];
595 aadata->epilogue_mask[i] = NULL;
598 for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
600 /* Find the lowest index for which we need to use the epilogue */
602 max_offset = calc_maxoffset(imin, natoms);
604 imin = imin + 1 + max_offset;
606 /* Find largest index for which we need to use the epilogue */
607 imax = ibase + UNROLLI-1;
608 imax = (imax < end) ? imax : end;
610 max_offset = calc_maxoffset(imax, natoms);
611 imax = imax + 1 + max_offset + UNROLLJ - 1;
613 for (i = ibase; i < ibase+UNROLLI; i++)
615 /* Start of epilogue - round down to j tile limit */
616 aadata->jindex_gb[4*i+2] = (imin/UNROLLJ)*UNROLLJ;
617 /* Make sure we don't overlap - for small systems everything is done in the prologue */
618 aadata->jindex_gb[4*i+2] = (aadata->jindex_gb[4*i+1] > aadata->jindex_gb[4*i+2]) ? aadata->jindex_gb[4*i+1] : aadata->jindex_gb[4*i+2];
619 /* Round upwards to j tile limit */
620 aadata->jindex_gb[4*i+3] = (imax/UNROLLJ)*UNROLLJ;
621 /* Make sure we don't have a negative range for the epilogue */
622 aadata->jindex_gb[4*i+3] = (aadata->jindex_gb[4*i+2] > aadata->jindex_gb[4*i+3]) ? aadata->jindex_gb[4*i+2] : aadata->jindex_gb[4*i+3];
626 /* And fill it with data... */
628 for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
630 for (i = ibase; i < ibase+UNROLLI; i++)
633 nj = aadata->jindex_gb[4*i+3] - aadata->jindex_gb[4*i+2];
635 /* Allocate aligned memory */
636 snew(pi, 2*(nj+2*SIMD_WIDTH));
637 aadata->epilogue_mask[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
639 max_offset = calc_maxoffset(i, natoms);
641 for (k = 0; k < nj; k++)
643 j = aadata->jindex_gb[4*i+2] + k;
644 aadata->epilogue_mask[i][2*k] = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
645 aadata->epilogue_mask[i][2*k+1] = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
/* Allocate and initialize the gmx_allvsallgb2_data_t working structure.
 * Allocates 16-byte-aligned coordinate/force/radius/work arrays (each sized
 * 2*natoms plus SIMD padding so the kernels can read a duplicated "wrapped"
 * copy of every atom without bounds checks), fills per-atom GB parameters
 * according to gb_algorithm (Still -> vsolv, OBC -> param), duplicates them
 * into the upper half, zeroes the aligned buffers, and finally builds the
 * exclusion masks/indices via setup_gb_exclusions_and_indices().
 * radius_offset is subtracted from each type's gb_radius (the GB dielectric
 * offset for OBC, 0.0 for Still).
 * NOTE(review): the aadata allocation itself and the *p_aadata assignment
 * are elided from this excerpt; only the field initialization is visible.
 */
653 genborn_allvsall_setup(gmx_allvsallgb2_data_t ** p_aadata,
654 gmx_localtop_t * top,
655 gmx_genborn_t * born,
657 double radius_offset,
665 gmx_allvsallgb2_data_t *aadata;
668 natoms = mdatoms->nr;
/* Each snew'ed block is aligned up to 16 bytes for _mm_load_pd/_mm_store_pd */
673 snew(p, 2*natoms+2*SIMD_WIDTH);
674 aadata->x_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
675 snew(p, 2*natoms+2*SIMD_WIDTH);
676 aadata->y_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
677 snew(p, 2*natoms+2*SIMD_WIDTH);
678 aadata->z_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
679 snew(p, 2*natoms+2*SIMD_WIDTH);
680 aadata->fx_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
681 snew(p, 2*natoms+2*SIMD_WIDTH);
682 aadata->fy_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
683 snew(p, 2*natoms+2*SIMD_WIDTH);
684 aadata->fz_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
686 snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
687 aadata->gb_radius = (double *) (((size_t) p + 16) & (~((size_t) 15)));
689 snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
690 aadata->workparam = (double *) (((size_t) p + 16) & (~((size_t) 15)));
692 snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
693 aadata->work = (double *) (((size_t) p + 16) & (~((size_t) 15)));
/* Per-atom GB radius (minus offset) and the algorithm-specific parameter */
695 for (i = 0; i < mdatoms->nr; i++)
697 aadata->gb_radius[i] = top->atomtypes.gb_radius[mdatoms->typeA[i]] - radius_offset;
698 if (gb_algorithm == egbSTILL)
700 aadata->workparam[i] = born->vsolv[i];
702 else if (gb_algorithm == egbOBC)
704 aadata->workparam[i] = born->param[i];
706 aadata->work[i] = 0.0;
/* Duplicate parameters into [natoms, 2*natoms) so the kernels can index
 * past natoms without wrapping */
708 for (i = 0; i < mdatoms->nr; i++)
710 aadata->gb_radius[natoms+i] = aadata->gb_radius[i];
711 aadata->workparam[natoms+i] = aadata->workparam[i];
712 aadata->work[natoms+i] = aadata->work[i];
715 for (i = 0; i < 2*natoms+SIMD_WIDTH; i++)
717 aadata->x_align[i] = 0.0;
718 aadata->y_align[i] = 0.0;
719 aadata->z_align[i] = 0.0;
720 aadata->fx_align[i] = 0.0;
721 aadata->fy_align[i] = 0.0;
722 aadata->fz_align[i] = 0.0;
725 setup_gb_exclusions_and_indices(aadata, top->idef.il, 0, mdatoms->homenr, mdatoms->nr,
726 bInclude12, bInclude13, bInclude14);
731 * This routine apparently hits a compiler bug visual studio has had 'forever'.
732 * It is present both in VS2005 and VS2008, and the only way around it is to
733 * decrease optimization. We do that with a pragma, and only for MSVC, so it
734 * will not hurt any of the well-behaving and supported compilers out there.
735 * MS: Fix your compiler, it sucks like a black hole!
738 #pragma optimize("t",off)
/* All-vs-all SSE2 double-precision kernel computing Still GB polarization
 * energies (gpi) and Born radii for all local atoms.
 * Processes UNROLLI=2 i-atoms per outer iteration, each against SIMD pairs
 * of j-atoms, in three phases per i-tile: prologue (exclusion-masked),
 * main (unmasked), epilogue (maxoffset-masked), using the masks and jindex
 * ranges prepared by setup_gb_exclusions_and_indices(). Pairwise icf4 terms
 * are accumulated both into gpi (i side) and into work[j] (j side); icf6
 * chain-rule terms are streamed out through the dadx pointer for the later
 * force pass. Finally work[] contributions written past natoms are folded
 * back, and Born radii plus fr->invsqrta are computed per atom.
 * On first call (aadata == NULL, elided here) the working data is created
 * via genborn_allvsall_setup() with Still parameters and only 1-4 exclusions.
 * NOTE(review): this excerpt elides interior lines (loop-limit loads from
 * jindex, some stores such as x_align[i], dadx increments, and closing
 * braces); the retained lines show the arithmetic but not every statement.
 */
742 genborn_allvsall_calc_still_radii_sse2_double(t_forcerec * fr,
744 gmx_genborn_t * born,
745 gmx_localtop_t * top,
750 gmx_allvsallgb2_data_t *aadata;
753 int nj0, nj1, nj2, nj3;
764 double gpi, rai, vai;
766 double irsq, idr4, idr6;
767 double raj, rvdw, ratio;
768 double vaj, ccf, dccf, theta, cosq;
769 double term, prod, icf4, icf6, gpi2, factor, sinq;
780 __m128d ix_SSE0, iy_SSE0, iz_SSE0;
781 __m128d ix_SSE1, iy_SSE1, iz_SSE1;
782 __m128d gpi_SSE0, rai_SSE0, prod_ai_SSE0;
783 __m128d gpi_SSE1, rai_SSE1, prod_ai_SSE1;
784 __m128d imask_SSE0, jmask_SSE0;
785 __m128d imask_SSE1, jmask_SSE1;
786 __m128d jx_SSE, jy_SSE, jz_SSE;
787 __m128d dx_SSE0, dy_SSE0, dz_SSE0;
788 __m128d dx_SSE1, dy_SSE1, dz_SSE1;
789 __m128d rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0;
790 __m128d rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1;
791 __m128d raj_SSE, vaj_SSE, prod_SSE;
792 __m128d rvdw_SSE0, ratio_SSE0;
793 __m128d rvdw_SSE1, ratio_SSE1;
794 __m128d theta_SSE0, sinq_SSE0, cosq_SSE0, term_SSE0;
795 __m128d theta_SSE1, sinq_SSE1, cosq_SSE1, term_SSE1;
796 __m128d ccf_SSE0, dccf_SSE0;
797 __m128d ccf_SSE1, dccf_SSE1;
798 __m128d icf4_SSE0, icf6_SSE0;
799 __m128d icf4_SSE1, icf6_SSE1;
800 __m128d half_SSE, one_SSE, two_SSE, four_SSE;
801 __m128d still_p4_SSE, still_p5inv_SSE, still_pip5_SSE;
803 natoms = mdatoms->nr;
805 ni1 = mdatoms->homenr;
809 aadata = *((gmx_allvsallgb2_data_t **)paadata);
/* First call: build working data (Still algorithm, no radius offset,
 * exclude only 1-4 neighbors from the GB sum) */
814 genborn_allvsall_setup(&aadata, top, born, mdatoms, 0.0,
815 egbSTILL, FALSE, FALSE, TRUE);
816 *((gmx_allvsallgb2_data_t **)paadata) = aadata;
819 x_align = aadata->x_align;
820 y_align = aadata->y_align;
821 z_align = aadata->z_align;
823 gb_radius = aadata->gb_radius;
824 vsolv = aadata->workparam;
826 jindex = aadata->jindex_gb;
/* Broadcast Still model constants into SIMD registers */
829 still_p4_SSE = _mm_set1_pd(STILL_P4);
830 still_p5inv_SSE = _mm_set1_pd(STILL_P5INV);
831 still_pip5_SSE = _mm_set1_pd(STILL_PIP5);
832 half_SSE = _mm_set1_pd(0.5);
833 one_SSE = _mm_set1_pd(1.0);
834 two_SSE = _mm_set1_pd(2.0);
835 four_SSE = _mm_set1_pd(4.0);
837 /* This will be summed, so it has to extend to natoms + buffer */
838 for (i = 0; i < natoms+1+natoms/2; i++)
/* Copy coordinates into the aligned arrays, wrapping indices past natoms
 * (the x_align store is elided in this excerpt) */
843 for (i = ni0; i < ni1+1+natoms/2; i++)
847 y_align[i] = x[3*k+1];
848 z_align[i] = x[3*k+2];
852 for (i = ni0; i < ni1; i += UNROLLI)
854 /* We assume shifts are NOT used for all-vs-all interactions */
855 /* Load i atom data */
856 ix_SSE0 = _mm_load1_pd(x_align+i);
857 iy_SSE0 = _mm_load1_pd(y_align+i);
858 iz_SSE0 = _mm_load1_pd(z_align+i);
859 ix_SSE1 = _mm_load1_pd(x_align+i+1);
860 iy_SSE1 = _mm_load1_pd(y_align+i+1);
861 iz_SSE1 = _mm_load1_pd(z_align+i+1);
863 gpi_SSE0 = _mm_setzero_pd();
864 gpi_SSE1 = _mm_setzero_pd();
866 rai_SSE0 = _mm_load1_pd(gb_radius+i);
867 rai_SSE1 = _mm_load1_pd(gb_radius+i+1);
869 prod_ai_SSE0 = _mm_set1_pd(STILL_P4*vsolv[i]);
870 prod_ai_SSE1 = _mm_set1_pd(STILL_P4*vsolv[i+1]);
872 /* Load limits for loop over neighbors */
878 pmask0 = aadata->prologue_mask_gb[i];
879 pmask1 = aadata->prologue_mask_gb[i+1];
880 emask0 = aadata->epilogue_mask[i];
881 emask1 = aadata->epilogue_mask[i+1];
883 imask_SSE0 = _mm_load1_pd((double *)(aadata->imask+2*i));
884 imask_SSE1 = _mm_load1_pd((double *)(aadata->imask+2*i+2));
886 /* Prologue part, including exclusion mask */
887 for (j = nj0; j < nj1; j += UNROLLJ)
889 jmask_SSE0 = _mm_load_pd((double *)pmask0);
890 jmask_SSE1 = _mm_load_pd((double *)pmask1);
894 /* load j atom coordinates */
895 jx_SSE = _mm_load_pd(x_align+j);
896 jy_SSE = _mm_load_pd(y_align+j);
897 jz_SSE = _mm_load_pd(z_align+j);
899 /* Calculate distance */
900 dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE);
901 dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE);
902 dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE);
903 dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE);
904 dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE);
905 dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE);
907 /* rsq = dx*dx+dy*dy+dz*dz */
908 rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
909 rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);
/* Combine the per-pair exclusion mask with the local-atom mask */
912 jmask_SSE0 = _mm_and_pd(jmask_SSE0, imask_SSE0);
913 jmask_SSE1 = _mm_and_pd(jmask_SSE1, imask_SSE1);
915 /* Calculate 1/r and 1/r2 */
916 rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
917 rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);
/* Zero rinv for masked lanes so excluded pairs contribute nothing */
920 rinv_SSE0 = _mm_and_pd(rinv_SSE0, jmask_SSE0);
921 rinv_SSE1 = _mm_and_pd(rinv_SSE1, jmask_SSE1);
923 irsq_SSE0 = _mm_mul_pd(rinv_SSE0, rinv_SSE0);
924 irsq_SSE1 = _mm_mul_pd(rinv_SSE1, rinv_SSE1);
925 idr4_SSE0 = _mm_mul_pd(irsq_SSE0, irsq_SSE0);
926 idr4_SSE1 = _mm_mul_pd(irsq_SSE1, irsq_SSE1);
927 idr6_SSE0 = _mm_mul_pd(idr4_SSE0, irsq_SSE0);
928 idr6_SSE1 = _mm_mul_pd(idr4_SSE1, irsq_SSE1);
930 raj_SSE = _mm_load_pd(gb_radius+j);
931 vaj_SSE = _mm_load_pd(vsolv+j);
/* Still close-contact function: ccf/dccf from ratio of r^2 to (rai+raj)^2 */
933 rvdw_SSE0 = _mm_add_pd(rai_SSE0, raj_SSE);
934 rvdw_SSE1 = _mm_add_pd(rai_SSE1, raj_SSE);
936 ratio_SSE0 = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0, rvdw_SSE0)));
937 ratio_SSE1 = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1, rvdw_SSE1)));
939 ratio_SSE0 = _mm_min_pd(ratio_SSE0, still_p5inv_SSE);
940 ratio_SSE1 = _mm_min_pd(ratio_SSE1, still_p5inv_SSE);
941 theta_SSE0 = _mm_mul_pd(ratio_SSE0, still_pip5_SSE);
942 theta_SSE1 = _mm_mul_pd(ratio_SSE1, still_pip5_SSE);
943 gmx_mm_sincos_pd(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
944 gmx_mm_sincos_pd(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
945 term_SSE0 = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE0));
946 term_SSE1 = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE1));
947 ccf_SSE0 = _mm_mul_pd(term_SSE0, term_SSE0);
948 ccf_SSE1 = _mm_mul_pd(term_SSE1, term_SSE1);
949 dccf_SSE0 = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE0),
950 _mm_mul_pd(sinq_SSE0, theta_SSE0));
951 dccf_SSE1 = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE1),
952 _mm_mul_pd(sinq_SSE1, theta_SSE1));
954 prod_SSE = _mm_mul_pd(still_p4_SSE, vaj_SSE);
955 icf4_SSE0 = _mm_mul_pd(ccf_SSE0, idr4_SSE0);
956 icf4_SSE1 = _mm_mul_pd(ccf_SSE1, idr4_SSE1);
957 icf6_SSE0 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
958 icf6_SSE1 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);
/* Accumulate the j-atoms' share of the polarization sum into work[] */
960 _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
961 _mm_add_pd(_mm_mul_pd(prod_ai_SSE0, icf4_SSE0),
962 _mm_mul_pd(prod_ai_SSE1, icf4_SSE1))));
965 gpi_SSE0 = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE, icf4_SSE0));
966 gpi_SSE1 = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE, icf4_SSE1));
968 /* Save ai->aj and aj->ai chain rule terms */
969 _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE0));
971 _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE1));
974 _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE0, icf6_SSE0));
976 _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE1, icf6_SSE1));
980 /* Main part, no exclusions */
981 for (j = nj1; j < nj2; j += UNROLLJ)
984 /* load j atom coordinates */
985 jx_SSE = _mm_load_pd(x_align+j);
986 jy_SSE = _mm_load_pd(y_align+j);
987 jz_SSE = _mm_load_pd(z_align+j);
989 /* Calculate distance */
990 dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE);
991 dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE);
992 dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE);
993 dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE);
994 dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE);
995 dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE);
997 /* rsq = dx*dx+dy*dy+dz*dz */
998 rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
999 rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);
1001 /* Calculate 1/r and 1/r2 */
1002 rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
1003 rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);
/* Only the local-atom mask applies here; no pairwise exclusions */
1006 rinv_SSE0 = _mm_and_pd(rinv_SSE0, imask_SSE0);
1007 rinv_SSE1 = _mm_and_pd(rinv_SSE1, imask_SSE1);
1009 irsq_SSE0 = _mm_mul_pd(rinv_SSE0, rinv_SSE0);
1010 irsq_SSE1 = _mm_mul_pd(rinv_SSE1, rinv_SSE1);
1011 idr4_SSE0 = _mm_mul_pd(irsq_SSE0, irsq_SSE0);
1012 idr4_SSE1 = _mm_mul_pd(irsq_SSE1, irsq_SSE1);
1013 idr6_SSE0 = _mm_mul_pd(idr4_SSE0, irsq_SSE0);
1014 idr6_SSE1 = _mm_mul_pd(idr4_SSE1, irsq_SSE1);
1016 raj_SSE = _mm_load_pd(gb_radius+j);
1018 rvdw_SSE0 = _mm_add_pd(rai_SSE0, raj_SSE);
1019 rvdw_SSE1 = _mm_add_pd(rai_SSE1, raj_SSE);
1020 vaj_SSE = _mm_load_pd(vsolv+j);
1022 ratio_SSE0 = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0, rvdw_SSE0)));
1023 ratio_SSE1 = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1, rvdw_SSE1)));
1025 ratio_SSE0 = _mm_min_pd(ratio_SSE0, still_p5inv_SSE);
1026 ratio_SSE1 = _mm_min_pd(ratio_SSE1, still_p5inv_SSE);
1027 theta_SSE0 = _mm_mul_pd(ratio_SSE0, still_pip5_SSE);
1028 theta_SSE1 = _mm_mul_pd(ratio_SSE1, still_pip5_SSE);
1029 gmx_mm_sincos_pd(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
1030 gmx_mm_sincos_pd(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
1031 term_SSE0 = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE0));
1032 term_SSE1 = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE1));
1033 ccf_SSE0 = _mm_mul_pd(term_SSE0, term_SSE0);
1034 ccf_SSE1 = _mm_mul_pd(term_SSE1, term_SSE1);
1035 dccf_SSE0 = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE0),
1036 _mm_mul_pd(sinq_SSE0, theta_SSE0));
1037 dccf_SSE1 = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE1),
1038 _mm_mul_pd(sinq_SSE1, theta_SSE1));
1040 prod_SSE = _mm_mul_pd(still_p4_SSE, vaj_SSE );
1041 icf4_SSE0 = _mm_mul_pd(ccf_SSE0, idr4_SSE0);
1042 icf4_SSE1 = _mm_mul_pd(ccf_SSE1, idr4_SSE1);
1043 icf6_SSE0 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
1044 icf6_SSE1 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);
1046 _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
1047 _mm_add_pd(_mm_mul_pd(prod_ai_SSE0, icf4_SSE0),
1048 _mm_mul_pd(prod_ai_SSE1, icf4_SSE1))));
1050 gpi_SSE0 = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE, icf4_SSE0));
1051 gpi_SSE1 = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE, icf4_SSE1));
1053 /* Save ai->aj and aj->ai chain rule terms */
1054 _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE0));
1056 _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE1));
1059 _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE0, icf6_SSE0));
1061 _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE1, icf6_SSE1));
1064 /* Epilogue part, including exclusion mask */
1065 for (j = nj2; j < nj3; j += UNROLLJ)
1067 jmask_SSE0 = _mm_load_pd((double *)emask0);
1068 jmask_SSE1 = _mm_load_pd((double *)emask1);
1069 emask0 += 2*UNROLLJ;
1070 emask1 += 2*UNROLLJ;
1072 /* load j atom coordinates */
1073 jx_SSE = _mm_load_pd(x_align+j);
1074 jy_SSE = _mm_load_pd(y_align+j);
1075 jz_SSE = _mm_load_pd(z_align+j);
1077 /* Calculate distance */
1078 dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE);
1079 dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE);
1080 dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE);
1081 dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE);
1082 dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE);
1083 dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE);
1085 /* rsq = dx*dx+dy*dy+dz*dz */
1086 rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
1087 rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);
1090 jmask_SSE0 = _mm_and_pd(jmask_SSE0, imask_SSE0);
1091 jmask_SSE1 = _mm_and_pd(jmask_SSE1, imask_SSE1);
1093 /* Calculate 1/r and 1/r2 */
1094 rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
1095 rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);
1098 rinv_SSE0 = _mm_and_pd(rinv_SSE0, jmask_SSE0);
1099 rinv_SSE1 = _mm_and_pd(rinv_SSE1, jmask_SSE1);
1101 irsq_SSE0 = _mm_mul_pd(rinv_SSE0, rinv_SSE0);
1102 irsq_SSE1 = _mm_mul_pd(rinv_SSE1, rinv_SSE1);
1103 idr4_SSE0 = _mm_mul_pd(irsq_SSE0, irsq_SSE0);
1104 idr4_SSE1 = _mm_mul_pd(irsq_SSE1, irsq_SSE1);
1105 idr6_SSE0 = _mm_mul_pd(idr4_SSE0, irsq_SSE0);
1106 idr6_SSE1 = _mm_mul_pd(idr4_SSE1, irsq_SSE1);
1108 raj_SSE = _mm_load_pd(gb_radius+j);
1109 vaj_SSE = _mm_load_pd(vsolv+j);
1111 rvdw_SSE0 = _mm_add_pd(rai_SSE0, raj_SSE);
1112 rvdw_SSE1 = _mm_add_pd(rai_SSE1, raj_SSE);
1114 ratio_SSE0 = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0, rvdw_SSE0)));
1115 ratio_SSE1 = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1, rvdw_SSE1)));
1117 ratio_SSE0 = _mm_min_pd(ratio_SSE0, still_p5inv_SSE);
1118 ratio_SSE1 = _mm_min_pd(ratio_SSE1, still_p5inv_SSE);
1119 theta_SSE0 = _mm_mul_pd(ratio_SSE0, still_pip5_SSE);
1120 theta_SSE1 = _mm_mul_pd(ratio_SSE1, still_pip5_SSE);
1121 gmx_mm_sincos_pd(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
1122 gmx_mm_sincos_pd(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
1123 term_SSE0 = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE0));
1124 term_SSE1 = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE1));
1125 ccf_SSE0 = _mm_mul_pd(term_SSE0, term_SSE0);
1126 ccf_SSE1 = _mm_mul_pd(term_SSE1, term_SSE1);
1127 dccf_SSE0 = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE0),
1128 _mm_mul_pd(sinq_SSE0, theta_SSE0));
1129 dccf_SSE1 = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE1),
1130 _mm_mul_pd(sinq_SSE1, theta_SSE1));
1132 prod_SSE = _mm_mul_pd(still_p4_SSE, vaj_SSE);
1133 icf4_SSE0 = _mm_mul_pd(ccf_SSE0, idr4_SSE0);
1134 icf4_SSE1 = _mm_mul_pd(ccf_SSE1, idr4_SSE1);
1135 icf6_SSE0 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
1136 icf6_SSE1 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);
1138 _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
1139 _mm_add_pd(_mm_mul_pd(prod_ai_SSE0, icf4_SSE0),
1140 _mm_mul_pd(prod_ai_SSE1, icf4_SSE1))));
1142 gpi_SSE0 = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE, icf4_SSE0));
1143 gpi_SSE1 = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE, icf4_SSE1));
1145 /* Save ai->aj and aj->ai chain rule terms */
1146 _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE0));
1148 _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE1));
1151 _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE0, icf6_SSE0));
1153 _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE1, icf6_SSE1));
/* Horizontal add of the two i-lane accumulators into work[i..i+1] */
1156 GMX_MM_TRANSPOSE2_PD(gpi_SSE0, gpi_SSE1);
1157 gpi_SSE0 = _mm_add_pd(gpi_SSE0, gpi_SSE1);
1158 _mm_store_pd(work+i, _mm_add_pd(gpi_SSE0, _mm_load_pd(work+i)));
1161 /* In case we have written anything beyond natoms, move it back.
1162 * Never mind that we leave stuff above natoms; that will not
1163 * be accessed later in the routine.
1164 * In principle this should be a move rather than sum, but this
1165 * way we dont have to worry about even/odd offsets...
1167 for (i = natoms; i < ni1+1+natoms/2; i++)
1169 work[i-natoms] += work[i];
1172 /* Parallel summations would go here if ever implemented with DD */
1174 factor = 0.5 * ONE_4PI_EPS0;
1175 /* Calculate the radii - should we do all atoms, or just our local ones? */
1176 for (i = 0; i < natoms; i++)
1178 if (born->use[i] != 0)
1180 gpi = born->gpol[i]+work[i];
/* NOTE(review): the gpi2 = gpi*gpi statement is elided in this excerpt */
1182 born->bRad[i] = factor*gmx_invsqrt(gpi2);
1183 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1189 /* Reinstate MSVC optimization */
1191 #pragma optimize("",on)
1196 genborn_allvsall_calc_hct_obc_radii_sse2_double(t_forcerec * fr,
1197 t_mdatoms * mdatoms,
1198 gmx_genborn_t * born,
1200 gmx_localtop_t * top,
1205 gmx_allvsallgb2_data_t *aadata;
1208 int nj0, nj1, nj2, nj3;
1225 double rad, min_rad;
1226 double rai, rai_inv, rai_inv2, sum_ai, sum_ai2, sum_ai3, tsum, tchain;
1228 __m128d ix_SSE0, iy_SSE0, iz_SSE0;
1229 __m128d ix_SSE1, iy_SSE1, iz_SSE1;
1230 __m128d gpi_SSE0, rai_SSE0, prod_ai_SSE0;
1231 __m128d gpi_SSE1, rai_SSE1, prod_ai_SSE1;
1232 __m128d imask_SSE0, jmask_SSE0;
1233 __m128d imask_SSE1, jmask_SSE1;
1234 __m128d jx_SSE, jy_SSE, jz_SSE;
1235 __m128d dx_SSE0, dy_SSE0, dz_SSE0;
1236 __m128d dx_SSE1, dy_SSE1, dz_SSE1;
1237 __m128d rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0;
1238 __m128d rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1;
1239 __m128d raj_SSE, raj_inv_SSE, sk_aj_SSE, sk2_aj_SSE;
1240 __m128d ccf_SSE0, dccf_SSE0, prod_SSE0;
1241 __m128d ccf_SSE1, dccf_SSE1, prod_SSE1;
1242 __m128d icf4_SSE0, icf6_SSE0;
1243 __m128d icf4_SSE1, icf6_SSE1;
1244 __m128d oneeighth_SSE, onefourth_SSE, half_SSE, one_SSE, two_SSE, four_SSE;
1245 __m128d still_p4_SSE, still_p5inv_SSE, still_pip5_SSE;
1246 __m128d rai_inv_SSE0;
1247 __m128d rai_inv_SSE1;
1248 __m128d sk_ai_SSE0, sk2_ai_SSE0, sum_ai_SSE0;
1249 __m128d sk_ai_SSE1, sk2_ai_SSE1, sum_ai_SSE1;
1250 __m128d lij_inv_SSE0, sk2_rinv_SSE0;
1251 __m128d lij_inv_SSE1, sk2_rinv_SSE1;
1254 __m128d t1_SSE0, t2_SSE0, t3_SSE0, t4_SSE0;
1255 __m128d t1_SSE1, t2_SSE1, t3_SSE1, t4_SSE1;
1256 __m128d obc_mask1_SSE0, obc_mask2_SSE0, obc_mask3_SSE0;
1257 __m128d obc_mask1_SSE1, obc_mask2_SSE1, obc_mask3_SSE1;
1258 __m128d uij_SSE0, uij2_SSE0, uij3_SSE0;
1259 __m128d uij_SSE1, uij2_SSE1, uij3_SSE1;
1260 __m128d lij_SSE0, lij2_SSE0, lij3_SSE0;
1261 __m128d lij_SSE1, lij2_SSE1, lij3_SSE1;
1262 __m128d dlij_SSE0, diff2_SSE0, logterm_SSE0;
1263 __m128d dlij_SSE1, diff2_SSE1, logterm_SSE1;
1264 __m128d doffset_SSE, tmpSSE;
1266 natoms = mdatoms->nr;
1268 ni1 = mdatoms->homenr;
1272 aadata = *((gmx_allvsallgb2_data_t **)paadata);
1277 genborn_allvsall_setup(&aadata, top, born, mdatoms, born->gb_doffset,
1278 egbOBC, TRUE, TRUE, TRUE);
1279 *((gmx_allvsallgb2_data_t **)paadata) = aadata;
1282 x_align = aadata->x_align;
1283 y_align = aadata->y_align;
1284 z_align = aadata->z_align;
1286 gb_radius = aadata->gb_radius;
1287 work = aadata->work;
1288 jindex = aadata->jindex_gb;
1290 obc_param = aadata->workparam;
1292 oneeighth_SSE = _mm_set1_pd(0.125);
1293 onefourth_SSE = _mm_set1_pd(0.25);
1294 half_SSE = _mm_set1_pd(0.5);
1295 one_SSE = _mm_set1_pd(1.0);
1296 two_SSE = _mm_set1_pd(2.0);
1297 four_SSE = _mm_set1_pd(4.0);
1298 doffset_SSE = _mm_set1_pd(born->gb_doffset);
1300 for (i = 0; i < natoms; i++)
1302 x_align[i] = x[3*i];
1303 y_align[i] = x[3*i+1];
1304 z_align[i] = x[3*i+2];
1308 for (i = 0; i < natoms/2+1; i++)
1310 x_align[natoms+i] = x_align[i];
1311 y_align[natoms+i] = y_align[i];
1312 z_align[natoms+i] = z_align[i];
1315 for (i = 0; i < natoms+natoms/2+1; i++)
1320 for (i = ni0; i < ni1; i += UNROLLI)
1322 /* We assume shifts are NOT used for all-vs-all interactions */
1324 /* Load i atom data */
1325 ix_SSE0 = _mm_load1_pd(x_align+i);
1326 iy_SSE0 = _mm_load1_pd(y_align+i);
1327 iz_SSE0 = _mm_load1_pd(z_align+i);
1328 ix_SSE1 = _mm_load1_pd(x_align+i+1);
1329 iy_SSE1 = _mm_load1_pd(y_align+i+1);
1330 iz_SSE1 = _mm_load1_pd(z_align+i+1);
1332 rai_SSE0 = _mm_load1_pd(gb_radius+i);
1333 rai_SSE1 = _mm_load1_pd(gb_radius+i+1);
1334 rai_inv_SSE0 = gmx_mm_inv_pd(rai_SSE0);
1335 rai_inv_SSE1 = gmx_mm_inv_pd(rai_SSE1);
1337 sk_ai_SSE0 = _mm_load1_pd(obc_param+i);
1338 sk_ai_SSE1 = _mm_load1_pd(obc_param+i+1);
1339 sk2_ai_SSE0 = _mm_mul_pd(sk_ai_SSE0, sk_ai_SSE0);
1340 sk2_ai_SSE1 = _mm_mul_pd(sk_ai_SSE1, sk_ai_SSE1);
1342 sum_ai_SSE0 = _mm_setzero_pd();
1343 sum_ai_SSE1 = _mm_setzero_pd();
1345 /* Load limits for loop over neighbors */
1347 nj1 = jindex[4*i+1];
1348 nj2 = jindex[4*i+2];
1349 nj3 = jindex[4*i+3];
1351 pmask0 = aadata->prologue_mask_gb[i];
1352 pmask1 = aadata->prologue_mask_gb[i+1];
1353 emask0 = aadata->epilogue_mask[i];
1354 emask1 = aadata->epilogue_mask[i+1];
1356 imask_SSE0 = _mm_load1_pd((double *)(aadata->imask+2*i));
1357 imask_SSE1 = _mm_load1_pd((double *)(aadata->imask+2*i+2));
1359 /* Prologue part, including exclusion mask */
1360 for (j = nj0; j < nj1; j += UNROLLJ)
1362 jmask_SSE0 = _mm_load_pd((double *)pmask0);
1363 jmask_SSE1 = _mm_load_pd((double *)pmask1);
1364 pmask0 += 2*UNROLLJ;
1365 pmask1 += 2*UNROLLJ;
1367 /* load j atom coordinates */
1368 jx_SSE = _mm_load_pd(x_align+j);
1369 jy_SSE = _mm_load_pd(y_align+j);
1370 jz_SSE = _mm_load_pd(z_align+j);
1372 /* Calculate distance */
1373 dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE);
1374 dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE);
1375 dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE);
1376 dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE);
1377 dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE);
1378 dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE);
1380 /* rsq = dx*dx+dy*dy+dz*dz */
1381 rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
1382 rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);
1385 jmask_SSE0 = _mm_and_pd(jmask_SSE0, imask_SSE0);
1386 jmask_SSE1 = _mm_and_pd(jmask_SSE1, imask_SSE1);
1388 /* Calculate 1/r and 1/r2 */
1389 rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
1390 rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);
1393 rinv_SSE0 = _mm_and_pd(rinv_SSE0, jmask_SSE0);
1394 rinv_SSE1 = _mm_and_pd(rinv_SSE1, jmask_SSE1);
1396 dr_SSE0 = _mm_mul_pd(rsq_SSE0, rinv_SSE0);
1397 dr_SSE1 = _mm_mul_pd(rsq_SSE1, rinv_SSE1);
1399 sk_aj_SSE = _mm_load_pd(obc_param+j);
1400 raj_SSE = _mm_load_pd(gb_radius+j);
1401 raj_inv_SSE = gmx_mm_inv_pd(raj_SSE);
1403 /* Evaluate influence of atom aj -> ai */
1404 t1_SSE0 = _mm_add_pd(dr_SSE0, sk_aj_SSE);
1405 t1_SSE1 = _mm_add_pd(dr_SSE1, sk_aj_SSE);
1406 t2_SSE0 = _mm_sub_pd(dr_SSE0, sk_aj_SSE);
1407 t2_SSE1 = _mm_sub_pd(dr_SSE1, sk_aj_SSE);
1408 t3_SSE0 = _mm_sub_pd(sk_aj_SSE, dr_SSE0);
1409 t3_SSE1 = _mm_sub_pd(sk_aj_SSE, dr_SSE1);
1411 obc_mask1_SSE0 = _mm_cmplt_pd(rai_SSE0, t1_SSE0);
1412 obc_mask1_SSE1 = _mm_cmplt_pd(rai_SSE1, t1_SSE1);
1413 obc_mask2_SSE0 = _mm_cmplt_pd(rai_SSE0, t2_SSE0);
1414 obc_mask2_SSE1 = _mm_cmplt_pd(rai_SSE1, t2_SSE1);
1415 obc_mask3_SSE0 = _mm_cmplt_pd(rai_SSE0, t3_SSE0);
1416 obc_mask3_SSE1 = _mm_cmplt_pd(rai_SSE1, t3_SSE1);
1417 obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0, jmask_SSE0);
1418 obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1, jmask_SSE1);
1420 uij_SSE0 = gmx_mm_inv_pd(t1_SSE0);
1421 uij_SSE1 = gmx_mm_inv_pd(t1_SSE1);
1422 lij_SSE0 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
1423 _mm_andnot_pd(obc_mask2_SSE0, rai_inv_SSE0));
1424 lij_SSE1 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
1425 _mm_andnot_pd(obc_mask2_SSE1, rai_inv_SSE1));
1426 dlij_SSE0 = _mm_and_pd(one_SSE, obc_mask2_SSE0);
1427 dlij_SSE1 = _mm_and_pd(one_SSE, obc_mask2_SSE1);
1429 uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
1430 uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
1431 uij3_SSE0 = _mm_mul_pd(uij2_SSE0, uij_SSE0);
1432 uij3_SSE1 = _mm_mul_pd(uij2_SSE1, uij_SSE1);
1433 lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
1434 lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
1435 lij3_SSE0 = _mm_mul_pd(lij2_SSE0, lij_SSE0);
1436 lij3_SSE1 = _mm_mul_pd(lij2_SSE1, lij_SSE1);
1438 diff2_SSE0 = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
1439 diff2_SSE1 = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
1440 lij_inv_SSE0 = gmx_mm_invsqrt_pd(lij2_SSE0);
1441 lij_inv_SSE1 = gmx_mm_invsqrt_pd(lij2_SSE1);
1442 sk2_aj_SSE = _mm_mul_pd(sk_aj_SSE, sk_aj_SSE);
1443 sk2_rinv_SSE0 = _mm_mul_pd(sk2_aj_SSE, rinv_SSE0);
1444 sk2_rinv_SSE1 = _mm_mul_pd(sk2_aj_SSE, rinv_SSE1);
1445 prod_SSE0 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
1446 prod_SSE1 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);
1448 logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
1449 logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
1451 t1_SSE0 = _mm_sub_pd(lij_SSE0, uij_SSE0);
1452 t1_SSE1 = _mm_sub_pd(lij_SSE1, uij_SSE1);
1453 t2_SSE0 = _mm_mul_pd(diff2_SSE0,
1454 _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
1456 t2_SSE1 = _mm_mul_pd(diff2_SSE1,
1457 _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
1460 t3_SSE0 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
1461 t3_SSE1 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
1462 t1_SSE0 = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
1463 t1_SSE1 = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
1464 t4_SSE0 = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE0, lij_SSE0));
1465 t4_SSE1 = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE1, lij_SSE1));
1466 t4_SSE0 = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
1467 t4_SSE1 = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
1468 t1_SSE0 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
1469 t1_SSE1 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));
1471 sum_ai_SSE0 = _mm_add_pd(sum_ai_SSE0, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
1472 sum_ai_SSE1 = _mm_add_pd(sum_ai_SSE1, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
1474 t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
1475 _mm_mul_pd(prod_SSE0, lij3_SSE0));
1476 t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
1477 _mm_mul_pd(prod_SSE1, lij3_SSE1));
1478 t1_SSE0 = _mm_sub_pd(t1_SSE0,
1479 _mm_mul_pd(onefourth_SSE,
1480 _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
1481 _mm_mul_pd(lij3_SSE0, dr_SSE0))));
1482 t1_SSE1 = _mm_sub_pd(t1_SSE1,
1483 _mm_mul_pd(onefourth_SSE,
1484 _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
1485 _mm_mul_pd(lij3_SSE1, dr_SSE1))));
1487 t2_SSE0 = _mm_mul_pd(onefourth_SSE,
1488 _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
1489 _mm_mul_pd(uij3_SSE0, dr_SSE0)));
1490 t2_SSE1 = _mm_mul_pd(onefourth_SSE,
1491 _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
1492 _mm_mul_pd(uij3_SSE1, dr_SSE1)));
1493 t2_SSE0 = _mm_sub_pd(t2_SSE0,
1494 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
1495 _mm_mul_pd(prod_SSE0, uij3_SSE0)));
1496 t2_SSE1 = _mm_sub_pd(t2_SSE1,
1497 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
1498 _mm_mul_pd(prod_SSE1, uij3_SSE1)));
1499 t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
1500 _mm_mul_pd(rinv_SSE0, rinv_SSE0));
1501 t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
1502 _mm_mul_pd(rinv_SSE1, rinv_SSE1));
1503 t3_SSE0 = _mm_sub_pd(t3_SSE0,
1504 _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
1506 _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
1507 t3_SSE1 = _mm_sub_pd(t3_SSE1,
1508 _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
1510 _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));
1512 t1_SSE0 = _mm_mul_pd(rinv_SSE0,
1513 _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
1514 _mm_add_pd(t2_SSE0, t3_SSE0)));
1515 t1_SSE1 = _mm_mul_pd(rinv_SSE1,
1516 _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
1517 _mm_add_pd(t2_SSE1, t3_SSE1)));
1519 _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
1521 _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
1524 /* Evaluate influence of atom ai -> aj */
1525 t1_SSE0 = _mm_add_pd(dr_SSE0, sk_ai_SSE0);
1526 t1_SSE1 = _mm_add_pd(dr_SSE1, sk_ai_SSE1);
1527 t2_SSE0 = _mm_sub_pd(dr_SSE0, sk_ai_SSE0);
1528 t2_SSE1 = _mm_sub_pd(dr_SSE1, sk_ai_SSE1);
1529 t3_SSE0 = _mm_sub_pd(sk_ai_SSE0, dr_SSE0);
1530 t3_SSE1 = _mm_sub_pd(sk_ai_SSE1, dr_SSE1);
1532 obc_mask1_SSE0 = _mm_cmplt_pd(raj_SSE, t1_SSE0);
1533 obc_mask1_SSE1 = _mm_cmplt_pd(raj_SSE, t1_SSE1);
1534 obc_mask2_SSE0 = _mm_cmplt_pd(raj_SSE, t2_SSE0);
1535 obc_mask2_SSE1 = _mm_cmplt_pd(raj_SSE, t2_SSE1);
1536 obc_mask3_SSE0 = _mm_cmplt_pd(raj_SSE, t3_SSE0);
1537 obc_mask3_SSE1 = _mm_cmplt_pd(raj_SSE, t3_SSE1);
1538 obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0, jmask_SSE0);
1539 obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1, jmask_SSE1);
1541 uij_SSE0 = gmx_mm_inv_pd(t1_SSE0);
1542 uij_SSE1 = gmx_mm_inv_pd(t1_SSE1);
1543 lij_SSE0 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
1544 _mm_andnot_pd(obc_mask2_SSE0, raj_inv_SSE));
1545 lij_SSE1 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
1546 _mm_andnot_pd(obc_mask2_SSE1, raj_inv_SSE));
1547 dlij_SSE0 = _mm_and_pd(one_SSE, obc_mask2_SSE0);
1548 dlij_SSE1 = _mm_and_pd(one_SSE, obc_mask2_SSE1);
1550 uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
1551 uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
1552 uij3_SSE0 = _mm_mul_pd(uij2_SSE0, uij_SSE0);
1553 uij3_SSE1 = _mm_mul_pd(uij2_SSE1, uij_SSE1);
1554 lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
1555 lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
1556 lij3_SSE0 = _mm_mul_pd(lij2_SSE0, lij_SSE0);
1557 lij3_SSE1 = _mm_mul_pd(lij2_SSE1, lij_SSE1);
1559 diff2_SSE0 = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
1560 diff2_SSE1 = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
1561 lij_inv_SSE0 = gmx_mm_invsqrt_pd(lij2_SSE0);
1562 lij_inv_SSE1 = gmx_mm_invsqrt_pd(lij2_SSE1);
1563 sk2_rinv_SSE0 = _mm_mul_pd(sk2_ai_SSE0, rinv_SSE0);
1564 sk2_rinv_SSE1 = _mm_mul_pd(sk2_ai_SSE1, rinv_SSE1);
1565 prod_SSE0 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
1566 prod_SSE1 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);
1568 logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
1569 logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
1570 t1_SSE0 = _mm_sub_pd(lij_SSE0, uij_SSE0);
1571 t1_SSE1 = _mm_sub_pd(lij_SSE1, uij_SSE1);
1572 t2_SSE0 = _mm_mul_pd(diff2_SSE0,
1573 _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
1575 t2_SSE1 = _mm_mul_pd(diff2_SSE1,
1576 _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
1578 t3_SSE0 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
1579 t3_SSE1 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
1580 t1_SSE0 = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
1581 t1_SSE1 = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
1582 t4_SSE0 = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE0));
1583 t4_SSE1 = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE1));
1584 t4_SSE0 = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
1585 t4_SSE1 = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
1586 t1_SSE0 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
1587 t1_SSE1 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));
1589 _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
1590 _mm_add_pd(_mm_and_pd(t1_SSE0, obc_mask1_SSE0),
1591 _mm_and_pd(t1_SSE1, obc_mask1_SSE1))));
1593 t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
1594 _mm_mul_pd(prod_SSE0, lij3_SSE0));
1595 t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
1596 _mm_mul_pd(prod_SSE1, lij3_SSE1));
1597 t1_SSE0 = _mm_sub_pd(t1_SSE0,
1598 _mm_mul_pd(onefourth_SSE,
1599 _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
1600 _mm_mul_pd(lij3_SSE0, dr_SSE0))));
1601 t1_SSE1 = _mm_sub_pd(t1_SSE1,
1602 _mm_mul_pd(onefourth_SSE,
1603 _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
1604 _mm_mul_pd(lij3_SSE1, dr_SSE1))));
1605 t2_SSE0 = _mm_mul_pd(onefourth_SSE,
1606 _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
1607 _mm_mul_pd(uij3_SSE0, dr_SSE0)));
1608 t2_SSE1 = _mm_mul_pd(onefourth_SSE,
1609 _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
1610 _mm_mul_pd(uij3_SSE1, dr_SSE1)));
1611 t2_SSE0 = _mm_sub_pd(t2_SSE0,
1612 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
1613 _mm_mul_pd(prod_SSE0, uij3_SSE0)));
1614 t2_SSE1 = _mm_sub_pd(t2_SSE1,
1615 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
1616 _mm_mul_pd(prod_SSE1, uij3_SSE1)));
1618 t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
1619 _mm_mul_pd(rinv_SSE0, rinv_SSE0));
1620 t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
1621 _mm_mul_pd(rinv_SSE1, rinv_SSE1));
1623 t3_SSE0 = _mm_sub_pd(t3_SSE0,
1624 _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
1626 _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
1627 t3_SSE1 = _mm_sub_pd(t3_SSE1,
1628 _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
1630 _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));
1633 t1_SSE0 = _mm_mul_pd(rinv_SSE0,
1634 _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
1635 _mm_add_pd(t2_SSE0, t3_SSE0)));
1636 t1_SSE1 = _mm_mul_pd(rinv_SSE1,
1637 _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
1638 _mm_add_pd(t2_SSE1, t3_SSE1)));
1640 _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
1642 _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
1646 /* Main part, no exclusions */
1647 for (j = nj1; j < nj2; j += UNROLLJ)
1649 /* load j atom coordinates */
1650 jx_SSE = _mm_load_pd(x_align+j);
1651 jy_SSE = _mm_load_pd(y_align+j);
1652 jz_SSE = _mm_load_pd(z_align+j);
1654 /* Calculate distance */
1655 dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE);
1656 dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE);
1657 dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE);
1658 dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE);
1659 dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE);
1660 dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE);
1662 /* rsq = dx*dx+dy*dy+dz*dz */
1663 rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
1664 rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);
1666 /* Calculate 1/r and 1/r2 */
1667 rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
1668 rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);
1671 rinv_SSE0 = _mm_and_pd(rinv_SSE0, imask_SSE0);
1672 rinv_SSE1 = _mm_and_pd(rinv_SSE1, imask_SSE1);
1674 dr_SSE0 = _mm_mul_pd(rsq_SSE0, rinv_SSE0);
1675 dr_SSE1 = _mm_mul_pd(rsq_SSE1, rinv_SSE1);
1677 sk_aj_SSE = _mm_load_pd(obc_param+j);
1678 raj_SSE = _mm_load_pd(gb_radius+j);
1680 raj_inv_SSE = gmx_mm_inv_pd(raj_SSE);
1682 /* Evaluate influence of atom aj -> ai */
1683 t1_SSE0 = _mm_add_pd(dr_SSE0, sk_aj_SSE);
1684 t1_SSE1 = _mm_add_pd(dr_SSE1, sk_aj_SSE);
1685 t2_SSE0 = _mm_sub_pd(dr_SSE0, sk_aj_SSE);
1686 t2_SSE1 = _mm_sub_pd(dr_SSE1, sk_aj_SSE);
1687 t3_SSE0 = _mm_sub_pd(sk_aj_SSE, dr_SSE0);
1688 t3_SSE1 = _mm_sub_pd(sk_aj_SSE, dr_SSE1);
1690 obc_mask1_SSE0 = _mm_cmplt_pd(rai_SSE0, t1_SSE0);
1691 obc_mask1_SSE1 = _mm_cmplt_pd(rai_SSE1, t1_SSE1);
1692 obc_mask2_SSE0 = _mm_cmplt_pd(rai_SSE0, t2_SSE0);
1693 obc_mask2_SSE1 = _mm_cmplt_pd(rai_SSE1, t2_SSE1);
1694 obc_mask3_SSE0 = _mm_cmplt_pd(rai_SSE0, t3_SSE0);
1695 obc_mask3_SSE1 = _mm_cmplt_pd(rai_SSE1, t3_SSE1);
1696 obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0, imask_SSE0);
1697 obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1, imask_SSE1);
1699 uij_SSE0 = gmx_mm_inv_pd(t1_SSE0);
1700 uij_SSE1 = gmx_mm_inv_pd(t1_SSE1);
1701 lij_SSE0 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
1702 _mm_andnot_pd(obc_mask2_SSE0, rai_inv_SSE0));
1703 lij_SSE1 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
1704 _mm_andnot_pd(obc_mask2_SSE1, rai_inv_SSE1));
1705 dlij_SSE0 = _mm_and_pd(one_SSE, obc_mask2_SSE0);
1706 dlij_SSE1 = _mm_and_pd(one_SSE, obc_mask2_SSE1);
1708 uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
1709 uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
1710 uij3_SSE0 = _mm_mul_pd(uij2_SSE0, uij_SSE0);
1711 uij3_SSE1 = _mm_mul_pd(uij2_SSE1, uij_SSE1);
1712 lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
1713 lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
1714 lij3_SSE0 = _mm_mul_pd(lij2_SSE0, lij_SSE0);
1715 lij3_SSE1 = _mm_mul_pd(lij2_SSE1, lij_SSE1);
1717 diff2_SSE0 = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
1718 diff2_SSE1 = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
1719 lij_inv_SSE0 = gmx_mm_invsqrt_pd(lij2_SSE0);
1720 lij_inv_SSE1 = gmx_mm_invsqrt_pd(lij2_SSE1);
1721 sk2_aj_SSE = _mm_mul_pd(sk_aj_SSE, sk_aj_SSE);
1722 sk2_rinv_SSE0 = _mm_mul_pd(sk2_aj_SSE, rinv_SSE0);
1723 sk2_rinv_SSE1 = _mm_mul_pd(sk2_aj_SSE, rinv_SSE1);
1724 prod_SSE0 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
1725 prod_SSE1 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);
1727 logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
1728 logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
1730 t1_SSE0 = _mm_sub_pd(lij_SSE0, uij_SSE0);
1731 t1_SSE1 = _mm_sub_pd(lij_SSE1, uij_SSE1);
1732 t2_SSE0 = _mm_mul_pd(diff2_SSE0,
1733 _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
1735 t2_SSE1 = _mm_mul_pd(diff2_SSE1,
1736 _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
1739 t3_SSE0 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
1740 t3_SSE1 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
1741 t1_SSE0 = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
1742 t1_SSE1 = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
1743 t4_SSE0 = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE0, lij_SSE0));
1744 t4_SSE1 = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE1, lij_SSE1));
1745 t4_SSE0 = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
1746 t4_SSE1 = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
1747 t1_SSE0 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
1748 t1_SSE1 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));
1750 sum_ai_SSE0 = _mm_add_pd(sum_ai_SSE0, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
1751 sum_ai_SSE1 = _mm_add_pd(sum_ai_SSE1, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
1753 t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
1754 _mm_mul_pd(prod_SSE0, lij3_SSE0));
1755 t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
1756 _mm_mul_pd(prod_SSE1, lij3_SSE1));
1758 t1_SSE0 = _mm_sub_pd(t1_SSE0,
1759 _mm_mul_pd(onefourth_SSE,
1760 _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
1761 _mm_mul_pd(lij3_SSE0, dr_SSE0))));
1762 t1_SSE1 = _mm_sub_pd(t1_SSE1,
1763 _mm_mul_pd(onefourth_SSE,
1764 _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
1765 _mm_mul_pd(lij3_SSE1, dr_SSE1))));
1767 t2_SSE0 = _mm_mul_pd(onefourth_SSE,
1768 _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
1769 _mm_mul_pd(uij3_SSE0, dr_SSE0)));
1770 t2_SSE1 = _mm_mul_pd(onefourth_SSE,
1771 _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
1772 _mm_mul_pd(uij3_SSE1, dr_SSE1)));
1773 t2_SSE0 = _mm_sub_pd(t2_SSE0,
1774 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
1775 _mm_mul_pd(prod_SSE0, uij3_SSE0)));
1776 t2_SSE1 = _mm_sub_pd(t2_SSE1,
1777 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
1778 _mm_mul_pd(prod_SSE1, uij3_SSE1)));
1779 t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
1780 _mm_mul_pd(rinv_SSE0, rinv_SSE0));
1781 t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
1782 _mm_mul_pd(rinv_SSE1, rinv_SSE1));
1783 t3_SSE0 = _mm_sub_pd(t3_SSE0,
1784 _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
1786 _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
1787 t3_SSE1 = _mm_sub_pd(t3_SSE1,
1788 _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
1790 _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));
1792 t1_SSE0 = _mm_mul_pd(rinv_SSE0,
1793 _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
1794 _mm_add_pd(t2_SSE0, t3_SSE0)));
1795 t1_SSE1 = _mm_mul_pd(rinv_SSE1,
1796 _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
1797 _mm_add_pd(t2_SSE1, t3_SSE1)));
1799 _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
1801 _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
1804 /* Evaluate influence of atom ai -> aj */
1805 t1_SSE0 = _mm_add_pd(dr_SSE0, sk_ai_SSE0);
1806 t1_SSE1 = _mm_add_pd(dr_SSE1, sk_ai_SSE1);
1807 t2_SSE0 = _mm_sub_pd(dr_SSE0, sk_ai_SSE0);
1808 t2_SSE1 = _mm_sub_pd(dr_SSE1, sk_ai_SSE1);
1809 t3_SSE0 = _mm_sub_pd(sk_ai_SSE0, dr_SSE0);
1810 t3_SSE1 = _mm_sub_pd(sk_ai_SSE1, dr_SSE1);
1812 obc_mask1_SSE0 = _mm_cmplt_pd(raj_SSE, t1_SSE0);
1813 obc_mask1_SSE1 = _mm_cmplt_pd(raj_SSE, t1_SSE1);
1814 obc_mask2_SSE0 = _mm_cmplt_pd(raj_SSE, t2_SSE0);
1815 obc_mask2_SSE1 = _mm_cmplt_pd(raj_SSE, t2_SSE1);
1816 obc_mask3_SSE0 = _mm_cmplt_pd(raj_SSE, t3_SSE0);
1817 obc_mask3_SSE1 = _mm_cmplt_pd(raj_SSE, t3_SSE1);
1818 obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0, imask_SSE0);
1819 obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1, imask_SSE1);
1821 uij_SSE0 = gmx_mm_inv_pd(t1_SSE0);
1822 uij_SSE1 = gmx_mm_inv_pd(t1_SSE1);
1823 lij_SSE0 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
1824 _mm_andnot_pd(obc_mask2_SSE0, raj_inv_SSE));
1825 lij_SSE1 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
1826 _mm_andnot_pd(obc_mask2_SSE1, raj_inv_SSE));
1827 dlij_SSE0 = _mm_and_pd(one_SSE, obc_mask2_SSE0);
1828 dlij_SSE1 = _mm_and_pd(one_SSE, obc_mask2_SSE1);
1830 uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
1831 uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
1832 uij3_SSE0 = _mm_mul_pd(uij2_SSE0, uij_SSE0);
1833 uij3_SSE1 = _mm_mul_pd(uij2_SSE1, uij_SSE1);
1834 lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
1835 lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
1836 lij3_SSE0 = _mm_mul_pd(lij2_SSE0, lij_SSE0);
1837 lij3_SSE1 = _mm_mul_pd(lij2_SSE1, lij_SSE1);
1839 diff2_SSE0 = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
1840 diff2_SSE1 = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
1841 lij_inv_SSE0 = gmx_mm_invsqrt_pd(lij2_SSE0);
1842 lij_inv_SSE1 = gmx_mm_invsqrt_pd(lij2_SSE1);
1843 sk2_rinv_SSE0 = _mm_mul_pd(sk2_ai_SSE0, rinv_SSE0);
1844 sk2_rinv_SSE1 = _mm_mul_pd(sk2_ai_SSE1, rinv_SSE1);
1845 prod_SSE0 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
1846 prod_SSE1 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);
1848 logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
1849 logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
1850 t1_SSE0 = _mm_sub_pd(lij_SSE0, uij_SSE0);
1851 t1_SSE1 = _mm_sub_pd(lij_SSE1, uij_SSE1);
1852 t2_SSE0 = _mm_mul_pd(diff2_SSE0,
1853 _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
1855 t2_SSE1 = _mm_mul_pd(diff2_SSE1,
1856 _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
1858 t3_SSE0 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
1859 t3_SSE1 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
1860 t1_SSE0 = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
1861 t1_SSE1 = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
1862 t4_SSE0 = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE0));
1863 t4_SSE1 = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE1));
1864 t4_SSE0 = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
1865 t4_SSE1 = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
1866 t1_SSE0 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
1867 t1_SSE1 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));
1869 _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
1870 _mm_add_pd(_mm_and_pd(t1_SSE0, obc_mask1_SSE0),
1871 _mm_and_pd(t1_SSE1, obc_mask1_SSE1))));
1873 t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
1874 _mm_mul_pd(prod_SSE0, lij3_SSE0));
1875 t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
1876 _mm_mul_pd(prod_SSE1, lij3_SSE1));
1877 t1_SSE0 = _mm_sub_pd(t1_SSE0,
1878 _mm_mul_pd(onefourth_SSE,
1879 _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
1880 _mm_mul_pd(lij3_SSE0, dr_SSE0))));
1881 t1_SSE1 = _mm_sub_pd(t1_SSE1,
1882 _mm_mul_pd(onefourth_SSE,
1883 _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
1884 _mm_mul_pd(lij3_SSE1, dr_SSE1))));
1885 t2_SSE0 = _mm_mul_pd(onefourth_SSE,
1886 _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
1887 _mm_mul_pd(uij3_SSE0, dr_SSE0)));
1888 t2_SSE1 = _mm_mul_pd(onefourth_SSE,
1889 _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
1890 _mm_mul_pd(uij3_SSE1, dr_SSE1)));
1891 t2_SSE0 = _mm_sub_pd(t2_SSE0,
1892 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
1893 _mm_mul_pd(prod_SSE0, uij3_SSE0)));
1894 t2_SSE1 = _mm_sub_pd(t2_SSE1,
1895 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
1896 _mm_mul_pd(prod_SSE1, uij3_SSE1)));
1898 t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
1899 _mm_mul_pd(rinv_SSE0, rinv_SSE0));
1900 t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
1901 _mm_mul_pd(rinv_SSE1, rinv_SSE1));
1903 t3_SSE0 = _mm_sub_pd(t3_SSE0,
1904 _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
1906 _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
1907 t3_SSE1 = _mm_sub_pd(t3_SSE1,
1908 _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
1910 _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));
1912 t1_SSE0 = _mm_mul_pd(rinv_SSE0,
1913 _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
1914 _mm_add_pd(t2_SSE0, t3_SSE0)));
1915 t1_SSE1 = _mm_mul_pd(rinv_SSE1,
1916 _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
1917 _mm_add_pd(t2_SSE1, t3_SSE1)));
1919 _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
1921 _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
1925 /* Epilogue part, including exclusion mask */
1926 for (j = nj2; j < nj3; j += UNROLLJ)
1928 jmask_SSE0 = _mm_load_pd((double *)emask0);
1929 jmask_SSE1 = _mm_load_pd((double *)emask1);
1930 emask0 += 2*UNROLLJ;
1931 emask1 += 2*UNROLLJ;
1933 /* load j atom coordinates */
1934 jx_SSE = _mm_load_pd(x_align+j);
1935 jy_SSE = _mm_load_pd(y_align+j);
1936 jz_SSE = _mm_load_pd(z_align+j);
1938 /* Calculate distance */
1939 dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE);
1940 dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE);
1941 dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE);
1942 dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE);
1943 dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE);
1944 dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE);
1946 /* rsq = dx*dx+dy*dy+dz*dz */
1947 rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
1948 rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);
1951 jmask_SSE0 = _mm_and_pd(jmask_SSE0, imask_SSE0);
1952 jmask_SSE1 = _mm_and_pd(jmask_SSE1, imask_SSE1);
1954 /* Calculate 1/r and 1/r2 */
1955 rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
1956 rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);
1959 rinv_SSE0 = _mm_and_pd(rinv_SSE0, jmask_SSE0);
1960 rinv_SSE1 = _mm_and_pd(rinv_SSE1, jmask_SSE1);
1962 dr_SSE0 = _mm_mul_pd(rsq_SSE0, rinv_SSE0);
1963 dr_SSE1 = _mm_mul_pd(rsq_SSE1, rinv_SSE1);
1965 sk_aj_SSE = _mm_load_pd(obc_param+j);
1966 raj_SSE = _mm_load_pd(gb_radius+j);
1968 raj_inv_SSE = gmx_mm_inv_pd(raj_SSE);
1970 /* Evaluate influence of atom aj -> ai */
1971 t1_SSE0 = _mm_add_pd(dr_SSE0, sk_aj_SSE);
1972 t1_SSE1 = _mm_add_pd(dr_SSE1, sk_aj_SSE);
1973 t2_SSE0 = _mm_sub_pd(dr_SSE0, sk_aj_SSE);
1974 t2_SSE1 = _mm_sub_pd(dr_SSE1, sk_aj_SSE);
1975 t3_SSE0 = _mm_sub_pd(sk_aj_SSE, dr_SSE0);
1976 t3_SSE1 = _mm_sub_pd(sk_aj_SSE, dr_SSE1);
1978 obc_mask1_SSE0 = _mm_cmplt_pd(rai_SSE0, t1_SSE0);
1979 obc_mask1_SSE1 = _mm_cmplt_pd(rai_SSE1, t1_SSE1);
1980 obc_mask2_SSE0 = _mm_cmplt_pd(rai_SSE0, t2_SSE0);
1981 obc_mask2_SSE1 = _mm_cmplt_pd(rai_SSE1, t2_SSE1);
1982 obc_mask3_SSE0 = _mm_cmplt_pd(rai_SSE0, t3_SSE0);
1983 obc_mask3_SSE1 = _mm_cmplt_pd(rai_SSE1, t3_SSE1);
1984 obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0, jmask_SSE0);
1985 obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1, jmask_SSE1);
1987 uij_SSE0 = gmx_mm_inv_pd(t1_SSE0);
1988 uij_SSE1 = gmx_mm_inv_pd(t1_SSE1);
1989 lij_SSE0 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
1990 _mm_andnot_pd(obc_mask2_SSE0, rai_inv_SSE0));
1991 lij_SSE1 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
1992 _mm_andnot_pd(obc_mask2_SSE1, rai_inv_SSE1));
1994 dlij_SSE0 = _mm_and_pd(one_SSE, obc_mask2_SSE0);
1995 dlij_SSE1 = _mm_and_pd(one_SSE, obc_mask2_SSE1);
1997 uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
1998 uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
1999 uij3_SSE0 = _mm_mul_pd(uij2_SSE0, uij_SSE0);
2000 uij3_SSE1 = _mm_mul_pd(uij2_SSE1, uij_SSE1);
2001 lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
2002 lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
2003 lij3_SSE0 = _mm_mul_pd(lij2_SSE0, lij_SSE0);
2004 lij3_SSE1 = _mm_mul_pd(lij2_SSE1, lij_SSE1);
2006 diff2_SSE0 = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
2007 diff2_SSE1 = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
2008 lij_inv_SSE0 = gmx_mm_invsqrt_pd(lij2_SSE0);
2009 lij_inv_SSE1 = gmx_mm_invsqrt_pd(lij2_SSE1);
2010 sk2_aj_SSE = _mm_mul_pd(sk_aj_SSE, sk_aj_SSE);
2011 sk2_rinv_SSE0 = _mm_mul_pd(sk2_aj_SSE, rinv_SSE0);
2012 sk2_rinv_SSE1 = _mm_mul_pd(sk2_aj_SSE, rinv_SSE1);
2013 prod_SSE0 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
2014 prod_SSE1 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);
2016 logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
2017 logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
2019 t1_SSE0 = _mm_sub_pd(lij_SSE0, uij_SSE0);
2020 t1_SSE1 = _mm_sub_pd(lij_SSE1, uij_SSE1);
2021 t2_SSE0 = _mm_mul_pd(diff2_SSE0,
2022 _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
2024 t2_SSE1 = _mm_mul_pd(diff2_SSE1,
2025 _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
2028 t3_SSE0 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
2029 t3_SSE1 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
2030 t1_SSE0 = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
2031 t1_SSE1 = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
2032 t4_SSE0 = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE0, lij_SSE0));
2033 t4_SSE1 = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE1, lij_SSE1));
2034 t4_SSE0 = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
2035 t4_SSE1 = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
2036 t1_SSE0 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
2037 t1_SSE1 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));
2039 sum_ai_SSE0 = _mm_add_pd(sum_ai_SSE0, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
2040 sum_ai_SSE1 = _mm_add_pd(sum_ai_SSE1, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
2042 t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
2043 _mm_mul_pd(prod_SSE0, lij3_SSE0));
2044 t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
2045 _mm_mul_pd(prod_SSE1, lij3_SSE1));
2046 t1_SSE0 = _mm_sub_pd(t1_SSE0,
2047 _mm_mul_pd(onefourth_SSE,
2048 _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
2049 _mm_mul_pd(lij3_SSE0, dr_SSE0))));
2050 t1_SSE1 = _mm_sub_pd(t1_SSE1,
2051 _mm_mul_pd(onefourth_SSE,
2052 _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
2053 _mm_mul_pd(lij3_SSE1, dr_SSE1))));
2055 t2_SSE0 = _mm_mul_pd(onefourth_SSE,
2056 _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
2057 _mm_mul_pd(uij3_SSE0, dr_SSE0)));
2058 t2_SSE1 = _mm_mul_pd(onefourth_SSE,
2059 _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
2060 _mm_mul_pd(uij3_SSE1, dr_SSE1)));
2061 t2_SSE0 = _mm_sub_pd(t2_SSE0,
2062 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
2063 _mm_mul_pd(prod_SSE0, uij3_SSE0)));
2064 t2_SSE1 = _mm_sub_pd(t2_SSE1,
2065 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
2066 _mm_mul_pd(prod_SSE1, uij3_SSE1)));
2067 t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
2068 _mm_mul_pd(rinv_SSE0, rinv_SSE0));
2069 t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
2070 _mm_mul_pd(rinv_SSE1, rinv_SSE1));
2071 t3_SSE0 = _mm_sub_pd(t3_SSE0,
2072 _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
2074 _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
2075 t3_SSE1 = _mm_sub_pd(t3_SSE1,
2076 _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
2078 _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));
2080 t1_SSE0 = _mm_mul_pd(rinv_SSE0,
2081 _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
2082 _mm_add_pd(t2_SSE0, t3_SSE0)));
2083 t1_SSE1 = _mm_mul_pd(rinv_SSE1,
2084 _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
2085 _mm_add_pd(t2_SSE1, t3_SSE1)));
2087 _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
2089 _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
2092 /* Evaluate influence of atom ai -> aj */
2093 t1_SSE0 = _mm_add_pd(dr_SSE0, sk_ai_SSE0);
2094 t1_SSE1 = _mm_add_pd(dr_SSE1, sk_ai_SSE1);
2095 t2_SSE0 = _mm_sub_pd(dr_SSE0, sk_ai_SSE0);
2096 t2_SSE1 = _mm_sub_pd(dr_SSE1, sk_ai_SSE1);
2097 t3_SSE0 = _mm_sub_pd(sk_ai_SSE0, dr_SSE0);
2098 t3_SSE1 = _mm_sub_pd(sk_ai_SSE1, dr_SSE1);
2100 obc_mask1_SSE0 = _mm_cmplt_pd(raj_SSE, t1_SSE0);
2101 obc_mask1_SSE1 = _mm_cmplt_pd(raj_SSE, t1_SSE1);
2102 obc_mask2_SSE0 = _mm_cmplt_pd(raj_SSE, t2_SSE0);
2103 obc_mask2_SSE1 = _mm_cmplt_pd(raj_SSE, t2_SSE1);
2104 obc_mask3_SSE0 = _mm_cmplt_pd(raj_SSE, t3_SSE0);
2105 obc_mask3_SSE1 = _mm_cmplt_pd(raj_SSE, t3_SSE1);
2106 obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0, jmask_SSE0);
2107 obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1, jmask_SSE1);
2109 uij_SSE0 = gmx_mm_inv_pd(t1_SSE0);
2110 uij_SSE1 = gmx_mm_inv_pd(t1_SSE1);
2111 lij_SSE0 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
2112 _mm_andnot_pd(obc_mask2_SSE0, raj_inv_SSE));
2113 lij_SSE1 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
2114 _mm_andnot_pd(obc_mask2_SSE1, raj_inv_SSE));
2116 dlij_SSE0 = _mm_and_pd(one_SSE, obc_mask2_SSE0);
2117 dlij_SSE1 = _mm_and_pd(one_SSE, obc_mask2_SSE1);
2119 uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
2120 uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
2121 uij3_SSE0 = _mm_mul_pd(uij2_SSE0, uij_SSE0);
2122 uij3_SSE1 = _mm_mul_pd(uij2_SSE1, uij_SSE1);
2123 lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
2124 lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
2125 lij3_SSE0 = _mm_mul_pd(lij2_SSE0, lij_SSE0);
2126 lij3_SSE1 = _mm_mul_pd(lij2_SSE1, lij_SSE1);
2128 diff2_SSE0 = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
2129 diff2_SSE1 = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
2130 lij_inv_SSE0 = gmx_mm_invsqrt_pd(lij2_SSE0);
2131 lij_inv_SSE1 = gmx_mm_invsqrt_pd(lij2_SSE1);
2132 sk2_rinv_SSE0 = _mm_mul_pd(sk2_ai_SSE0, rinv_SSE0);
2133 sk2_rinv_SSE1 = _mm_mul_pd(sk2_ai_SSE1, rinv_SSE1);
2134 prod_SSE0 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
2135 prod_SSE1 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);
2137 logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
2138 logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
2139 t1_SSE0 = _mm_sub_pd(lij_SSE0, uij_SSE0);
2140 t1_SSE1 = _mm_sub_pd(lij_SSE1, uij_SSE1);
2141 t2_SSE0 = _mm_mul_pd(diff2_SSE0,
2142 _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
2144 t2_SSE1 = _mm_mul_pd(diff2_SSE1,
2145 _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
2147 t3_SSE0 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
2148 t3_SSE1 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
2149 t1_SSE0 = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
2150 t1_SSE1 = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
2151 t4_SSE0 = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE0));
2152 t4_SSE1 = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE1));
2153 t4_SSE0 = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
2154 t4_SSE1 = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
2155 t1_SSE0 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
2156 t1_SSE1 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));
2158 _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
2159 _mm_add_pd(_mm_and_pd(t1_SSE0, obc_mask1_SSE0),
2160 _mm_and_pd(t1_SSE1, obc_mask1_SSE1))));
2162 t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
2163 _mm_mul_pd(prod_SSE0, lij3_SSE0));
2164 t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
2165 _mm_mul_pd(prod_SSE1, lij3_SSE1));
2167 t1_SSE0 = _mm_sub_pd(t1_SSE0,
2168 _mm_mul_pd(onefourth_SSE,
2169 _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
2170 _mm_mul_pd(lij3_SSE0, dr_SSE0))));
2171 t1_SSE1 = _mm_sub_pd(t1_SSE1,
2172 _mm_mul_pd(onefourth_SSE,
2173 _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
2174 _mm_mul_pd(lij3_SSE1, dr_SSE1))));
2175 t2_SSE0 = _mm_mul_pd(onefourth_SSE,
2176 _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
2177 _mm_mul_pd(uij3_SSE0, dr_SSE0)));
2178 t2_SSE1 = _mm_mul_pd(onefourth_SSE,
2179 _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
2180 _mm_mul_pd(uij3_SSE1, dr_SSE1)));
2181 t2_SSE0 = _mm_sub_pd(t2_SSE0,
2182 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
2183 _mm_mul_pd(prod_SSE0, uij3_SSE0)));
2184 t2_SSE1 = _mm_sub_pd(t2_SSE1,
2185 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
2186 _mm_mul_pd(prod_SSE1, uij3_SSE1)));
2188 t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
2189 _mm_mul_pd(rinv_SSE0, rinv_SSE0));
2190 t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
2191 _mm_mul_pd(rinv_SSE1, rinv_SSE1));
2193 t3_SSE0 = _mm_sub_pd(t3_SSE0,
2194 _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
2196 _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
2197 t3_SSE1 = _mm_sub_pd(t3_SSE1,
2198 _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
2200 _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));
2202 t1_SSE0 = _mm_mul_pd(rinv_SSE0,
2203 _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
2204 _mm_add_pd(t2_SSE0, t3_SSE0)));
2205 t1_SSE1 = _mm_mul_pd(rinv_SSE1,
2206 _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
2207 _mm_add_pd(t2_SSE1, t3_SSE1)));
2209 _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
2211 _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
2214 GMX_MM_TRANSPOSE2_PD(sum_ai_SSE0, sum_ai_SSE1);
2215 sum_ai_SSE0 = _mm_add_pd(sum_ai_SSE0, sum_ai_SSE1);
2216 _mm_store_pd(work+i, _mm_add_pd(sum_ai_SSE0, _mm_load_pd(work+i)));
2220 for (i = 0; i < natoms/2+1; i++)
2222 work[i] += work[natoms+i];
2225 /* Parallel summations would go here if ever implemented in DD */
2227 if (gb_algorithm == egbHCT)
2230 for (i = 0; i < natoms; i++)
2232 if (born->use[i] != 0)
2234 rai = top->atomtypes.gb_radius[mdatoms->typeA[i]]-born->gb_doffset;
2235 sum_ai = 1.0/rai - work[i];
2236 min_rad = rai + born->gb_doffset;
2239 born->bRad[i] = rad > min_rad ? rad : min_rad;
2240 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
2249 /* Calculate the radii */
2250 for (i = 0; i < natoms; i++)
2253 if (born->use[i] != 0)
2255 rai = top->atomtypes.gb_radius[mdatoms->typeA[i]];
2257 rai = rai-born->gb_doffset;
2259 sum_ai = rai * work[i];
2260 sum_ai2 = sum_ai * sum_ai;
2261 sum_ai3 = sum_ai2 * sum_ai;
2263 tsum = tanh(born->obc_alpha*sum_ai-born->obc_beta*sum_ai2+born->obc_gamma*sum_ai3);
2264 born->bRad[i] = rai_inv - tsum*rai_inv2;
2265 born->bRad[i] = 1.0 / born->bRad[i];
2267 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
2269 tchain = rai * (born->obc_alpha-2*born->obc_beta*sum_ai+3*born->obc_gamma*sum_ai2);
2270 born->drobc[i] = (1.0-tsum*tsum)*tchain*rai_inv2;
/*
 * NOTE(review): this extract is elided -- gaps in the embedded original
 * line numbers (e.g. 2289-2293 in the parameter list, 2418 for the nj0
 * load, 2440/2442-2448 around the dadx loads, and the closing braces)
 * mean that several parameters, declarations, pointer advances and scope
 * delimiters are NOT visible here.  Comments below describe only what
 * the visible lines demonstrate; verify everything against the full file.
 *
 * Chain-rule (force) stage of the all-vs-all Generalized Born kernel,
 * SSE2 double precision.  Converts the accumulated dV/d(Born radius)
 * terms (fr->dvda, born->drobc) plus the Born-radius derivatives stored
 * earlier in dadx into Cartesian forces, processing two i-atoms per
 * outer iteration (register suffixes _SSE0/_SSE1) against pairs of
 * j-atoms packed in each __m128d.
 */
2286 genborn_allvsall_calc_chainrule_sse2_double(t_forcerec * fr,
2287 t_mdatoms * mdatoms,
2288 gmx_genborn_t * born,
/* (remaining parameters, original lines 2289-2293 -- including paadata
 * and presumably x/f arrays and gb_algorithm -- are elided) */
2294 gmx_allvsallgb2_data_t *aadata;
2297 int nj0, nj1, nj2, nj3;
2306 double fix, fiy, fiz;
2310 double rbai, rbaj, fgb, fgb_ai, rbi;
/* SIMD working registers; _SSE0/_SSE1 carry the two unrolled i-atoms */
2321 __m128d jmask_SSE0, jmask_SSE1;
2322 __m128d ix_SSE0, iy_SSE0, iz_SSE0;
2323 __m128d ix_SSE1, iy_SSE1, iz_SSE1;
2324 __m128d fix_SSE0, fiy_SSE0, fiz_SSE0;
2325 __m128d fix_SSE1, fiy_SSE1, fiz_SSE1;
2326 __m128d rbai_SSE0, rbai_SSE1;
2327 __m128d imask_SSE0, imask_SSE1;
2328 __m128d jx_SSE, jy_SSE, jz_SSE, rbaj_SSE;
2329 __m128d dx_SSE0, dy_SSE0, dz_SSE0;
2330 __m128d dx_SSE1, dy_SSE1, dz_SSE1;
2331 __m128d fgb_SSE0, fgb_ai_SSE0;
2332 __m128d fgb_SSE1, fgb_ai_SSE1;
2333 __m128d tx_SSE0, ty_SSE0, tz_SSE0;
2334 __m128d tx_SSE1, ty_SSE1, tz_SSE1;
2335 __m128d t1, t2, tmpSSE;
2337 natoms = mdatoms->nr;
/* i-loop runs over home atoms only (ni0 assignment not visible here) */
2339 ni1 = mdatoms->homenr;
2341 aadata = (gmx_allvsallgb2_data_t *)paadata;
/* aligned, padded coordinate/force work arrays owned by aadata */
2343 x_align = aadata->x_align;
2344 y_align = aadata->y_align;
2345 z_align = aadata->z_align;
2346 fx_align = aadata->fx_align;
2347 fy_align = aadata->fy_align;
2348 fz_align = aadata->fz_align;
2350 jindex = aadata->jindex_gb;
2356 /* Loop to get the proper form for the Born radius term */
2357 if (gb_algorithm == egbSTILL)
2359 for (i = 0; i < natoms; i++)
2361 rbi = born->bRad[i];
/* Still: rb = 2*rb^2*dvda / ONE_4PI_EPS0 (units absorb the GB prefactor) */
2362 rb[i] = (2 * rbi * rbi * fr->dvda[i])/ONE_4PI_EPS0;
2365 else if (gb_algorithm == egbHCT)
2367 for (i = 0; i < natoms; i++)
2369 rbi = born->bRad[i];
/* HCT: rb = rb^2*dvda */
2370 rb[i] = rbi * rbi * fr->dvda[i];
2373 else if (gb_algorithm == egbOBC)
2375 for (idx = 0; idx < natoms; idx++)
2377 rbi = born->bRad[idx];
/* OBC: extra drobc factor from the tanh rescaling chain rule */
2378 rb[idx] = rbi * rbi * born->drobc[idx] * fr->dvda[idx];
/* NOTE(review): loop body (orig. lines 2383-2389) elided -- presumably
 * clears the aligned force accumulation arrays; confirm in full source */
2382 for (i = 0; i < 2*natoms; i++)
/* Duplicate rb into the upper half so rotated j-indices [natoms, 2*natoms)
 * address valid data without wrap-around logic */
2390 for (i = 0; i < natoms; i++)
2392 rb[i+natoms] = rb[i];
/* Outer loop: two i-atoms per iteration (UNROLLI) */
2395 for (i = ni0; i < ni1; i += UNROLLI)
2397 /* We assume shifts are NOT used for all-vs-all interactions */
2399 /* Load i atom data */
2400 ix_SSE0 = _mm_load1_pd(x_align+i);
2401 iy_SSE0 = _mm_load1_pd(y_align+i);
2402 iz_SSE0 = _mm_load1_pd(z_align+i);
2403 ix_SSE1 = _mm_load1_pd(x_align+i+1);
2404 iy_SSE1 = _mm_load1_pd(y_align+i+1);
2405 iz_SSE1 = _mm_load1_pd(z_align+i+1);
2407 fix_SSE0 = _mm_setzero_pd();
2408 fiy_SSE0 = _mm_setzero_pd();
2409 fiz_SSE0 = _mm_setzero_pd();
2410 fix_SSE1 = _mm_setzero_pd();
2411 fiy_SSE1 = _mm_setzero_pd();
2412 fiz_SSE1 = _mm_setzero_pd();
/* Broadcast each i-atom's Born-radius chain term into both lanes */
2414 rbai_SSE0 = _mm_load1_pd(rb+i);
2415 rbai_SSE1 = _mm_load1_pd(rb+i+1);
2417 /* Load limits for loop over neighbors */
/* NOTE(review): nj0 load (orig. line 2418, presumably nj0 = jindex[4*i])
 * is elided here */
2419 nj3 = jindex[4*i+3];
2421 /* No masks necessary, since the stored chain rule derivatives will be zero in those cases! */
2422 for (j = nj0; j < nj3; j += UNROLLJ)
2424 /* load j atom coordinates */
2425 jx_SSE = _mm_load_pd(x_align+j);
2426 jy_SSE = _mm_load_pd(y_align+j);
2427 jz_SSE = _mm_load_pd(z_align+j);
2429 /* Calculate distance */
2430 dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE);
2431 dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE);
2432 dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE);
2433 dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE);
2434 dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE);
2435 dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE);
2437 rbaj_SSE = _mm_load_pd(rb+j);
/* Scalar chain-rule force magnitudes from stored derivatives.
 * NOTE(review): the dadx pointer advances between these four loads
 * (orig. lines 2440/2442/2445/2447-2448, presumably "dadx += 2;")
 * are elided -- each load reads a distinct pair of stored derivatives */
2439 fgb_SSE0 = _mm_mul_pd(rbai_SSE0, _mm_load_pd(dadx));
2441 fgb_SSE1 = _mm_mul_pd(rbai_SSE1, _mm_load_pd(dadx));
2444 fgb_ai_SSE0 = _mm_mul_pd(rbaj_SSE, _mm_load_pd(dadx));
2446 fgb_ai_SSE1 = _mm_mul_pd(rbaj_SSE, _mm_load_pd(dadx));
2449 /* Total force between ai and aj is the sum of ai->aj and aj->ai */
2450 fgb_SSE0 = _mm_add_pd(fgb_SSE0, fgb_ai_SSE0);
2451 fgb_SSE1 = _mm_add_pd(fgb_SSE1, fgb_ai_SSE1);
2453 /* Calculate temporary vectorial force */
2454 tx_SSE0 = _mm_mul_pd(fgb_SSE0, dx_SSE0);
2455 ty_SSE0 = _mm_mul_pd(fgb_SSE0, dy_SSE0);
2456 tz_SSE0 = _mm_mul_pd(fgb_SSE0, dz_SSE0);
2457 tx_SSE1 = _mm_mul_pd(fgb_SSE1, dx_SSE1);
2458 ty_SSE1 = _mm_mul_pd(fgb_SSE1, dy_SSE1);
2459 tz_SSE1 = _mm_mul_pd(fgb_SSE1, dz_SSE1);
2461 /* Increment i atom force */
2462 fix_SSE0 = _mm_add_pd(fix_SSE0, tx_SSE0);
2463 fiy_SSE0 = _mm_add_pd(fiy_SSE0, ty_SSE0);
2464 fiz_SSE0 = _mm_add_pd(fiz_SSE0, tz_SSE0);
2465 fix_SSE1 = _mm_add_pd(fix_SSE1, tx_SSE1);
2466 fiy_SSE1 = _mm_add_pd(fiy_SSE1, ty_SSE1);
2467 fiz_SSE1 = _mm_add_pd(fiz_SSE1, tz_SSE1);
2469 /* Decrement j atom force */
2470 _mm_store_pd(fx_align+j,
2471 _mm_sub_pd( _mm_load_pd(fx_align+j), _mm_add_pd(tx_SSE0, tx_SSE1) ));
2472 _mm_store_pd(fy_align+j,
2473 _mm_sub_pd( _mm_load_pd(fy_align+j), _mm_add_pd(ty_SSE0, ty_SSE1) ));
2474 _mm_store_pd(fz_align+j,
2475 _mm_sub_pd( _mm_load_pd(fz_align+j), _mm_add_pd(tz_SSE0, tz_SSE1) ));
2478 /* Add i forces to mem */
/* Transpose + add folds the per-lane accumulators so one 128-bit store
 * updates the two consecutive i-atom entries */
2479 GMX_MM_TRANSPOSE2_PD(fix_SSE0, fix_SSE1);
2480 fix_SSE0 = _mm_add_pd(fix_SSE0, fix_SSE1);
2481 _mm_store_pd(fx_align+i, _mm_add_pd(fix_SSE0, _mm_load_pd(fx_align+i)));
2483 GMX_MM_TRANSPOSE2_PD(fiy_SSE0, fiy_SSE1);
2484 fiy_SSE0 = _mm_add_pd(fiy_SSE0, fiy_SSE1);
2485 _mm_store_pd(fy_align+i, _mm_add_pd(fiy_SSE0, _mm_load_pd(fy_align+i)));
2487 GMX_MM_TRANSPOSE2_PD(fiz_SSE0, fiz_SSE1);
2488 fiz_SSE0 = _mm_add_pd(fiz_SSE0, fiz_SSE1);
2489 _mm_store_pd(fz_align+i, _mm_add_pd(fiz_SSE0, _mm_load_pd(fz_align+i)));
/* Fold the doubled work arrays (halves [0,natoms) and [natoms,2*natoms))
 * back into the caller's interleaved force array f[3*i + 0/1/2] */
2492 for (i = 0; i < natoms; i++)
2494 f[3*i] += fx_align[i] + fx_align[natoms+i];
2495 f[3*i+1] += fy_align[i] + fy_align[natoms+i];
2496 f[3*i+2] += fz_align[i] + fz_align[natoms+i];
2503 /* dummy variable when not using SSE */
/* NOTE(review): presumably keeps this translation unit non-empty when the
 * SSE2 double-precision kernels above are compiled out (ISO C requires at
 * least one declaration per translation unit) -- confirm against the
 * enclosing #ifdef/#else, which is not visible in this extract. */
2504 int genborn_allvsall_sse2_double_dummy;