/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2009, The GROMACS Development Team.
 * Copyright (c) 2012,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gromacs/legacyheaders/genborn.h"
#include "gromacs/legacyheaders/network.h"
#include "gromacs/legacyheaders/types/simple.h"
#include "gromacs/math/units.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/genborn_allvsall.h"
#include "gromacs/utility/smalloc.h"

#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)

#include <gmx_sse2_double.h>
typedef struct
{
    int     *jindex_gb;
    int    **prologue_mask_gb;
    int    **epilogue_mask;
    int     *imask;
    double  *gb_radius;
    double  *workparam;
    double  *work;
    double  *x_align;
    double  *y_align;
    double  *z_align;
    double  *fx_align;
    double  *fy_align;
    double  *fz_align;
}
gmx_allvsallgb2_data_t;
static int
calc_maxoffset(int i, int natoms)
{
    int maxoffset;

    if ((natoms % 2) == 1)
    {
        /* Odd number of atoms, easy */
        maxoffset = natoms/2;
    }
    else if ((natoms % 4) == 0)
    {
        /* Multiple of four is hard */
        if (i < natoms/2)
        {
            maxoffset = ((i % 2) == 0) ? natoms/2 : natoms/2-1;
        }
        else
        {
            maxoffset = ((i % 2) == 1) ? natoms/2 : natoms/2-1;
        }
    }
    else
    {
        /* natoms/2 is odd */
        maxoffset = ((i % 2) == 0) ? natoms/2 : natoms/2-1;
    }

    return maxoffset;
}
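
/* Rationale sketch: the all-vs-all kernels use a cyclic "half shell", where
 * atom i interacts with the roughly natoms/2 atoms following it (wrapping
 * around the end of the list), so every pair is computed exactly once. For
 * even natoms the shell must alternate between natoms/2 and natoms/2-1
 * partners to avoid double-counting the pair sitting exactly opposite in the
 * ring, which is what the branches on (i % 2) above encode. */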
static void
setup_gb_exclusions_and_indices(gmx_allvsallgb2_data_t *aadata,
                                t_ilist                *ilist,
                                int                     start,
                                int                     end,
                                int                     natoms,
                                gmx_bool                bInclude12,
                                gmx_bool                bInclude13,
                                gmx_bool                bInclude14)
{
    int  i, j, k, ibase, a1, a2;
    int  ni0, ni1, nj0, nj1, nj;
    int  imin, imax, iexcl;
    int  max_offset, max_excl_offset;
    int  firstinteraction;
    int *pi;
    /* This routine can appear to be a bit complex, but it is mostly book-keeping.
     * To enable the fast all-vs-all kernel we need to be able to stream through all coordinates
     * whether they should interact or not.
     *
     * To avoid looping over the exclusions, we create a simple mask that is 1 if the interaction
     * should be present, otherwise 0. Since exclusions typically only occur when i & j are close,
     * we create a jindex array with three elements per i atom: the starting point, the point to
     * which we need to check exclusions, and the end point.
     * This way we only have to allocate a short exclusion mask per i atom.
     */
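
    /* Layout sketch of the jindex_gb array filled in below - four entries per
     * i atom, delimiting the three j ranges the kernels iterate (the comment
     * above mentions three boundary points; the fourth entry closes the
     * epilogue range needed for the cyclic maxoffset check):
     *
     *   jindex_gb[4*i+0] .. jindex_gb[4*i+1]  prologue, prologue_mask_gb applied
     *   jindex_gb[4*i+1] .. jindex_gb[4*i+2]  main part, no masking needed
     *   jindex_gb[4*i+2] .. jindex_gb[4*i+3]  epilogue, epilogue_mask applied
     */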
    ni0 = (start/UNROLLI)*UNROLLI;
    ni1 = ((end+UNROLLI-1)/UNROLLI)*UNROLLI;

    /* Set the interaction mask to only enable the i atoms we want to include */
    snew(pi, 2*(natoms+UNROLLI+2*SIMD_WIDTH));
    aadata->imask = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
    for (i = 0; i < natoms+UNROLLI; i++)
    {
        aadata->imask[2*i]   = (i >= start && i < end) ? 0xFFFFFFFF : 0;
        aadata->imask[2*i+1] = (i >= start && i < end) ? 0xFFFFFFFF : 0;
    }

    /* Allocate memory for our modified jindex array */
    snew(aadata->jindex_gb, 4*(natoms+UNROLLI));
    for (i = 0; i < 4*(natoms+UNROLLI); i++)
    {
        aadata->jindex_gb[i] = 0;
    }
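
    /* Alignment note (sketch): snew() gives no particular alignment, so every
     * raw buffer here is over-allocated and the pointer rounded up to the next
     * 16-byte boundary with ((size_t) p + 16) & ~(size_t) 15; e.g. p == 0x1004
     * yields 0x1010. This is what lets the kernels use the aligned SSE2
     * accesses _mm_load_pd/_mm_store_pd on these arrays. */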
    /* Create the exclusion masks for the prologue part */
    snew(aadata->prologue_mask_gb, natoms+UNROLLI); /* list of pointers */

    /* First zero everything to avoid uninitialized data */
    for (i = 0; i < natoms+UNROLLI; i++)
    {
        aadata->prologue_mask_gb[i] = NULL;
    }
    /* Calculate the largest exclusion range we need for each UNROLLI-tuplet of i atoms. */
    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
    {
        max_excl_offset = -1;

        /* First find maxoffset for the next 4 atoms (or fewer if we are close to end) */
        imax = ((ibase+UNROLLI) < end) ? (ibase+UNROLLI) : end;

        /* Which atom is the first we (might) interact with? */
        imin = natoms; /* Guaranteed to be overwritten by one of the 'firstinteraction' values */
        for (i = ibase; i < imax; i++)
        {
            /* Before exclusions, which atom is the first we (might) interact with? */
            firstinteraction = i+1;
            max_offset       = calc_maxoffset(i, natoms);

            if (!bInclude12)
            {
                for (j = 0; j < ilist[F_GB12].nr; j += 3)
                {
                    a1 = ilist[F_GB12].iatoms[j+1];
                    a2 = ilist[F_GB12].iatoms[j+2];
                    /* Find the excluded partner k of atom i, if any */
                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }
                    if (k == firstinteraction)
                    {
                        firstinteraction++;
                    }
                }
            }
            if (!bInclude13)
            {
                for (j = 0; j < ilist[F_GB13].nr; j += 3)
                {
                    a1 = ilist[F_GB13].iatoms[j+1];
                    a2 = ilist[F_GB13].iatoms[j+2];
                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }
                    if (k == firstinteraction)
                    {
                        firstinteraction++;
                    }
                }
            }
            if (!bInclude14)
            {
                for (j = 0; j < ilist[F_GB14].nr; j += 3)
                {
                    a1 = ilist[F_GB14].iatoms[j+1];
                    a2 = ilist[F_GB14].iatoms[j+2];
                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }
                    if (k == firstinteraction)
                    {
                        firstinteraction++;
                    }
                }
            }
            imin = (firstinteraction < imin) ? firstinteraction : imin;
        }
        /* round down to j unrolling factor */
        imin = (imin/UNROLLJ)*UNROLLJ;
        for (i = ibase; i < imax; i++)
        {
            max_offset = calc_maxoffset(i, natoms);

            if (!bInclude12)
            {
                for (j = 0; j < ilist[F_GB12].nr; j += 3)
                {
                    a1 = ilist[F_GB12].iatoms[j+1];
                    a2 = ilist[F_GB12].iatoms[j+2];
                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }
                    if (k > i+max_offset)
                    {
                        continue;
                    }
                    k = k - i;
                    if (k+natoms <= max_offset)
                    {
                        k += natoms;
                    }
                    max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
                }
            }
            if (!bInclude13)
            {
                for (j = 0; j < ilist[F_GB13].nr; j += 3)
                {
                    a1 = ilist[F_GB13].iatoms[j+1];
                    a2 = ilist[F_GB13].iatoms[j+2];
                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }
                    if (k > i+max_offset)
                    {
                        continue;
                    }
                    k = k - i;
                    if (k+natoms <= max_offset)
                    {
                        k += natoms;
                    }
                    max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
                }
            }
            if (!bInclude14)
            {
                for (j = 0; j < ilist[F_GB14].nr; j += 3)
                {
                    a1 = ilist[F_GB14].iatoms[j+1];
                    a2 = ilist[F_GB14].iatoms[j+2];
                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }
                    if (k > i+max_offset)
                    {
                        continue;
                    }
                    k = k - i;
                    if (k+natoms <= max_offset)
                    {
                        k += natoms;
                    }
                    max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
                }
            }
        }

        /* The offset specifies the last atom to be excluded, so add one unit to get an upper loop limit */
        max_excl_offset++;
        /* round up to j unrolling factor */
        max_excl_offset = (max_excl_offset/UNROLLJ+1)*UNROLLJ;

        /* Set all the prologue masks length to this value (even for i>end) */
        for (i = ibase; i < ibase+UNROLLI; i++)
        {
            aadata->jindex_gb[4*i]   = imin;
            aadata->jindex_gb[4*i+1] = imin+max_excl_offset;
        }
    }
    /* Now the hard part, loop over it all again to calculate the actual contents of the prologue masks */
    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
    {
        for (i = ibase; i < ibase+UNROLLI; i++)
        {
            nj   = aadata->jindex_gb[4*i+1] - aadata->jindex_gb[4*i];
            imin = aadata->jindex_gb[4*i];

            /* Allocate aligned memory */
            snew(pi, 2*(nj+2*SIMD_WIDTH));
            aadata->prologue_mask_gb[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));

            max_offset = calc_maxoffset(i, natoms);

            /* Include interactions i+1 <= j < i+maxoffset */
            for (k = 0; k < nj; k++)
            {
                j = imin + k;

                if ( (j > i) && (j <= i+max_offset) )
                {
                    aadata->prologue_mask_gb[i][2*k]   = 0xFFFFFFFF;
                    aadata->prologue_mask_gb[i][2*k+1] = 0xFFFFFFFF;
                }
                else
                {
                    aadata->prologue_mask_gb[i][2*k]   = 0;
                    aadata->prologue_mask_gb[i][2*k+1] = 0;
                }
            }
            /* Clear out the explicit exclusions */
            if (i < end)
            {
                if (!bInclude12)
                {
                    for (j = 0; j < ilist[F_GB12].nr; j += 3)
                    {
                        a1 = ilist[F_GB12].iatoms[j+1];
                        a2 = ilist[F_GB12].iatoms[j+2];
                        if (a1 == i)
                        {
                            k = a2;
                        }
                        else if (a2 == i)
                        {
                            k = a1;
                        }
                        else
                        {
                            continue;
                        }
                        if (k > i+max_offset)
                        {
                            continue;
                        }
                        k = k - i;
                        if (k+natoms <= max_offset)
                        {
                            k += natoms;
                        }
                        k = k + i - imin;
                        if (k >= 0)
                        {
                            aadata->prologue_mask_gb[i][2*k]   = 0;
                            aadata->prologue_mask_gb[i][2*k+1] = 0;
                        }
                    }
                }
                if (!bInclude13)
                {
                    for (j = 0; j < ilist[F_GB13].nr; j += 3)
                    {
                        a1 = ilist[F_GB13].iatoms[j+1];
                        a2 = ilist[F_GB13].iatoms[j+2];
                        if (a1 == i)
                        {
                            k = a2;
                        }
                        else if (a2 == i)
                        {
                            k = a1;
                        }
                        else
                        {
                            continue;
                        }
                        if (k > i+max_offset)
                        {
                            continue;
                        }
                        k = k - i;
                        if (k+natoms <= max_offset)
                        {
                            k += natoms;
                        }
                        k = k + i - imin;
                        if (k >= 0)
                        {
                            aadata->prologue_mask_gb[i][2*k]   = 0;
                            aadata->prologue_mask_gb[i][2*k+1] = 0;
                        }
                    }
                }
                if (!bInclude14)
                {
                    for (j = 0; j < ilist[F_GB14].nr; j += 3)
                    {
                        a1 = ilist[F_GB14].iatoms[j+1];
                        a2 = ilist[F_GB14].iatoms[j+2];
                        if (a1 == i)
                        {
                            k = a2;
                        }
                        else if (a2 == i)
                        {
                            k = a1;
                        }
                        else
                        {
                            continue;
                        }
                        if (k > i+max_offset)
                        {
                            continue;
                        }
                        k = k - i;
                        if (k+natoms <= max_offset)
                        {
                            k += natoms;
                        }
                        k = k + i - imin;
                        if (k >= 0)
                        {
                            aadata->prologue_mask_gb[i][2*k]   = 0;
                            aadata->prologue_mask_gb[i][2*k+1] = 0;
                        }
                    }
                }
            }
        }
    }
    /* Construct the epilogue mask - this just contains the check for maxoffset */
    snew(aadata->epilogue_mask, natoms+UNROLLI);

    /* First zero everything to avoid uninitialized data */
    for (i = 0; i < natoms+UNROLLI; i++)
    {
        aadata->jindex_gb[4*i+2] = aadata->jindex_gb[4*i+1];
        aadata->jindex_gb[4*i+3] = aadata->jindex_gb[4*i+1];
        aadata->epilogue_mask[i] = NULL;
    }
    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
    {
        /* Find the lowest index for which we need to use the epilogue */
        imin       = ibase;
        max_offset = calc_maxoffset(imin, natoms);

        imin = imin + 1 + max_offset;

        /* Find largest index for which we need to use the epilogue */
        imax = ibase + UNROLLI-1;
        imax = (imax < end) ? imax : end;

        max_offset = calc_maxoffset(imax, natoms);
        imax       = imax + 1 + max_offset + UNROLLJ - 1;

        for (i = ibase; i < ibase+UNROLLI; i++)
        {
            /* Start of epilogue - round down to j tile limit */
            aadata->jindex_gb[4*i+2] = (imin/UNROLLJ)*UNROLLJ;
            /* Make sure we don't overlap - for small systems everything is done in the prologue */
            aadata->jindex_gb[4*i+2] = (aadata->jindex_gb[4*i+1] > aadata->jindex_gb[4*i+2]) ? aadata->jindex_gb[4*i+1] : aadata->jindex_gb[4*i+2];
            /* Round upwards to j tile limit */
            aadata->jindex_gb[4*i+3] = (imax/UNROLLJ)*UNROLLJ;
            /* Make sure we don't have a negative range for the epilogue */
            aadata->jindex_gb[4*i+3] = (aadata->jindex_gb[4*i+2] > aadata->jindex_gb[4*i+3]) ? aadata->jindex_gb[4*i+2] : aadata->jindex_gb[4*i+3];
        }
    }
    /* And fill it with data... */
    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
    {
        for (i = ibase; i < ibase+UNROLLI; i++)
        {
            nj = aadata->jindex_gb[4*i+3] - aadata->jindex_gb[4*i+2];

            /* Allocate aligned memory */
            snew(pi, 2*(nj+2*SIMD_WIDTH));
            aadata->epilogue_mask[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));

            max_offset = calc_maxoffset(i, natoms);

            for (k = 0; k < nj; k++)
            {
                j = aadata->jindex_gb[4*i+2] + k;
                aadata->epilogue_mask[i][2*k]   = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
                aadata->epilogue_mask[i][2*k+1] = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
            }
        }
    }
}
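
/* A minimal scalar sketch (not part of the original code) of how the kernels
 * below consume the ranges and masks set up above, for one i atom:
 *
 *   nj0 = jindex_gb[4*i];   nj1 = jindex_gb[4*i+1];
 *   nj2 = jindex_gb[4*i+2]; nj3 = jindex_gb[4*i+3];
 *   for (j = nj0; j < nj1; j++)  // interact if prologue_mask_gb[i] allows
 *   for (j = nj1; j < nj2; j++)  // always interact
 *   for (j = nj2; j < nj3; j++)  // interact if epilogue_mask[i] allows
 */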
static void
genborn_allvsall_setup(gmx_allvsallgb2_data_t **p_aadata,
                       gmx_localtop_t          *top,
                       gmx_genborn_t           *born,
                       t_mdatoms               *mdatoms,
                       double                   radius_offset,
                       int                      gb_algorithm,
                       gmx_bool                 bInclude12,
                       gmx_bool                 bInclude13,
                       gmx_bool                 bInclude14)
{
    gmx_allvsallgb2_data_t *aadata;
    double                 *p;
    int                     i, natoms;

    natoms = mdatoms->nr;

    snew(aadata, 1);
    *p_aadata = aadata;
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->x_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->y_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->z_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->fx_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->fy_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->fz_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->gb_radius = (double *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->workparam = (double *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->work = (double *) (((size_t) p + 16) & (~((size_t) 15)));
    for (i = 0; i < mdatoms->nr; i++)
    {
        aadata->gb_radius[i] = top->atomtypes.gb_radius[mdatoms->typeA[i]] - radius_offset;
        if (gb_algorithm == egbSTILL)
        {
            aadata->workparam[i] = born->vsolv[i];
        }
        else if (gb_algorithm == egbOBC)
        {
            aadata->workparam[i] = born->param[i];
        }
        aadata->work[i] = 0.0;
    }
    for (i = 0; i < mdatoms->nr; i++)
    {
        aadata->gb_radius[natoms+i] = aadata->gb_radius[i];
        aadata->workparam[natoms+i] = aadata->workparam[i];
        aadata->work[natoms+i]      = aadata->work[i];
    }
    for (i = 0; i < 2*natoms+SIMD_WIDTH; i++)
    {
        aadata->x_align[i]  = 0.0;
        aadata->y_align[i]  = 0.0;
        aadata->z_align[i]  = 0.0;
        aadata->fx_align[i] = 0.0;
        aadata->fy_align[i] = 0.0;
        aadata->fz_align[i] = 0.0;
    }

    setup_gb_exclusions_and_indices(aadata, top->idef.il, 0, mdatoms->homenr, mdatoms->nr,
                                    bInclude12, bInclude13, bInclude14);
}
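
/* Note on the bInclude* flags (inferred from the call sites below): the Still
 * kernel sets up with (bInclude12, bInclude13, bInclude14) = (FALSE, FALSE,
 * TRUE), i.e. 1-2 and 1-3 GB pairs are masked out of the all-vs-all sum while
 * 1-4 pairs are kept, whereas the HCT/OBC kernel passes (TRUE, TRUE, TRUE)
 * and therefore needs no explicit exclusions at all. */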
/*
 * This routine apparently hits a compiler bug Visual Studio has had 'forever'.
 * It is present both in VS2005 and VS2008, and the only way around it is to
 * decrease optimization. We do that with a pragma, and only for MSVC, so it
 * will not hurt any of the well-behaving and supported compilers out there.
 * MS: Fix your compiler, it sucks like a black hole!
 */
#ifdef _MSC_VER
#pragma optimize("t", off)
#endif
static void
genborn_allvsall_calc_still_radii_sse2_double(t_forcerec     *fr,
                                              t_mdatoms      *mdatoms,
                                              gmx_genborn_t  *born,
                                              gmx_localtop_t *top,
                                              double         *x,
                                              void           *paadata)
{
    gmx_allvsallgb2_data_t *aadata;
    int                     natoms;
    int                     ni0, ni1;
    int                     nj0, nj1, nj2, nj3;
    int                     i, j, k;
    double                 *x_align, *y_align, *z_align;
    double                 *gb_radius, *vsolv, *work, *dadx;
    int                    *jindex;
    int                    *pmask0, *pmask1, *emask0, *emask1;
    double                  gpi, rai, vai;
    double                  irsq, idr4, idr6;
    double                  raj, rvdw, ratio;
    double                  vaj, ccf, dccf, theta, cosq;
    double                  term, prod, icf4, icf6, gpi2, factor, sinq;
    __m128d ix_SSE0, iy_SSE0, iz_SSE0;
    __m128d ix_SSE1, iy_SSE1, iz_SSE1;
    __m128d gpi_SSE0, rai_SSE0, prod_ai_SSE0;
    __m128d gpi_SSE1, rai_SSE1, prod_ai_SSE1;
    __m128d imask_SSE0, jmask_SSE0;
    __m128d imask_SSE1, jmask_SSE1;
    __m128d jx_SSE, jy_SSE, jz_SSE;
    __m128d dx_SSE0, dy_SSE0, dz_SSE0;
    __m128d dx_SSE1, dy_SSE1, dz_SSE1;
    __m128d rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0;
    __m128d rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1;
    __m128d raj_SSE, vaj_SSE, prod_SSE;
    __m128d rvdw_SSE0, ratio_SSE0;
    __m128d rvdw_SSE1, ratio_SSE1;
    __m128d theta_SSE0, sinq_SSE0, cosq_SSE0, term_SSE0;
    __m128d theta_SSE1, sinq_SSE1, cosq_SSE1, term_SSE1;
    __m128d ccf_SSE0, dccf_SSE0;
    __m128d ccf_SSE1, dccf_SSE1;
    __m128d icf4_SSE0, icf6_SSE0;
    __m128d icf4_SSE1, icf6_SSE1;
    __m128d half_SSE, one_SSE, two_SSE, four_SSE;
    __m128d still_p4_SSE, still_p5inv_SSE, still_pip5_SSE;
    natoms = mdatoms->nr;
    ni0    = 0;
    ni1    = mdatoms->homenr;

    aadata = *((gmx_allvsallgb2_data_t **)paadata);

    if (aadata == NULL)
    {
        genborn_allvsall_setup(&aadata, top, born, mdatoms, 0.0,
                               egbSTILL, FALSE, FALSE, TRUE);
        *((gmx_allvsallgb2_data_t **)paadata) = aadata;
    }

    x_align = aadata->x_align;
    y_align = aadata->y_align;
    z_align = aadata->z_align;

    gb_radius = aadata->gb_radius;
    vsolv     = aadata->workparam;
    work      = aadata->work;
    jindex    = aadata->jindex_gb;
    dadx      = fr->dadx;
    still_p4_SSE    = _mm_set1_pd(STILL_P4);
    still_p5inv_SSE = _mm_set1_pd(STILL_P5INV);
    still_pip5_SSE  = _mm_set1_pd(STILL_PIP5);
    half_SSE        = _mm_set1_pd(0.5);
    one_SSE         = _mm_set1_pd(1.0);
    two_SSE         = _mm_set1_pd(2.0);
    four_SSE        = _mm_set1_pd(4.0);
    /* This will be summed, so it has to extend to natoms + buffer */
    for (i = 0; i < natoms+1+natoms/2; i++)
    {
        work[i] = 0;
    }

    for (i = ni0; i < ni1+1+natoms/2; i++)
    {
        k          = i % natoms;
        x_align[i] = x[3*k];
        y_align[i] = x[3*k+1];
        z_align[i] = x[3*k+2];
    }
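
    /* The j buffers deliberately extend past natoms: with the cyclic half
     * shell each i atom interacts with roughly the natoms/2 atoms that follow
     * it, wrapping past the end of the list. Replicating the first half of
     * the coordinates after atom natoms-1 (the i % natoms copy above) lets
     * the inner j loops run linearly without any modulo arithmetic. */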
    for (i = ni0; i < ni1; i += UNROLLI)
    {
        /* We assume shifts are NOT used for all-vs-all interactions */

        /* Load i atom data */
        ix_SSE0 = _mm_load1_pd(x_align+i);
        iy_SSE0 = _mm_load1_pd(y_align+i);
        iz_SSE0 = _mm_load1_pd(z_align+i);
        ix_SSE1 = _mm_load1_pd(x_align+i+1);
        iy_SSE1 = _mm_load1_pd(y_align+i+1);
        iz_SSE1 = _mm_load1_pd(z_align+i+1);

        gpi_SSE0 = _mm_setzero_pd();
        gpi_SSE1 = _mm_setzero_pd();

        rai_SSE0 = _mm_load1_pd(gb_radius+i);
        rai_SSE1 = _mm_load1_pd(gb_radius+i+1);

        prod_ai_SSE0 = _mm_set1_pd(STILL_P4*vsolv[i]);
        prod_ai_SSE1 = _mm_set1_pd(STILL_P4*vsolv[i+1]);

        /* Load limits for loop over neighbors */
        nj0 = jindex[4*i];
        nj1 = jindex[4*i+1];
        nj2 = jindex[4*i+2];
        nj3 = jindex[4*i+3];

        pmask0 = aadata->prologue_mask_gb[i];
        pmask1 = aadata->prologue_mask_gb[i+1];
        emask0 = aadata->epilogue_mask[i];
        emask1 = aadata->epilogue_mask[i+1];

        imask_SSE0 = _mm_load1_pd((double *)(aadata->imask+2*i));
        imask_SSE1 = _mm_load1_pd((double *)(aadata->imask+2*i+2));
        /* Prologue part, including exclusion mask */
        for (j = nj0; j < nj1; j += UNROLLJ)
        {
            jmask_SSE0 = _mm_load_pd((double *)pmask0);
            jmask_SSE1 = _mm_load_pd((double *)pmask1);
            pmask0    += 2*UNROLLJ;
            pmask1    += 2*UNROLLJ;

            /* load j atom coordinates */
            jx_SSE = _mm_load_pd(x_align+j);
            jy_SSE = _mm_load_pd(y_align+j);
            jz_SSE = _mm_load_pd(z_align+j);

            /* Calculate distance */
            dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE);
            dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE);
            dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE);
            dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE);
            dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE);
            dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);

            jmask_SSE0 = _mm_and_pd(jmask_SSE0, imask_SSE0);
            jmask_SSE1 = _mm_and_pd(jmask_SSE1, imask_SSE1);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
            rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);

            rinv_SSE0 = _mm_and_pd(rinv_SSE0, jmask_SSE0);
            rinv_SSE1 = _mm_and_pd(rinv_SSE1, jmask_SSE1);

            irsq_SSE0 = _mm_mul_pd(rinv_SSE0, rinv_SSE0);
            irsq_SSE1 = _mm_mul_pd(rinv_SSE1, rinv_SSE1);
            idr4_SSE0 = _mm_mul_pd(irsq_SSE0, irsq_SSE0);
            idr4_SSE1 = _mm_mul_pd(irsq_SSE1, irsq_SSE1);
            idr6_SSE0 = _mm_mul_pd(idr4_SSE0, irsq_SSE0);
            idr6_SSE1 = _mm_mul_pd(idr4_SSE1, irsq_SSE1);

            raj_SSE = _mm_load_pd(gb_radius+j);
            vaj_SSE = _mm_load_pd(vsolv+j);

            rvdw_SSE0 = _mm_add_pd(rai_SSE0, raj_SSE);
            rvdw_SSE1 = _mm_add_pd(rai_SSE1, raj_SSE);

            ratio_SSE0 = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0, rvdw_SSE0)));
            ratio_SSE1 = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1, rvdw_SSE1)));

            ratio_SSE0 = _mm_min_pd(ratio_SSE0, still_p5inv_SSE);
            ratio_SSE1 = _mm_min_pd(ratio_SSE1, still_p5inv_SSE);
            theta_SSE0 = _mm_mul_pd(ratio_SSE0, still_pip5_SSE);
            theta_SSE1 = _mm_mul_pd(ratio_SSE1, still_pip5_SSE);
            gmx_mm_sincos_pd(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
            gmx_mm_sincos_pd(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
            term_SSE0 = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE0));
            term_SSE1 = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE1));
            ccf_SSE0  = _mm_mul_pd(term_SSE0, term_SSE0);
            ccf_SSE1  = _mm_mul_pd(term_SSE1, term_SSE1);
            dccf_SSE0 = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE0),
                                   _mm_mul_pd(sinq_SSE0, theta_SSE0));
            dccf_SSE1 = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE1),
                                   _mm_mul_pd(sinq_SSE1, theta_SSE1));

            prod_SSE  = _mm_mul_pd(still_p4_SSE, vaj_SSE);
            icf4_SSE0 = _mm_mul_pd(ccf_SSE0, idr4_SSE0);
            icf4_SSE1 = _mm_mul_pd(ccf_SSE1, idr4_SSE1);
            icf6_SSE0 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
            icf6_SSE1 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);

            _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
                                            _mm_add_pd(_mm_mul_pd(prod_ai_SSE0, icf4_SSE0),
                                                       _mm_mul_pd(prod_ai_SSE1, icf4_SSE1))));

            gpi_SSE0 = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE, icf4_SSE0));
            gpi_SSE1 = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE, icf4_SSE1));

            /* Save ai->aj and aj->ai chain rule terms */
            _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE1));
            dadx += 2;

            _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE0, icf6_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE1, icf6_SSE1));
            dadx += 2;
        }
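
        /* dadx layout note: each UNROLLI x UNROLLJ tile appends four SSE2
         * pairs in a fixed order - the ai->aj terms for atoms i and i+1,
         * then the matching aj->ai terms - so the force kernel has to walk
         * fr->dadx in exactly this tile order to apply the chain rule. */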
        /* Main part, no exclusions */
        for (j = nj1; j < nj2; j += UNROLLJ)
        {
            /* load j atom coordinates */
            jx_SSE = _mm_load_pd(x_align+j);
            jy_SSE = _mm_load_pd(y_align+j);
            jz_SSE = _mm_load_pd(z_align+j);

            /* Calculate distance */
            dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE);
            dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE);
            dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE);
            dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE);
            dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE);
            dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
            rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);

            rinv_SSE0 = _mm_and_pd(rinv_SSE0, imask_SSE0);
            rinv_SSE1 = _mm_and_pd(rinv_SSE1, imask_SSE1);

            irsq_SSE0 = _mm_mul_pd(rinv_SSE0, rinv_SSE0);
            irsq_SSE1 = _mm_mul_pd(rinv_SSE1, rinv_SSE1);
            idr4_SSE0 = _mm_mul_pd(irsq_SSE0, irsq_SSE0);
            idr4_SSE1 = _mm_mul_pd(irsq_SSE1, irsq_SSE1);
            idr6_SSE0 = _mm_mul_pd(idr4_SSE0, irsq_SSE0);
            idr6_SSE1 = _mm_mul_pd(idr4_SSE1, irsq_SSE1);

            raj_SSE = _mm_load_pd(gb_radius+j);

            rvdw_SSE0 = _mm_add_pd(rai_SSE0, raj_SSE);
            rvdw_SSE1 = _mm_add_pd(rai_SSE1, raj_SSE);
            vaj_SSE   = _mm_load_pd(vsolv+j);

            ratio_SSE0 = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0, rvdw_SSE0)));
            ratio_SSE1 = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1, rvdw_SSE1)));

            ratio_SSE0 = _mm_min_pd(ratio_SSE0, still_p5inv_SSE);
            ratio_SSE1 = _mm_min_pd(ratio_SSE1, still_p5inv_SSE);
            theta_SSE0 = _mm_mul_pd(ratio_SSE0, still_pip5_SSE);
            theta_SSE1 = _mm_mul_pd(ratio_SSE1, still_pip5_SSE);
            gmx_mm_sincos_pd(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
            gmx_mm_sincos_pd(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
            term_SSE0 = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE0));
            term_SSE1 = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE1));
            ccf_SSE0  = _mm_mul_pd(term_SSE0, term_SSE0);
            ccf_SSE1  = _mm_mul_pd(term_SSE1, term_SSE1);
            dccf_SSE0 = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE0),
                                   _mm_mul_pd(sinq_SSE0, theta_SSE0));
            dccf_SSE1 = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE1),
                                   _mm_mul_pd(sinq_SSE1, theta_SSE1));

            prod_SSE  = _mm_mul_pd(still_p4_SSE, vaj_SSE );
            icf4_SSE0 = _mm_mul_pd(ccf_SSE0, idr4_SSE0);
            icf4_SSE1 = _mm_mul_pd(ccf_SSE1, idr4_SSE1);
            icf6_SSE0 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
            icf6_SSE1 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);

            _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
                                            _mm_add_pd(_mm_mul_pd(prod_ai_SSE0, icf4_SSE0),
                                                       _mm_mul_pd(prod_ai_SSE1, icf4_SSE1))));

            gpi_SSE0 = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE, icf4_SSE0));
            gpi_SSE1 = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE, icf4_SSE1));

            /* Save ai->aj and aj->ai chain rule terms */
            _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE1));
            dadx += 2;

            _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE0, icf6_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE1, icf6_SSE1));
            dadx += 2;
        }
        /* Epilogue part, including exclusion mask */
        for (j = nj2; j < nj3; j += UNROLLJ)
        {
            jmask_SSE0 = _mm_load_pd((double *)emask0);
            jmask_SSE1 = _mm_load_pd((double *)emask1);
            emask0    += 2*UNROLLJ;
            emask1    += 2*UNROLLJ;

            /* load j atom coordinates */
            jx_SSE = _mm_load_pd(x_align+j);
            jy_SSE = _mm_load_pd(y_align+j);
            jz_SSE = _mm_load_pd(z_align+j);

            /* Calculate distance */
            dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE);
            dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE);
            dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE);
            dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE);
            dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE);
            dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);

            jmask_SSE0 = _mm_and_pd(jmask_SSE0, imask_SSE0);
            jmask_SSE1 = _mm_and_pd(jmask_SSE1, imask_SSE1);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
            rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);

            rinv_SSE0 = _mm_and_pd(rinv_SSE0, jmask_SSE0);
            rinv_SSE1 = _mm_and_pd(rinv_SSE1, jmask_SSE1);

            irsq_SSE0 = _mm_mul_pd(rinv_SSE0, rinv_SSE0);
            irsq_SSE1 = _mm_mul_pd(rinv_SSE1, rinv_SSE1);
            idr4_SSE0 = _mm_mul_pd(irsq_SSE0, irsq_SSE0);
            idr4_SSE1 = _mm_mul_pd(irsq_SSE1, irsq_SSE1);
            idr6_SSE0 = _mm_mul_pd(idr4_SSE0, irsq_SSE0);
            idr6_SSE1 = _mm_mul_pd(idr4_SSE1, irsq_SSE1);

            raj_SSE = _mm_load_pd(gb_radius+j);
            vaj_SSE = _mm_load_pd(vsolv+j);

            rvdw_SSE0 = _mm_add_pd(rai_SSE0, raj_SSE);
            rvdw_SSE1 = _mm_add_pd(rai_SSE1, raj_SSE);

            ratio_SSE0 = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0, rvdw_SSE0)));
            ratio_SSE1 = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1, rvdw_SSE1)));

            ratio_SSE0 = _mm_min_pd(ratio_SSE0, still_p5inv_SSE);
            ratio_SSE1 = _mm_min_pd(ratio_SSE1, still_p5inv_SSE);
            theta_SSE0 = _mm_mul_pd(ratio_SSE0, still_pip5_SSE);
            theta_SSE1 = _mm_mul_pd(ratio_SSE1, still_pip5_SSE);
            gmx_mm_sincos_pd(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
            gmx_mm_sincos_pd(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
            term_SSE0 = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE0));
            term_SSE1 = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE1));
            ccf_SSE0  = _mm_mul_pd(term_SSE0, term_SSE0);
            ccf_SSE1  = _mm_mul_pd(term_SSE1, term_SSE1);
            dccf_SSE0 = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE0),
                                   _mm_mul_pd(sinq_SSE0, theta_SSE0));
            dccf_SSE1 = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE1),
                                   _mm_mul_pd(sinq_SSE1, theta_SSE1));

            prod_SSE  = _mm_mul_pd(still_p4_SSE, vaj_SSE);
            icf4_SSE0 = _mm_mul_pd(ccf_SSE0, idr4_SSE0);
            icf4_SSE1 = _mm_mul_pd(ccf_SSE1, idr4_SSE1);
            icf6_SSE0 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
            icf6_SSE1 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);

            _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
                                            _mm_add_pd(_mm_mul_pd(prod_ai_SSE0, icf4_SSE0),
                                                       _mm_mul_pd(prod_ai_SSE1, icf4_SSE1))));

            gpi_SSE0 = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE, icf4_SSE0));
            gpi_SSE1 = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE, icf4_SSE1));

            /* Save ai->aj and aj->ai chain rule terms */
            _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE1));
            dadx += 2;

            _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE0, icf6_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE1, icf6_SSE1));
            dadx += 2;
        }
        GMX_MM_TRANSPOSE2_PD(gpi_SSE0, gpi_SSE1);
        gpi_SSE0 = _mm_add_pd(gpi_SSE0, gpi_SSE1);
        _mm_store_pd(work+i, _mm_add_pd(gpi_SSE0, _mm_load_pd(work+i)));
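
        /* Reduction sketch: gpi_SSE0 and gpi_SSE1 hold the two j-lane partial
         * sums for atoms i and i+1. The 2x2 transpose regroups the lanes per
         * atom, so after the add gpi_SSE0 contains {sum(i), sum(i+1)}, which
         * is then accumulated into work[i] and work[i+1]. */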
    }

    /* In case we have written anything beyond natoms, move it back.
     * Never mind that we leave stuff above natoms; that will not
     * be accessed later in the routine.
     * In principle this should be a move rather than a sum, but this
     * way we don't have to worry about even/odd offsets...
     */
    for (i = natoms; i < ni1+1+natoms/2; i++)
    {
        work[i-natoms] += work[i];
    }
    /* Parallel summations would go here if ever implemented with DD */

    factor = 0.5 * ONE_4PI_EPS0;
    /* Calculate the radii - should we do all atoms, or just our local ones? */
    for (i = 0; i < natoms; i++)
    {
        if (born->use[i] != 0)
        {
            gpi             = born->gpol[i]+work[i];
            gpi2            = gpi * gpi;
            born->bRad[i]   = factor*gmx_invsqrt(gpi2);
            fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
        }
    }
}
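
/* Reference sketch for the loop above: in the Still model each atom's
 * polarization term is gpi = gpol[i] + sum_j P4*vsolv_j*ccf_ij/r_ij^4, and
 * the Born radius follows as bRad[i] = (0.5*ONE_4PI_EPS0)/gpi. Since gpi is
 * positive here, factor*gmx_invsqrt(gpi*gpi) computes exactly that quotient,
 * and invsqrta[i] caches 1/sqrt(bRad[i]) for the GB interaction kernels. */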
/* Reinstate MSVC optimization */
#ifdef _MSC_VER
#pragma optimize("", on)
#endif
static void
genborn_allvsall_calc_hct_obc_radii_sse2_double(t_forcerec     *fr,
                                                t_mdatoms      *mdatoms,
                                                gmx_genborn_t  *born,
                                                int             gb_algorithm,
                                                gmx_localtop_t *top,
                                                double         *x,
                                                void           *paadata)
{
    gmx_allvsallgb2_data_t *aadata;
    int                     natoms;
    int                     ni0, ni1;
    int                     nj0, nj1, nj2, nj3;
    int                     i, j, k;
    double                 *x_align, *y_align, *z_align;
    double                 *gb_radius, *obc_param, *work, *dadx;
    int                    *jindex;
    int                    *pmask0, *pmask1, *emask0, *emask1;
    double                  rad, min_rad;
    double                  rai, rai_inv, rai_inv2, sum_ai, sum_ai2, sum_ai3, tsum, tchain;
    __m128d ix_SSE0, iy_SSE0, iz_SSE0;
    __m128d ix_SSE1, iy_SSE1, iz_SSE1;
    __m128d gpi_SSE0, rai_SSE0, prod_ai_SSE0;
    __m128d gpi_SSE1, rai_SSE1, prod_ai_SSE1;
    __m128d imask_SSE0, jmask_SSE0;
    __m128d imask_SSE1, jmask_SSE1;
    __m128d jx_SSE, jy_SSE, jz_SSE;
    __m128d dx_SSE0, dy_SSE0, dz_SSE0;
    __m128d dx_SSE1, dy_SSE1, dz_SSE1;
    __m128d rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0;
    __m128d rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1;
    __m128d dr_SSE0, dr_SSE1;
    __m128d raj_SSE, raj_inv_SSE, sk_aj_SSE, sk2_aj_SSE;
    __m128d ccf_SSE0, dccf_SSE0, prod_SSE0;
    __m128d ccf_SSE1, dccf_SSE1, prod_SSE1;
    __m128d icf4_SSE0, icf6_SSE0;
    __m128d icf4_SSE1, icf6_SSE1;
    __m128d oneeighth_SSE, onefourth_SSE, half_SSE, one_SSE, two_SSE, four_SSE;
    __m128d still_p4_SSE, still_p5inv_SSE, still_pip5_SSE;
    __m128d rai_inv_SSE0;
    __m128d rai_inv_SSE1;
    __m128d sk_ai_SSE0, sk2_ai_SSE0, sum_ai_SSE0;
    __m128d sk_ai_SSE1, sk2_ai_SSE1, sum_ai_SSE1;
    __m128d lij_inv_SSE0, sk2_rinv_SSE0;
    __m128d lij_inv_SSE1, sk2_rinv_SSE1;
    __m128d t1_SSE0, t2_SSE0, t3_SSE0, t4_SSE0;
    __m128d t1_SSE1, t2_SSE1, t3_SSE1, t4_SSE1;
    __m128d obc_mask1_SSE0, obc_mask2_SSE0, obc_mask3_SSE0;
    __m128d obc_mask1_SSE1, obc_mask2_SSE1, obc_mask3_SSE1;
    __m128d uij_SSE0, uij2_SSE0, uij3_SSE0;
    __m128d uij_SSE1, uij2_SSE1, uij3_SSE1;
    __m128d lij_SSE0, lij2_SSE0, lij3_SSE0;
    __m128d lij_SSE1, lij2_SSE1, lij3_SSE1;
    __m128d dlij_SSE0, diff2_SSE0, logterm_SSE0;
    __m128d dlij_SSE1, diff2_SSE1, logterm_SSE1;
    __m128d doffset_SSE, tmpSSE;
    natoms = mdatoms->nr;
    ni0    = 0;
    ni1    = mdatoms->homenr;

    aadata = *((gmx_allvsallgb2_data_t **)paadata);

    if (aadata == NULL)
    {
        genborn_allvsall_setup(&aadata, top, born, mdatoms, born->gb_doffset,
                               egbOBC, TRUE, TRUE, TRUE);
        *((gmx_allvsallgb2_data_t **)paadata) = aadata;
    }

    x_align = aadata->x_align;
    y_align = aadata->y_align;
    z_align = aadata->z_align;

    gb_radius = aadata->gb_radius;
    work      = aadata->work;
    jindex    = aadata->jindex_gb;
    dadx      = fr->dadx;
    obc_param = aadata->workparam;
    oneeighth_SSE = _mm_set1_pd(0.125);
    onefourth_SSE = _mm_set1_pd(0.25);
    half_SSE      = _mm_set1_pd(0.5);
    one_SSE       = _mm_set1_pd(1.0);
    two_SSE       = _mm_set1_pd(2.0);
    four_SSE      = _mm_set1_pd(4.0);
    doffset_SSE   = _mm_set1_pd(born->gb_doffset);
    for (i = 0; i < natoms; i++)
    {
        x_align[i] = x[3*i];
        y_align[i] = x[3*i+1];
        z_align[i] = x[3*i+2];
    }

    /* Copy again to allow the j loops to wrap past natoms */
    for (i = 0; i < natoms/2+1; i++)
    {
        x_align[natoms+i] = x_align[i];
        y_align[natoms+i] = y_align[i];
        z_align[natoms+i] = z_align[i];
    }

    for (i = 0; i < natoms+natoms/2+1; i++)
    {
        work[i] = 0;
    }
    for (i = ni0; i < ni1; i += UNROLLI)
    {
        /* We assume shifts are NOT used for all-vs-all interactions */

        /* Load i atom data */
        ix_SSE0 = _mm_load1_pd(x_align+i);
        iy_SSE0 = _mm_load1_pd(y_align+i);
        iz_SSE0 = _mm_load1_pd(z_align+i);
        ix_SSE1 = _mm_load1_pd(x_align+i+1);
        iy_SSE1 = _mm_load1_pd(y_align+i+1);
        iz_SSE1 = _mm_load1_pd(z_align+i+1);

        rai_SSE0     = _mm_load1_pd(gb_radius+i);
        rai_SSE1     = _mm_load1_pd(gb_radius+i+1);
        rai_inv_SSE0 = gmx_mm_inv_pd(rai_SSE0);
        rai_inv_SSE1 = gmx_mm_inv_pd(rai_SSE1);

        sk_ai_SSE0  = _mm_load1_pd(obc_param+i);
        sk_ai_SSE1  = _mm_load1_pd(obc_param+i+1);
        sk2_ai_SSE0 = _mm_mul_pd(sk_ai_SSE0, sk_ai_SSE0);
        sk2_ai_SSE1 = _mm_mul_pd(sk_ai_SSE1, sk_ai_SSE1);

        sum_ai_SSE0 = _mm_setzero_pd();
        sum_ai_SSE1 = _mm_setzero_pd();

        /* Load limits for loop over neighbors */
        nj0 = jindex[4*i];
        nj1 = jindex[4*i+1];
        nj2 = jindex[4*i+2];
        nj3 = jindex[4*i+3];

        pmask0 = aadata->prologue_mask_gb[i];
        pmask1 = aadata->prologue_mask_gb[i+1];
        emask0 = aadata->epilogue_mask[i];
        emask1 = aadata->epilogue_mask[i+1];

        imask_SSE0 = _mm_load1_pd((double *)(aadata->imask+2*i));
        imask_SSE1 = _mm_load1_pd((double *)(aadata->imask+2*i+2));
        /* Prologue part, including exclusion mask */
        for (j = nj0; j < nj1; j += UNROLLJ)
        {
            jmask_SSE0 = _mm_load_pd((double *)pmask0);
            jmask_SSE1 = _mm_load_pd((double *)pmask1);
            pmask0    += 2*UNROLLJ;
            pmask1    += 2*UNROLLJ;

            /* load j atom coordinates */
            jx_SSE = _mm_load_pd(x_align+j);
            jy_SSE = _mm_load_pd(y_align+j);
            jz_SSE = _mm_load_pd(z_align+j);

            /* Calculate distance */
            dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE);
            dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE);
            dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE);
            dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE);
            dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE);
            dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);

            jmask_SSE0 = _mm_and_pd(jmask_SSE0, imask_SSE0);
            jmask_SSE1 = _mm_and_pd(jmask_SSE1, imask_SSE1);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
            rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);

            rinv_SSE0 = _mm_and_pd(rinv_SSE0, jmask_SSE0);
            rinv_SSE1 = _mm_and_pd(rinv_SSE1, jmask_SSE1);

            dr_SSE0 = _mm_mul_pd(rsq_SSE0, rinv_SSE0);
            dr_SSE1 = _mm_mul_pd(rsq_SSE1, rinv_SSE1);

            sk_aj_SSE   = _mm_load_pd(obc_param+j);
            raj_SSE     = _mm_load_pd(gb_radius+j);
            raj_inv_SSE = gmx_mm_inv_pd(raj_SSE);

            /* Evaluate influence of atom aj -> ai */
            t1_SSE0 = _mm_add_pd(dr_SSE0, sk_aj_SSE);
            t1_SSE1 = _mm_add_pd(dr_SSE1, sk_aj_SSE);
            t2_SSE0 = _mm_sub_pd(dr_SSE0, sk_aj_SSE);
            t2_SSE1 = _mm_sub_pd(dr_SSE1, sk_aj_SSE);
            t3_SSE0 = _mm_sub_pd(sk_aj_SSE, dr_SSE0);
            t3_SSE1 = _mm_sub_pd(sk_aj_SSE, dr_SSE1);

            obc_mask1_SSE0 = _mm_cmplt_pd(rai_SSE0, t1_SSE0);
            obc_mask1_SSE1 = _mm_cmplt_pd(rai_SSE1, t1_SSE1);
            obc_mask2_SSE0 = _mm_cmplt_pd(rai_SSE0, t2_SSE0);
            obc_mask2_SSE1 = _mm_cmplt_pd(rai_SSE1, t2_SSE1);
            obc_mask3_SSE0 = _mm_cmplt_pd(rai_SSE0, t3_SSE0);
            obc_mask3_SSE1 = _mm_cmplt_pd(rai_SSE1, t3_SSE1);
            obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0, jmask_SSE0);
            obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1, jmask_SSE1);
            uij_SSE0  = gmx_mm_inv_pd(t1_SSE0);
            uij_SSE1  = gmx_mm_inv_pd(t1_SSE1);
            lij_SSE0  = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
                                   _mm_andnot_pd(obc_mask2_SSE0, rai_inv_SSE0));
            lij_SSE1  = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
                                   _mm_andnot_pd(obc_mask2_SSE1, rai_inv_SSE1));
            dlij_SSE0 = _mm_and_pd(one_SSE, obc_mask2_SSE0);
            dlij_SSE1 = _mm_and_pd(one_SSE, obc_mask2_SSE1);

            uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
            uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
            uij3_SSE0 = _mm_mul_pd(uij2_SSE0, uij_SSE0);
            uij3_SSE1 = _mm_mul_pd(uij2_SSE1, uij_SSE1);
            lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
            lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
            lij3_SSE0 = _mm_mul_pd(lij2_SSE0, lij_SSE0);
            lij3_SSE1 = _mm_mul_pd(lij2_SSE1, lij_SSE1);

            diff2_SSE0    = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
            diff2_SSE1    = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
            lij_inv_SSE0  = gmx_mm_invsqrt_pd(lij2_SSE0);
            lij_inv_SSE1  = gmx_mm_invsqrt_pd(lij2_SSE1);
            sk2_aj_SSE    = _mm_mul_pd(sk_aj_SSE, sk_aj_SSE);
            sk2_rinv_SSE0 = _mm_mul_pd(sk2_aj_SSE, rinv_SSE0);
            sk2_rinv_SSE1 = _mm_mul_pd(sk2_aj_SSE, rinv_SSE1);
            prod_SSE0     = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
            prod_SSE1     = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);

            logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
            logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));

            t1_SSE0 = _mm_sub_pd(lij_SSE0, uij_SSE0);
            t1_SSE1 = _mm_sub_pd(lij_SSE1, uij_SSE1);
            t2_SSE0 = _mm_mul_pd(diff2_SSE0,
                                 _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
                                            prod_SSE0));
            t2_SSE1 = _mm_mul_pd(diff2_SSE1,
                                 _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
                                            prod_SSE1));

            t3_SSE0 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
            t3_SSE1 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
            t1_SSE0 = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
            t1_SSE1 = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
            t4_SSE0 = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE0, lij_SSE0));
            t4_SSE1 = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE1, lij_SSE1));
            t4_SSE0 = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
            t4_SSE1 = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
            t1_SSE0 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
            t1_SSE1 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));

            sum_ai_SSE0 = _mm_add_pd(sum_ai_SSE0, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
            sum_ai_SSE1 = _mm_add_pd(sum_ai_SSE1, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
            t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
                                 _mm_mul_pd(prod_SSE0, lij3_SSE0));
            t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
                                 _mm_mul_pd(prod_SSE1, lij3_SSE1));
            t1_SSE0 = _mm_sub_pd(t1_SSE0,
                                 _mm_mul_pd(onefourth_SSE,
                                            _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
                                                       _mm_mul_pd(lij3_SSE0, dr_SSE0))));
            t1_SSE1 = _mm_sub_pd(t1_SSE1,
                                 _mm_mul_pd(onefourth_SSE,
                                            _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
                                                       _mm_mul_pd(lij3_SSE1, dr_SSE1))));

            t2_SSE0 = _mm_mul_pd(onefourth_SSE,
                                 _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
                                            _mm_mul_pd(uij3_SSE0, dr_SSE0)));
            t2_SSE1 = _mm_mul_pd(onefourth_SSE,
                                 _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
                                            _mm_mul_pd(uij3_SSE1, dr_SSE1)));
            t2_SSE0 = _mm_sub_pd(t2_SSE0,
                                 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
                                            _mm_mul_pd(prod_SSE0, uij3_SSE0)));
            t2_SSE1 = _mm_sub_pd(t2_SSE1,
                                 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
                                            _mm_mul_pd(prod_SSE1, uij3_SSE1)));
            t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
                                 _mm_mul_pd(rinv_SSE0, rinv_SSE0));
            t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
                                 _mm_mul_pd(rinv_SSE1, rinv_SSE1));
            t3_SSE0 = _mm_sub_pd(t3_SSE0,
                                 _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
                                            _mm_add_pd(one_SSE,
                                                       _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
            t3_SSE1 = _mm_sub_pd(t3_SSE1,
                                 _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
                                            _mm_add_pd(one_SSE,
                                                       _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));

            t1_SSE0 = _mm_mul_pd(rinv_SSE0,
                                 _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
                                            _mm_add_pd(t2_SSE0, t3_SSE0)));
            t1_SSE1 = _mm_mul_pd(rinv_SSE1,
                                 _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
                                            _mm_add_pd(t2_SSE1, t3_SSE1)));

            _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
            dadx += 2;
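
            /* Sketch of the quantity being accumulated: with the inverse
             * integration bounds lij = 1/max(rai, r-sk) and uij = 1/(r+sk),
             * t1+t2+t3 above is the closed-form HCT descreening integral
             *
             *   I = lij - uij + (uij^2 - lij^2)*(r/4 - sk^2/(4r))
             *       + ln(uij/lij)/(2r),
             *
             * t4 corrects for atom i lying inside the descreening sphere,
             * and the values just stored in dadx are dI/dr chain-rule terms
             * for the force calculation. */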
            /* Evaluate influence of atom ai -> aj */
            t1_SSE0 = _mm_add_pd(dr_SSE0, sk_ai_SSE0);
            t1_SSE1 = _mm_add_pd(dr_SSE1, sk_ai_SSE1);
            t2_SSE0 = _mm_sub_pd(dr_SSE0, sk_ai_SSE0);
            t2_SSE1 = _mm_sub_pd(dr_SSE1, sk_ai_SSE1);
            t3_SSE0 = _mm_sub_pd(sk_ai_SSE0, dr_SSE0);
            t3_SSE1 = _mm_sub_pd(sk_ai_SSE1, dr_SSE1);

            obc_mask1_SSE0 = _mm_cmplt_pd(raj_SSE, t1_SSE0);
            obc_mask1_SSE1 = _mm_cmplt_pd(raj_SSE, t1_SSE1);
            obc_mask2_SSE0 = _mm_cmplt_pd(raj_SSE, t2_SSE0);
            obc_mask2_SSE1 = _mm_cmplt_pd(raj_SSE, t2_SSE1);
            obc_mask3_SSE0 = _mm_cmplt_pd(raj_SSE, t3_SSE0);
            obc_mask3_SSE1 = _mm_cmplt_pd(raj_SSE, t3_SSE1);
            obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0, jmask_SSE0);
            obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1, jmask_SSE1);

            uij_SSE0  = gmx_mm_inv_pd(t1_SSE0);
            uij_SSE1  = gmx_mm_inv_pd(t1_SSE1);
            lij_SSE0  = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
                                   _mm_andnot_pd(obc_mask2_SSE0, raj_inv_SSE));
            lij_SSE1  = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
                                   _mm_andnot_pd(obc_mask2_SSE1, raj_inv_SSE));
            dlij_SSE0 = _mm_and_pd(one_SSE, obc_mask2_SSE0);
            dlij_SSE1 = _mm_and_pd(one_SSE, obc_mask2_SSE1);

            uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
            uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
            uij3_SSE0 = _mm_mul_pd(uij2_SSE0, uij_SSE0);
            uij3_SSE1 = _mm_mul_pd(uij2_SSE1, uij_SSE1);
            lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
            lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
            lij3_SSE0 = _mm_mul_pd(lij2_SSE0, lij_SSE0);
            lij3_SSE1 = _mm_mul_pd(lij2_SSE1, lij_SSE1);

            diff2_SSE0    = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
            diff2_SSE1    = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
            lij_inv_SSE0  = gmx_mm_invsqrt_pd(lij2_SSE0);
            lij_inv_SSE1  = gmx_mm_invsqrt_pd(lij2_SSE1);
            sk2_rinv_SSE0 = _mm_mul_pd(sk2_ai_SSE0, rinv_SSE0);
            sk2_rinv_SSE1 = _mm_mul_pd(sk2_ai_SSE1, rinv_SSE1);
            prod_SSE0     = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
            prod_SSE1     = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);

            logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
            logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
            t1_SSE0      = _mm_sub_pd(lij_SSE0, uij_SSE0);
            t1_SSE1      = _mm_sub_pd(lij_SSE1, uij_SSE1);
            t2_SSE0      = _mm_mul_pd(diff2_SSE0,
                                      _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
                                                 prod_SSE0));
            t2_SSE1      = _mm_mul_pd(diff2_SSE1,
                                      _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
                                                 prod_SSE1));
            t3_SSE0      = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
            t3_SSE1      = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
            t1_SSE0      = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
            t1_SSE1      = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
            t4_SSE0      = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE0));
            t4_SSE1      = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE1));
            t4_SSE0      = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
            t4_SSE1      = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
            t1_SSE0      = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
            t1_SSE1      = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));
            _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
                                            _mm_add_pd(_mm_and_pd(t1_SSE0, obc_mask1_SSE0),
                                                       _mm_and_pd(t1_SSE1, obc_mask1_SSE1))));

            t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
                                 _mm_mul_pd(prod_SSE0, lij3_SSE0));
            t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
                                 _mm_mul_pd(prod_SSE1, lij3_SSE1));
            t1_SSE0 = _mm_sub_pd(t1_SSE0,
                                 _mm_mul_pd(onefourth_SSE,
                                            _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
                                                       _mm_mul_pd(lij3_SSE0, dr_SSE0))));
            t1_SSE1 = _mm_sub_pd(t1_SSE1,
                                 _mm_mul_pd(onefourth_SSE,
                                            _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
                                                       _mm_mul_pd(lij3_SSE1, dr_SSE1))));
            t2_SSE0 = _mm_mul_pd(onefourth_SSE,
                                 _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
                                            _mm_mul_pd(uij3_SSE0, dr_SSE0)));
            t2_SSE1 = _mm_mul_pd(onefourth_SSE,
                                 _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
                                            _mm_mul_pd(uij3_SSE1, dr_SSE1)));
            t2_SSE0 = _mm_sub_pd(t2_SSE0,
                                 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
                                            _mm_mul_pd(prod_SSE0, uij3_SSE0)));
            t2_SSE1 = _mm_sub_pd(t2_SSE1,
                                 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
                                            _mm_mul_pd(prod_SSE1, uij3_SSE1)));

            t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
                                 _mm_mul_pd(rinv_SSE0, rinv_SSE0));
            t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
                                 _mm_mul_pd(rinv_SSE1, rinv_SSE1));

            t3_SSE0 = _mm_sub_pd(t3_SSE0,
                                 _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
                                            _mm_add_pd(one_SSE,
                                                       _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
            t3_SSE1 = _mm_sub_pd(t3_SSE1,
                                 _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
                                            _mm_add_pd(one_SSE,
                                                       _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));

            t1_SSE0 = _mm_mul_pd(rinv_SSE0,
                                 _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
                                            _mm_add_pd(t2_SSE0, t3_SSE0)));
            t1_SSE1 = _mm_mul_pd(rinv_SSE1,
                                 _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
                                            _mm_add_pd(t2_SSE1, t3_SSE1)));

            _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
            dadx += 2;
        }
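
        /* Note the asymmetry in bookkeeping above: the aj->ai contribution is
         * kept in the sum_ai_SSE* accumulators for atom i, while the symmetric
         * ai->aj term is added straight into work[j], so each pair needs to be
         * visited only once per radius sweep. */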
        /* Main part, no exclusions */
        for (j = nj1; j < nj2; j += UNROLLJ)
        {
            /* load j atom coordinates */
            jx_SSE = _mm_load_pd(x_align+j);
            jy_SSE = _mm_load_pd(y_align+j);
            jz_SSE = _mm_load_pd(z_align+j);

            /* Calculate distance */
            dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE);
            dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE);
            dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE);
            dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE);
            dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE);
            dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
            rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);

            rinv_SSE0 = _mm_and_pd(rinv_SSE0, imask_SSE0);
            rinv_SSE1 = _mm_and_pd(rinv_SSE1, imask_SSE1);

            dr_SSE0 = _mm_mul_pd(rsq_SSE0, rinv_SSE0);
            dr_SSE1 = _mm_mul_pd(rsq_SSE1, rinv_SSE1);

            sk_aj_SSE = _mm_load_pd(obc_param+j);
            raj_SSE   = _mm_load_pd(gb_radius+j);

            raj_inv_SSE = gmx_mm_inv_pd(raj_SSE);

            /* Evaluate influence of atom aj -> ai */
            t1_SSE0 = _mm_add_pd(dr_SSE0, sk_aj_SSE);
            t1_SSE1 = _mm_add_pd(dr_SSE1, sk_aj_SSE);
            t2_SSE0 = _mm_sub_pd(dr_SSE0, sk_aj_SSE);
            t2_SSE1 = _mm_sub_pd(dr_SSE1, sk_aj_SSE);
            t3_SSE0 = _mm_sub_pd(sk_aj_SSE, dr_SSE0);
            t3_SSE1 = _mm_sub_pd(sk_aj_SSE, dr_SSE1);

            obc_mask1_SSE0 = _mm_cmplt_pd(rai_SSE0, t1_SSE0);
            obc_mask1_SSE1 = _mm_cmplt_pd(rai_SSE1, t1_SSE1);
            obc_mask2_SSE0 = _mm_cmplt_pd(rai_SSE0, t2_SSE0);
            obc_mask2_SSE1 = _mm_cmplt_pd(rai_SSE1, t2_SSE1);
            obc_mask3_SSE0 = _mm_cmplt_pd(rai_SSE0, t3_SSE0);
            obc_mask3_SSE1 = _mm_cmplt_pd(rai_SSE1, t3_SSE1);
            obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0, imask_SSE0);
            obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1, imask_SSE1);

            uij_SSE0  = gmx_mm_inv_pd(t1_SSE0);
            uij_SSE1  = gmx_mm_inv_pd(t1_SSE1);
            lij_SSE0  = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
                                   _mm_andnot_pd(obc_mask2_SSE0, rai_inv_SSE0));
            lij_SSE1  = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
                                   _mm_andnot_pd(obc_mask2_SSE1, rai_inv_SSE1));
            dlij_SSE0 = _mm_and_pd(one_SSE, obc_mask2_SSE0);
            dlij_SSE1 = _mm_and_pd(one_SSE, obc_mask2_SSE1);

            uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
            uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
            uij3_SSE0 = _mm_mul_pd(uij2_SSE0, uij_SSE0);
            uij3_SSE1 = _mm_mul_pd(uij2_SSE1, uij_SSE1);
            lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
            lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
            lij3_SSE0 = _mm_mul_pd(lij2_SSE0, lij_SSE0);
            lij3_SSE1 = _mm_mul_pd(lij2_SSE1, lij_SSE1);

            diff2_SSE0    = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
            diff2_SSE1    = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
            lij_inv_SSE0  = gmx_mm_invsqrt_pd(lij2_SSE0);
            lij_inv_SSE1  = gmx_mm_invsqrt_pd(lij2_SSE1);
            sk2_aj_SSE    = _mm_mul_pd(sk_aj_SSE, sk_aj_SSE);
            sk2_rinv_SSE0 = _mm_mul_pd(sk2_aj_SSE, rinv_SSE0);
            sk2_rinv_SSE1 = _mm_mul_pd(sk2_aj_SSE, rinv_SSE1);
            prod_SSE0     = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
            prod_SSE1     = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);

            logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
            logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));

            t1_SSE0 = _mm_sub_pd(lij_SSE0, uij_SSE0);
            t1_SSE1 = _mm_sub_pd(lij_SSE1, uij_SSE1);
            t2_SSE0 = _mm_mul_pd(diff2_SSE0,
                                 _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
                                            prod_SSE0));
            t2_SSE1 = _mm_mul_pd(diff2_SSE1,
                                 _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
                                            prod_SSE1));

            t3_SSE0 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
            t3_SSE1 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
            t1_SSE0 = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
            t1_SSE1 = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
            t4_SSE0 = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE0, lij_SSE0));
            t4_SSE1 = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE1, lij_SSE1));
            t4_SSE0 = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
            t4_SSE1 = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
            t1_SSE0 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
            t1_SSE1 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));

            sum_ai_SSE0 = _mm_add_pd(sum_ai_SSE0, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
            sum_ai_SSE1 = _mm_add_pd(sum_ai_SSE1, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));

            t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
                                 _mm_mul_pd(prod_SSE0, lij3_SSE0));
            t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
                                 _mm_mul_pd(prod_SSE1, lij3_SSE1));

            t1_SSE0 = _mm_sub_pd(t1_SSE0,
                                 _mm_mul_pd(onefourth_SSE,
                                            _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
                                                       _mm_mul_pd(lij3_SSE0, dr_SSE0))));
            t1_SSE1 = _mm_sub_pd(t1_SSE1,
                                 _mm_mul_pd(onefourth_SSE,
                                            _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
                                                       _mm_mul_pd(lij3_SSE1, dr_SSE1))));

            t2_SSE0 = _mm_mul_pd(onefourth_SSE,
                                 _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
                                            _mm_mul_pd(uij3_SSE0, dr_SSE0)));
            t2_SSE1 = _mm_mul_pd(onefourth_SSE,
                                 _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
                                            _mm_mul_pd(uij3_SSE1, dr_SSE1)));
            t2_SSE0 = _mm_sub_pd(t2_SSE0,
                                 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
                                            _mm_mul_pd(prod_SSE0, uij3_SSE0)));
            t2_SSE1 = _mm_sub_pd(t2_SSE1,
                                 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
                                            _mm_mul_pd(prod_SSE1, uij3_SSE1)));
            t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
                                 _mm_mul_pd(rinv_SSE0, rinv_SSE0));
            t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
                                 _mm_mul_pd(rinv_SSE1, rinv_SSE1));
            t3_SSE0 = _mm_sub_pd(t3_SSE0,
                                 _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
                                            _mm_add_pd(one_SSE,
                                                       _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
            t3_SSE1 = _mm_sub_pd(t3_SSE1,
                                 _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
                                            _mm_add_pd(one_SSE,
                                                       _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));

            t1_SSE0 = _mm_mul_pd(rinv_SSE0,
                                 _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
                                            _mm_add_pd(t2_SSE0, t3_SSE0)));
            t1_SSE1 = _mm_mul_pd(rinv_SSE1,
                                 _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
                                            _mm_add_pd(t2_SSE1, t3_SSE1)));

            _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
            dadx += 2;
            /* Evaluate influence of atom ai -> aj */
            t1_SSE0 = _mm_add_pd(dr_SSE0, sk_ai_SSE0);
            t1_SSE1 = _mm_add_pd(dr_SSE1, sk_ai_SSE1);
            t2_SSE0 = _mm_sub_pd(dr_SSE0, sk_ai_SSE0);
            t2_SSE1 = _mm_sub_pd(dr_SSE1, sk_ai_SSE1);
            t3_SSE0 = _mm_sub_pd(sk_ai_SSE0, dr_SSE0);
            t3_SSE1 = _mm_sub_pd(sk_ai_SSE1, dr_SSE1);

            obc_mask1_SSE0 = _mm_cmplt_pd(raj_SSE, t1_SSE0);
            obc_mask1_SSE1 = _mm_cmplt_pd(raj_SSE, t1_SSE1);
            obc_mask2_SSE0 = _mm_cmplt_pd(raj_SSE, t2_SSE0);
            obc_mask2_SSE1 = _mm_cmplt_pd(raj_SSE, t2_SSE1);
            obc_mask3_SSE0 = _mm_cmplt_pd(raj_SSE, t3_SSE0);
            obc_mask3_SSE1 = _mm_cmplt_pd(raj_SSE, t3_SSE1);
            obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0, imask_SSE0);
            obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1, imask_SSE1);

            uij_SSE0  = gmx_mm_inv_pd(t1_SSE0);
            uij_SSE1  = gmx_mm_inv_pd(t1_SSE1);
            lij_SSE0  = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
                                   _mm_andnot_pd(obc_mask2_SSE0, raj_inv_SSE));
            lij_SSE1  = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
                                   _mm_andnot_pd(obc_mask2_SSE1, raj_inv_SSE));
            dlij_SSE0 = _mm_and_pd(one_SSE, obc_mask2_SSE0);
            dlij_SSE1 = _mm_and_pd(one_SSE, obc_mask2_SSE1);

            uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
            uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
            uij3_SSE0 = _mm_mul_pd(uij2_SSE0, uij_SSE0);
            uij3_SSE1 = _mm_mul_pd(uij2_SSE1, uij_SSE1);
            lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
            lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
            lij3_SSE0 = _mm_mul_pd(lij2_SSE0, lij_SSE0);
            lij3_SSE1 = _mm_mul_pd(lij2_SSE1, lij_SSE1);

            diff2_SSE0    = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
            diff2_SSE1    = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
            lij_inv_SSE0  = gmx_mm_invsqrt_pd(lij2_SSE0);
            lij_inv_SSE1  = gmx_mm_invsqrt_pd(lij2_SSE1);
            sk2_rinv_SSE0 = _mm_mul_pd(sk2_ai_SSE0, rinv_SSE0);
            sk2_rinv_SSE1 = _mm_mul_pd(sk2_ai_SSE1, rinv_SSE1);
            prod_SSE0     = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
            prod_SSE1     = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);

            logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
            logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
            t1_SSE0      = _mm_sub_pd(lij_SSE0, uij_SSE0);
            t1_SSE1      = _mm_sub_pd(lij_SSE1, uij_SSE1);
            t2_SSE0      = _mm_mul_pd(diff2_SSE0,
                                      _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
                                                 prod_SSE0));
            t2_SSE1      = _mm_mul_pd(diff2_SSE1,
                                      _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
                                                 prod_SSE1));
            t3_SSE0      = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
            t3_SSE1      = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
            t1_SSE0      = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
            t1_SSE1      = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
            t4_SSE0      = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE0));
            t4_SSE1      = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE1));
            t4_SSE0      = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
            t4_SSE1      = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
            t1_SSE0      = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
            t1_SSE1      = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));

            _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
                                            _mm_add_pd(_mm_and_pd(t1_SSE0, obc_mask1_SSE0),
                                                       _mm_and_pd(t1_SSE1, obc_mask1_SSE1))));

            t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
                                 _mm_mul_pd(prod_SSE0, lij3_SSE0));
            t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
                                 _mm_mul_pd(prod_SSE1, lij3_SSE1));
            t1_SSE0 = _mm_sub_pd(t1_SSE0,
                                 _mm_mul_pd(onefourth_SSE,
                                            _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
                                                       _mm_mul_pd(lij3_SSE0, dr_SSE0))));
            t1_SSE1 = _mm_sub_pd(t1_SSE1,
                                 _mm_mul_pd(onefourth_SSE,
                                            _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
                                                       _mm_mul_pd(lij3_SSE1, dr_SSE1))));
            t2_SSE0 = _mm_mul_pd(onefourth_SSE,
                                 _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
                                            _mm_mul_pd(uij3_SSE0, dr_SSE0)));
            t2_SSE1 = _mm_mul_pd(onefourth_SSE,
                                 _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
                                            _mm_mul_pd(uij3_SSE1, dr_SSE1)));
            t2_SSE0 = _mm_sub_pd(t2_SSE0,
                                 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
                                            _mm_mul_pd(prod_SSE0, uij3_SSE0)));
            t2_SSE1 = _mm_sub_pd(t2_SSE1,
                                 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
                                            _mm_mul_pd(prod_SSE1, uij3_SSE1)));

            t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
                                 _mm_mul_pd(rinv_SSE0, rinv_SSE0));
            t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
                                 _mm_mul_pd(rinv_SSE1, rinv_SSE1));

            t3_SSE0 = _mm_sub_pd(t3_SSE0,
                                 _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
                                            _mm_add_pd(one_SSE,
                                                       _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
            t3_SSE1 = _mm_sub_pd(t3_SSE1,
                                 _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
                                            _mm_add_pd(one_SSE,
                                                       _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));

            t1_SSE0 = _mm_mul_pd(rinv_SSE0,
                                 _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
                                            _mm_add_pd(t2_SSE0, t3_SSE0)));
            t1_SSE1 = _mm_mul_pd(rinv_SSE1,
                                 _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
                                            _mm_add_pd(t2_SSE1, t3_SSE1)));

            _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
            dadx += 2;
        }
1924 /* Epilogue part, including exclusion mask */
1925 for (j = nj2; j < nj3; j += UNROLLJ)
            jmask_SSE0 = _mm_load_pd((double *)emask0);
            jmask_SSE1 = _mm_load_pd((double *)emask1);
            emask0    += 2*UNROLLJ;
            emask1    += 2*UNROLLJ;

            /* load j atom coordinates */
            jx_SSE = _mm_load_pd(x_align+j);
            jy_SSE = _mm_load_pd(y_align+j);
            jz_SSE = _mm_load_pd(z_align+j);

            /* Calculate distance */
            dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE);
            dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE);
            dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE);
            dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE);
            dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE);
            dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);

            jmask_SSE0 = _mm_and_pd(jmask_SSE0, imask_SSE0);
            jmask_SSE1 = _mm_and_pd(jmask_SSE1, imask_SSE1);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
            rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);

            rinv_SSE0 = _mm_and_pd(rinv_SSE0, jmask_SSE0);
            rinv_SSE1 = _mm_and_pd(rinv_SSE1, jmask_SSE1);

            dr_SSE0 = _mm_mul_pd(rsq_SSE0, rinv_SSE0);
            dr_SSE1 = _mm_mul_pd(rsq_SSE1, rinv_SSE1);

            sk_aj_SSE   = _mm_load_pd(obc_param+j);
            raj_SSE     = _mm_load_pd(gb_radius+j);
            raj_inv_SSE = gmx_mm_inv_pd(raj_SSE);

            /* Evaluate influence of atom aj -> ai */
            t1_SSE0 = _mm_add_pd(dr_SSE0, sk_aj_SSE);
            t1_SSE1 = _mm_add_pd(dr_SSE1, sk_aj_SSE);
            t2_SSE0 = _mm_sub_pd(dr_SSE0, sk_aj_SSE);
            t2_SSE1 = _mm_sub_pd(dr_SSE1, sk_aj_SSE);
            t3_SSE0 = _mm_sub_pd(sk_aj_SSE, dr_SSE0);
            t3_SSE1 = _mm_sub_pd(sk_aj_SSE, dr_SSE1);

            obc_mask1_SSE0 = _mm_cmplt_pd(rai_SSE0, t1_SSE0);
            obc_mask1_SSE1 = _mm_cmplt_pd(rai_SSE1, t1_SSE1);
            obc_mask2_SSE0 = _mm_cmplt_pd(rai_SSE0, t2_SSE0);
            obc_mask2_SSE1 = _mm_cmplt_pd(rai_SSE1, t2_SSE1);
            obc_mask3_SSE0 = _mm_cmplt_pd(rai_SSE0, t3_SSE0);
            obc_mask3_SSE1 = _mm_cmplt_pd(rai_SSE1, t3_SSE1);
            obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0, jmask_SSE0);
            obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1, jmask_SSE1);

            uij_SSE0 = gmx_mm_inv_pd(t1_SSE0);
            uij_SSE1 = gmx_mm_inv_pd(t1_SSE1);
            lij_SSE0 = _mm_or_pd(_mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
                                 _mm_andnot_pd(obc_mask2_SSE0, rai_inv_SSE0));
            lij_SSE1 = _mm_or_pd(_mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
                                 _mm_andnot_pd(obc_mask2_SSE1, rai_inv_SSE1));

            dlij_SSE0 = _mm_and_pd(one_SSE, obc_mask2_SSE0);
            dlij_SSE1 = _mm_and_pd(one_SSE, obc_mask2_SSE1);
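
            /* Integration limits for the descreening integral: U = r + sk_aj
             * always, while L = r - sk_aj only when ai lies outside the
             * descreening sphere of aj (obc_mask2); otherwise L = rai and
             * dlij = 0, since L then no longer depends on r. Note that
             * uij/lij hold the reciprocals 1/U and 1/L.
             */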

            uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
            uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
            uij3_SSE0 = _mm_mul_pd(uij2_SSE0, uij_SSE0);
            uij3_SSE1 = _mm_mul_pd(uij2_SSE1, uij_SSE1);
            lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
            lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
            lij3_SSE0 = _mm_mul_pd(lij2_SSE0, lij_SSE0);
            lij3_SSE1 = _mm_mul_pd(lij2_SSE1, lij_SSE1);

            diff2_SSE0    = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
            diff2_SSE1    = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
            lij_inv_SSE0  = gmx_mm_invsqrt_pd(lij2_SSE0);
            lij_inv_SSE1  = gmx_mm_invsqrt_pd(lij2_SSE1);
            sk2_aj_SSE    = _mm_mul_pd(sk_aj_SSE, sk_aj_SSE);
            sk2_rinv_SSE0 = _mm_mul_pd(sk2_aj_SSE, rinv_SSE0);
            sk2_rinv_SSE1 = _mm_mul_pd(sk2_aj_SSE, rinv_SSE1);
            prod_SSE0     = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
            prod_SSE1     = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);

            logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
            logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));

            t1_SSE0 = _mm_sub_pd(lij_SSE0, uij_SSE0);
            t1_SSE1 = _mm_sub_pd(lij_SSE1, uij_SSE1);
            t2_SSE0 = _mm_mul_pd(diff2_SSE0,
                                 _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
                                            prod_SSE0));
            t2_SSE1 = _mm_mul_pd(diff2_SSE1,
                                 _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
                                            prod_SSE1));
            t3_SSE0 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
            t3_SSE1 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
            t1_SSE0 = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
            t1_SSE1 = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
            t4_SSE0 = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE0, lij_SSE0));
            t4_SSE1 = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE1, lij_SSE1));
            t4_SSE0 = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
            t4_SSE1 = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
            t1_SSE0 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
            t1_SSE1 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));

            sum_ai_SSE0 = _mm_add_pd(sum_ai_SSE0, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
            sum_ai_SSE1 = _mm_add_pd(sum_ai_SSE1, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
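
            /* In scalar form, the quantity just accumulated is the standard
             * HCT/OBC pair integral (with diff2 = uij^2 - lij^2):
             *   I = 0.5*( lij - uij + diff2*(r/4 - sk_aj^2/(4*r))
             *             + log(uij/lij)/(2*r) + t4 ),
             * where the t4 term contributes only when aj's descreening
             * sphere fully engulfs ai (sk_aj - r > rai, obc_mask3).
             */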

            t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
                                 _mm_mul_pd(prod_SSE0, lij3_SSE0));
            t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
                                 _mm_mul_pd(prod_SSE1, lij3_SSE1));
            t1_SSE0 = _mm_sub_pd(t1_SSE0,
                                 _mm_mul_pd(onefourth_SSE,
                                            _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
                                                       _mm_mul_pd(lij3_SSE0, dr_SSE0))));
            t1_SSE1 = _mm_sub_pd(t1_SSE1,
                                 _mm_mul_pd(onefourth_SSE,
                                            _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
                                                       _mm_mul_pd(lij3_SSE1, dr_SSE1))));

            t2_SSE0 = _mm_mul_pd(onefourth_SSE,
                                 _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
                                            _mm_mul_pd(uij3_SSE0, dr_SSE0)));
            t2_SSE1 = _mm_mul_pd(onefourth_SSE,
                                 _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
                                            _mm_mul_pd(uij3_SSE1, dr_SSE1)));
            t2_SSE0 = _mm_sub_pd(t2_SSE0,
                                 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
                                            _mm_mul_pd(prod_SSE0, uij3_SSE0)));
            t2_SSE1 = _mm_sub_pd(t2_SSE1,
                                 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
                                            _mm_mul_pd(prod_SSE1, uij3_SSE1)));
            t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
                                 _mm_mul_pd(rinv_SSE0, rinv_SSE0));
            t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
                                 _mm_mul_pd(rinv_SSE1, rinv_SSE1));
            t3_SSE0 = _mm_sub_pd(t3_SSE0,
                                 _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
                                            _mm_add_pd(one_SSE,
                                                       _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
            t3_SSE1 = _mm_sub_pd(t3_SSE1,
                                 _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
                                            _mm_add_pd(one_SSE,
                                                       _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));
            t1_SSE0 = _mm_mul_pd(rinv_SSE0,
                                 _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
                                            _mm_add_pd(t2_SSE0, t3_SSE0)));
            t1_SSE1 = _mm_mul_pd(rinv_SSE1,
                                 _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
                                            _mm_add_pd(t2_SSE1, t3_SSE1)));
            _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
            dadx += 2;
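
            /* As in the main loop above, excluded or non-interacting pairs
             * store zero derivatives, keeping dadx in lockstep with the
             * loads in the chain-rule kernel.
             */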

            /* Evaluate influence of atom ai -> aj */
            t1_SSE0 = _mm_add_pd(dr_SSE0, sk_ai_SSE0);
            t1_SSE1 = _mm_add_pd(dr_SSE1, sk_ai_SSE1);
            t2_SSE0 = _mm_sub_pd(dr_SSE0, sk_ai_SSE0);
            t2_SSE1 = _mm_sub_pd(dr_SSE1, sk_ai_SSE1);
            t3_SSE0 = _mm_sub_pd(sk_ai_SSE0, dr_SSE0);
            t3_SSE1 = _mm_sub_pd(sk_ai_SSE1, dr_SSE1);

            obc_mask1_SSE0 = _mm_cmplt_pd(raj_SSE, t1_SSE0);
            obc_mask1_SSE1 = _mm_cmplt_pd(raj_SSE, t1_SSE1);
            obc_mask2_SSE0 = _mm_cmplt_pd(raj_SSE, t2_SSE0);
            obc_mask2_SSE1 = _mm_cmplt_pd(raj_SSE, t2_SSE1);
            obc_mask3_SSE0 = _mm_cmplt_pd(raj_SSE, t3_SSE0);
            obc_mask3_SSE1 = _mm_cmplt_pd(raj_SSE, t3_SSE1);
            obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0, jmask_SSE0);
            obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1, jmask_SSE1);

            uij_SSE0 = gmx_mm_inv_pd(t1_SSE0);
            uij_SSE1 = gmx_mm_inv_pd(t1_SSE1);
            lij_SSE0 = _mm_or_pd(_mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
                                 _mm_andnot_pd(obc_mask2_SSE0, raj_inv_SSE));
            lij_SSE1 = _mm_or_pd(_mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
                                 _mm_andnot_pd(obc_mask2_SSE1, raj_inv_SSE));

            dlij_SSE0 = _mm_and_pd(one_SSE, obc_mask2_SSE0);
            dlij_SSE1 = _mm_and_pd(one_SSE, obc_mask2_SSE1);

            uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
            uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
            uij3_SSE0 = _mm_mul_pd(uij2_SSE0, uij_SSE0);
            uij3_SSE1 = _mm_mul_pd(uij2_SSE1, uij_SSE1);
            lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
            lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
            lij3_SSE0 = _mm_mul_pd(lij2_SSE0, lij_SSE0);
            lij3_SSE1 = _mm_mul_pd(lij2_SSE1, lij_SSE1);

            diff2_SSE0    = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
            diff2_SSE1    = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
            lij_inv_SSE0  = gmx_mm_invsqrt_pd(lij2_SSE0);
            lij_inv_SSE1  = gmx_mm_invsqrt_pd(lij2_SSE1);
            sk2_rinv_SSE0 = _mm_mul_pd(sk2_ai_SSE0, rinv_SSE0);
            sk2_rinv_SSE1 = _mm_mul_pd(sk2_ai_SSE1, rinv_SSE1);
            prod_SSE0     = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
            prod_SSE1     = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);

            logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
            logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
            t1_SSE0      = _mm_sub_pd(lij_SSE0, uij_SSE0);
            t1_SSE1      = _mm_sub_pd(lij_SSE1, uij_SSE1);
            t2_SSE0 = _mm_mul_pd(diff2_SSE0,
                                 _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
                                            prod_SSE0));
            t2_SSE1 = _mm_mul_pd(diff2_SSE1,
                                 _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
                                            prod_SSE1));
            t3_SSE0 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
            t3_SSE1 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
            t1_SSE0 = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
            t1_SSE1 = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
            t4_SSE0 = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE0));
            t4_SSE1 = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE1));
            t4_SSE0 = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
            t4_SSE1 = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
            t1_SSE0 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
            t1_SSE1 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));

            _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
                                            _mm_add_pd(_mm_and_pd(t1_SSE0, obc_mask1_SSE0),
                                                       _mm_and_pd(t1_SSE1, obc_mask1_SSE1))));
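
            /* The symmetric ai->aj term is scattered to work[j] right away,
             * while the ai-side sums stay in registers (sum_ai_SSE0/1) and
             * are reduced into work[i] once the j loops are done.
             */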

            t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
                                 _mm_mul_pd(prod_SSE0, lij3_SSE0));
            t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
                                 _mm_mul_pd(prod_SSE1, lij3_SSE1));

            t1_SSE0 = _mm_sub_pd(t1_SSE0,
                                 _mm_mul_pd(onefourth_SSE,
                                            _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
                                                       _mm_mul_pd(lij3_SSE0, dr_SSE0))));
            t1_SSE1 = _mm_sub_pd(t1_SSE1,
                                 _mm_mul_pd(onefourth_SSE,
                                            _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
                                                       _mm_mul_pd(lij3_SSE1, dr_SSE1))));
            t2_SSE0 = _mm_mul_pd(onefourth_SSE,
                                 _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
                                            _mm_mul_pd(uij3_SSE0, dr_SSE0)));
            t2_SSE1 = _mm_mul_pd(onefourth_SSE,
                                 _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
                                            _mm_mul_pd(uij3_SSE1, dr_SSE1)));
            t2_SSE0 = _mm_sub_pd(t2_SSE0,
                                 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
                                            _mm_mul_pd(prod_SSE0, uij3_SSE0)));
            t2_SSE1 = _mm_sub_pd(t2_SSE1,
                                 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
                                            _mm_mul_pd(prod_SSE1, uij3_SSE1)));

            t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
                                 _mm_mul_pd(rinv_SSE0, rinv_SSE0));
            t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
                                 _mm_mul_pd(rinv_SSE1, rinv_SSE1));
            t3_SSE0 = _mm_sub_pd(t3_SSE0,
                                 _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
                                            _mm_add_pd(one_SSE,
                                                       _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
            t3_SSE1 = _mm_sub_pd(t3_SSE1,
                                 _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
                                            _mm_add_pd(one_SSE,
                                                       _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));
            t1_SSE0 = _mm_mul_pd(rinv_SSE0,
                                 _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
                                            _mm_add_pd(t2_SSE0, t3_SSE0)));
            t1_SSE1 = _mm_mul_pd(rinv_SSE1,
                                 _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
                                            _mm_add_pd(t2_SSE1, t3_SSE1)));
            _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
            dadx += 2;
        }

        GMX_MM_TRANSPOSE2_PD(sum_ai_SSE0, sum_ai_SSE1);
        sum_ai_SSE0 = _mm_add_pd(sum_ai_SSE0, sum_ai_SSE1);
        _mm_store_pd(work+i, _mm_add_pd(sum_ai_SSE0, _mm_load_pd(work+i)));
    }
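
    /* Each atom is stored twice in the all-vs-all layout, with the second
     * image offset by natoms; since any i interacts with at most natoms/2
     * neighbors, only the first natoms/2+1 entries can have picked up
     * contributions in the second image, and they are folded back here.
     */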
    for (i = 0; i < natoms/2+1; i++)
    {
        work[i] += work[natoms+i];
    }

    /* Parallel summations would go here if ever implemented in DD */

    if (gb_algorithm == egbHCT)
    {
        for (i = 0; i < natoms; i++)
        {
            if (born->use[i] != 0)
            {
                rai     = top->atomtypes.gb_radius[mdatoms->typeA[i]] - born->gb_doffset;
                sum_ai  = 1.0/rai - work[i];
                min_rad = rai + born->gb_doffset;
                rad     = 1.0/sum_ai;

                born->bRad[i]   = rad > min_rad ? rad : min_rad;
                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
            }
        }
    }
    else
    {
        /* Calculate the radii */
        for (i = 0; i < natoms; i++)
        {
            if (born->use[i] != 0)
            {
                rai      = top->atomtypes.gb_radius[mdatoms->typeA[i]];
                rai_inv2 = 1.0/rai;
                rai      = rai - born->gb_doffset;
                rai_inv  = 1.0/rai;
                sum_ai   = rai * work[i];
                sum_ai2  = sum_ai * sum_ai;
                sum_ai3  = sum_ai2 * sum_ai;
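
                /* This is the OBC (Onufriev-Bashford-Case) rescaling; with
                 * psi = (rho_i - doffset)*work[i] the code below evaluates
                 *   1/R_i = 1/(rho_i - doffset)
                 *           - tanh(alpha*psi - beta*psi^2 + gamma*psi^3)/rho_i
                 * and caches in drobc[i] the derivative factor of the tanh
                 * rescaling that the chain-rule kernel needs later.
                 */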
                tsum          = tanh(born->obc_alpha*sum_ai - born->obc_beta*sum_ai2 + born->obc_gamma*sum_ai3);
                born->bRad[i] = rai_inv - tsum*rai_inv2;
                born->bRad[i] = 1.0/born->bRad[i];

                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);

                tchain         = rai * (born->obc_alpha - 2*born->obc_beta*sum_ai + 3*born->obc_gamma*sum_ai2);
                born->drobc[i] = (1.0 - tsum*tsum)*tchain*rai_inv2;
            }
        }
    }

    return 0;
}

int
genborn_allvsall_calc_chainrule_sse2_double(t_forcerec *            fr,
                                            t_mdatoms *             mdatoms,
                                            gmx_genborn_t *         born,
                                            double *                x,
                                            double *                f,
                                            int                     gb_algorithm,
                                            void *                  paadata)
{
    gmx_allvsallgb2_data_t *aadata;
    int                     natoms;
    int                     ni0, ni1;
    int                     nj0, nj1, nj2, nj3;
    int                     i, j, idx;
    int                    *jindex;
    double                 *x_align;
    double                 *y_align;
    double                 *z_align;
    double                 *fx_align;
    double                 *fy_align;
    double                 *fz_align;
    double                 *rb;
    double                 *dadx;
    double                  fix, fiy, fiz;
    double                  rbai, rbaj, fgb, fgb_ai, rbi;

    __m128d                 jmask_SSE0, jmask_SSE1;
    __m128d                 ix_SSE0, iy_SSE0, iz_SSE0;
    __m128d                 ix_SSE1, iy_SSE1, iz_SSE1;
    __m128d                 fix_SSE0, fiy_SSE0, fiz_SSE0;
    __m128d                 fix_SSE1, fiy_SSE1, fiz_SSE1;
    __m128d                 rbai_SSE0, rbai_SSE1;
    __m128d                 imask_SSE0, imask_SSE1;
    __m128d                 jx_SSE, jy_SSE, jz_SSE, rbaj_SSE;
    __m128d                 dx_SSE0, dy_SSE0, dz_SSE0;
    __m128d                 dx_SSE1, dy_SSE1, dz_SSE1;
    __m128d                 fgb_SSE0, fgb_ai_SSE0;
    __m128d                 fgb_SSE1, fgb_ai_SSE1;
    __m128d                 tx_SSE0, ty_SSE0, tz_SSE0;
    __m128d                 tx_SSE1, ty_SSE1, tz_SSE1;
    __m128d                 t1, t2, tmpSSE;

    natoms = mdatoms->nr;
    ni0    = 0;
    ni1    = mdatoms->homenr;

    aadata = (gmx_allvsallgb2_data_t *)paadata;

    x_align  = aadata->x_align;
    y_align  = aadata->y_align;
    z_align  = aadata->z_align;
    fx_align = aadata->fx_align;
    fy_align = aadata->fy_align;
    fz_align = aadata->fz_align;

    jindex = aadata->jindex_gb;
    dadx   = fr->dadx;
    rb     = aadata->work;

    /* Loop to get the proper form for the Born radius term */
    if (gb_algorithm == egbSTILL)
    {
        for (i = 0; i < natoms; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = (2 * rbi * rbi * fr->dvda[i])/ONE_4PI_EPS0;
        }
    }
    else if (gb_algorithm == egbHCT)
    {
        for (i = 0; i < natoms; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * fr->dvda[i];
        }
    }
    else if (gb_algorithm == egbOBC)
    {
        for (idx = 0; idx < natoms; idx++)
        {
            rbi     = born->bRad[idx];
            rb[idx] = rbi * rbi * born->drobc[idx] * fr->dvda[idx];
        }
    }
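
    /* In all three branches rb[i] bundles the Born radius and dV/dA_i
     * (fr->dvda, plus the OBC chain term drobc where needed) into a single
     * per-atom prefactor, so each pair in the loop below reduces to
     *   fgb = rb[i]*dadx(j->i) + rb[j]*dadx(i->j).
     */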

    for (i = 0; i < 2*natoms; i++)
    {
        fx_align[i] = 0;
        fy_align[i] = 0;
        fz_align[i] = 0;
    }

    for (i = 0; i < natoms; i++)
    {
        rb[i+natoms] = rb[i];
    }

    for (i = ni0; i < ni1; i += UNROLLI)
    {
        /* We assume shifts are NOT used for all-vs-all interactions */

        /* Load i atom data */
        ix_SSE0 = _mm_load1_pd(x_align+i);
        iy_SSE0 = _mm_load1_pd(y_align+i);
        iz_SSE0 = _mm_load1_pd(z_align+i);
        ix_SSE1 = _mm_load1_pd(x_align+i+1);
        iy_SSE1 = _mm_load1_pd(y_align+i+1);
        iz_SSE1 = _mm_load1_pd(z_align+i+1);

        fix_SSE0 = _mm_setzero_pd();
        fiy_SSE0 = _mm_setzero_pd();
        fiz_SSE0 = _mm_setzero_pd();
        fix_SSE1 = _mm_setzero_pd();
        fiy_SSE1 = _mm_setzero_pd();
        fiz_SSE1 = _mm_setzero_pd();

        rbai_SSE0 = _mm_load1_pd(rb+i);
        rbai_SSE1 = _mm_load1_pd(rb+i+1);

        /* Load limits for loop over neighbors */
        nj0 = jindex[4*i];
        nj3 = jindex[4*i+3];

        /* No masks necessary, since the stored chain rule derivatives will be zero in those cases! */
        for (j = nj0; j < nj3; j += UNROLLJ)
        {
            /* load j atom coordinates */
            jx_SSE = _mm_load_pd(x_align+j);
            jy_SSE = _mm_load_pd(y_align+j);
            jz_SSE = _mm_load_pd(z_align+j);

            /* Calculate distance */
            dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE);
            dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE);
            dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE);
            dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE);
            dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE);
            dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE);

            rbaj_SSE = _mm_load_pd(rb+j);

            fgb_SSE0 = _mm_mul_pd(rbai_SSE0, _mm_load_pd(dadx));
            dadx    += 2;
            fgb_SSE1 = _mm_mul_pd(rbai_SSE1, _mm_load_pd(dadx));
            dadx    += 2;

            fgb_ai_SSE0 = _mm_mul_pd(rbaj_SSE, _mm_load_pd(dadx));
            dadx       += 2;
            fgb_ai_SSE1 = _mm_mul_pd(rbaj_SSE, _mm_load_pd(dadx));
            dadx       += 2;
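
            /* dadx is consumed in exactly the order the radii kernel stored
             * it: two doubles for the j->i derivative of each i atom (SSE0,
             * then SSE1), followed by two for each i->j derivative.
             */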

            /* Total force between ai and aj is the sum of ai->aj and aj->ai */
            fgb_SSE0 = _mm_add_pd(fgb_SSE0, fgb_ai_SSE0);
            fgb_SSE1 = _mm_add_pd(fgb_SSE1, fgb_ai_SSE1);

            /* Calculate temporary vectorial force */
            tx_SSE0 = _mm_mul_pd(fgb_SSE0, dx_SSE0);
            ty_SSE0 = _mm_mul_pd(fgb_SSE0, dy_SSE0);
            tz_SSE0 = _mm_mul_pd(fgb_SSE0, dz_SSE0);
            tx_SSE1 = _mm_mul_pd(fgb_SSE1, dx_SSE1);
            ty_SSE1 = _mm_mul_pd(fgb_SSE1, dy_SSE1);
            tz_SSE1 = _mm_mul_pd(fgb_SSE1, dz_SSE1);

            /* Increment i atom force */
            fix_SSE0 = _mm_add_pd(fix_SSE0, tx_SSE0);
            fiy_SSE0 = _mm_add_pd(fiy_SSE0, ty_SSE0);
            fiz_SSE0 = _mm_add_pd(fiz_SSE0, tz_SSE0);
            fix_SSE1 = _mm_add_pd(fix_SSE1, tx_SSE1);
            fiy_SSE1 = _mm_add_pd(fiy_SSE1, ty_SSE1);
            fiz_SSE1 = _mm_add_pd(fiz_SSE1, tz_SSE1);

            /* Decrement j atom force */
            _mm_store_pd(fx_align+j,
                         _mm_sub_pd( _mm_load_pd(fx_align+j), _mm_add_pd(tx_SSE0, tx_SSE1) ));
            _mm_store_pd(fy_align+j,
                         _mm_sub_pd( _mm_load_pd(fy_align+j), _mm_add_pd(ty_SSE0, ty_SSE1) ));
            _mm_store_pd(fz_align+j,
                         _mm_sub_pd( _mm_load_pd(fz_align+j), _mm_add_pd(tz_SSE0, tz_SSE1) ));
        }

        /* Add i forces to mem */
        GMX_MM_TRANSPOSE2_PD(fix_SSE0, fix_SSE1);
        fix_SSE0 = _mm_add_pd(fix_SSE0, fix_SSE1);
        _mm_store_pd(fx_align+i, _mm_add_pd(fix_SSE0, _mm_load_pd(fx_align+i)));

        GMX_MM_TRANSPOSE2_PD(fiy_SSE0, fiy_SSE1);
        fiy_SSE0 = _mm_add_pd(fiy_SSE0, fiy_SSE1);
        _mm_store_pd(fy_align+i, _mm_add_pd(fiy_SSE0, _mm_load_pd(fy_align+i)));

        GMX_MM_TRANSPOSE2_PD(fiz_SSE0, fiz_SSE1);
        fiz_SSE0 = _mm_add_pd(fiz_SSE0, fiz_SSE1);
        _mm_store_pd(fz_align+i, _mm_add_pd(fiz_SSE0, _mm_load_pd(fz_align+i)));
    }
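
    /* Fold the two force images of the duplicated atom layout (index i and
     * natoms+i) back into the real force array.
     */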
    for (i = 0; i < natoms; i++)
    {
        f[3*i]   += fx_align[i] + fx_align[natoms+i];
        f[3*i+1] += fy_align[i] + fy_align[natoms+i];
        f[3*i+2] += fz_align[i] + fz_align[natoms+i];
    }

    return 0;
}

#else

/* dummy variable when not using SSE */
int genborn_allvsall_sse2_double_dummy;

#endif