/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2009, The GROMACS Development Team.
 * Copyright (c) 2012,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

#include "gromacs/legacyheaders/types/simple.h"

#include "gromacs/math/vec.h"
#include "gromacs/utility/smalloc.h"

#include "gromacs/legacyheaders/network.h"
#include "gromacs/math/units.h"
#include "gromacs/legacyheaders/genborn.h"
#include "genborn_allvsall.h"

#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)

#include <gmx_sse2_double.h>

#define SIMD_WIDTH 2
#define UNROLLI    4
#define UNROLLJ    2

typedef struct
{
    int             *jindex_gb;
    int            **prologue_mask_gb;
    int            **epilogue_mask;
    int             *imask;
    double          *gb_radius;
    double          *workparam;
    double          *work;
    double          *x_align;
    double          *y_align;
    double          *z_align;
    double          *fx_align;
    double          *fy_align;
    double          *fz_align;
}
gmx_allvsallgb2_data_t;
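
/* The all-vs-all kernels use a half-shell scheme: each atom i interacts with
 * (roughly) the natoms/2 atoms that follow it, wrapping around past the end
 * of the list, so every pair is computed exactly once. calc_maxoffset()
 * returns how far ahead of i that window extends; for even atom counts the
 * parity rules below keep the split symmetric between the two halves.
 */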
static int
calc_maxoffset(int i, int natoms)
{
    int maxoffset;

    if ((natoms % 2) == 1)
    {
        /* Odd number of atoms, easy */
        maxoffset = natoms/2;
    }
    else if ((natoms % 4) == 0)
    {
        /* Multiple of four is hard */
        if (i < natoms/2)
        {
            maxoffset = ((i % 2) == 0) ? natoms/2 : natoms/2-1;
        }
        else
        {
            maxoffset = ((i % 2) == 1) ? natoms/2 : natoms/2-1;
        }
    }
    else
    {
        /* natoms/2 is odd */
        maxoffset = ((i % 2) == 0) ? natoms/2 : natoms/2-1;
    }

    return maxoffset;
}

static void
setup_gb_exclusions_and_indices(gmx_allvsallgb2_data_t     *aadata,
                                t_ilist                    *ilist,
                                int                         start,
                                int                         end,
                                int                         natoms,
                                gmx_bool                    bInclude12,
                                gmx_bool                    bInclude13,
                                gmx_bool                    bInclude14)
{
    int  ni0, ni1, nj0, nj1, nj;
    int  imin, imax, iexcl;
    int  firstinteraction;
    int  i, j, k, ibase;
    int  a1, a2;
    int  max_offset, max_excl_offset;
    int *pi;

    /* This routine can appear to be a bit complex, but it is mostly book-keeping.
     * To enable the fast all-vs-all kernel we need to be able to stream through all coordinates
     * whether they should interact or not.
     *
     * To avoid looping over the exclusions, we create a simple mask that is 1 if the interaction
     * should be present, otherwise 0. Since exclusions typically only occur when i & j are close,
     * we create a jindex array with three elements per i atom: the starting point, the point to
     * which we need to check exclusions, and the end point.
     * This way we only have to allocate a short exclusion mask per i atom.
     */
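
    /* Layout sketch: jindex_gb stores four entries per i atom, giving the
     * three j-loop ranges used by the kernels below:
     *
     *   [jindex[4*i],   jindex[4*i+1])  prologue, masked by prologue_mask_gb[i]
     *   [jindex[4*i+1], jindex[4*i+2])  main part, no per-pair masking
     *   [jindex[4*i+2], jindex[4*i+3])  epilogue, masked by epilogue_mask[i]
     *
     * (Illustrative summary of the book-keeping this routine sets up.)
     */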
    ni0 = (start/UNROLLI)*UNROLLI;
    ni1 = ((end+UNROLLI-1)/UNROLLI)*UNROLLI;

    /* Set the interaction mask to only enable the i atoms we want to include */
    snew(pi, 2*(natoms+UNROLLI+2*SIMD_WIDTH));
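    /* Round the freshly allocated pointer up to the next 16-byte boundary so
     * the SSE2 aligned loads/stores can be used; the allocation is padded so
     * the shifted pointer stays in bounds. The same idiom recurs for every
     * aligned array in this file.
     */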
    aadata->imask = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
    for (i = 0; i < natoms+UNROLLI; i++)
    {
        aadata->imask[2*i]   = (i >= start && i < end) ? 0xFFFFFFFF : 0;
        aadata->imask[2*i+1] = (i >= start && i < end) ? 0xFFFFFFFF : 0;
    }
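    /* Each mask entry above is duplicated (two 32-bit words per atom) so that
     * _mm_load1_pd() later broadcasts one 64-bit all-ones or all-zeros bit
     * pattern per double-precision SIMD lane.
     */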

    /* Allocate memory for our modified jindex array */
    snew(aadata->jindex_gb, 4*(natoms+UNROLLI));
    for (i = 0; i < 4*(natoms+UNROLLI); i++)
    {
        aadata->jindex_gb[i] = 0;
    }

    /* Create the exclusion masks for the prologue part */
    snew(aadata->prologue_mask_gb, natoms+UNROLLI); /* list of pointers */

    /* First zero everything to avoid uninitialized data */
    for (i = 0; i < natoms+UNROLLI; i++)
    {
        aadata->prologue_mask_gb[i] = NULL;
    }

    /* Calculate the largest exclusion range we need for each UNROLLI-tuplet of i atoms. */
    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
    {
        max_excl_offset = -1;

        /* First find maxoffset for the next 4 atoms (or fewer if we are close to end) */
        imax = ((ibase+UNROLLI) < end) ? (ibase+UNROLLI) : end;

        /* Which atom is the first we (might) interact with? */
        imin = natoms; /* Guaranteed to be overwritten by one of 'firstinteraction' */
        for (i = ibase; i < imax; i++)
        {
            /* Before exclusions, which atom is the first we (might) interact with? */
            firstinteraction = i+1;
            max_offset       = calc_maxoffset(i, natoms);

            for (j = 0; j < ilist[F_GB12].nr; j += 3)
            {
                a1 = ilist[F_GB12].iatoms[j+1];
                a2 = ilist[F_GB12].iatoms[j+2];

                if (k == firstinteraction)

            for (j = 0; j < ilist[F_GB13].nr; j += 3)
            {
                a1 = ilist[F_GB13].iatoms[j+1];
                a2 = ilist[F_GB13].iatoms[j+2];

                if (k == firstinteraction)

            for (j = 0; j < ilist[F_GB14].nr; j += 3)
            {
                a1 = ilist[F_GB14].iatoms[j+1];
                a2 = ilist[F_GB14].iatoms[j+2];

                if (k == firstinteraction)

            imin = (firstinteraction < imin) ? firstinteraction : imin;
        }

        /* round down to j unrolling factor */
        imin = (imin/UNROLLJ)*UNROLLJ;

        for (i = ibase; i < imax; i++)
        {
            max_offset = calc_maxoffset(i, natoms);

            for (j = 0; j < ilist[F_GB12].nr; j += 3)
            {
                a1 = ilist[F_GB12].iatoms[j+1];
                a2 = ilist[F_GB12].iatoms[j+2];

                if (k > i+max_offset)

                if (k+natoms <= max_offset)

                max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;

            for (j = 0; j < ilist[F_GB13].nr; j += 3)
            {
                a1 = ilist[F_GB13].iatoms[j+1];
                a2 = ilist[F_GB13].iatoms[j+2];

                if (k > i+max_offset)

                if (k+natoms <= max_offset)

                max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;

            for (j = 0; j < ilist[F_GB14].nr; j += 3)
            {
                a1 = ilist[F_GB14].iatoms[j+1];
                a2 = ilist[F_GB14].iatoms[j+2];

                if (k > i+max_offset)

                if (k+natoms <= max_offset)

                max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
        }

        /* The offset specifies the last atom to be excluded, so add one unit to get an upper loop limit */
        max_excl_offset++;

        /* round up to j unrolling factor */
        max_excl_offset = (max_excl_offset/UNROLLJ+1)*UNROLLJ;

        /* Set all the prologue masks length to this value (even for i>end) */
        for (i = ibase; i < ibase+UNROLLI; i++)
        {
            aadata->jindex_gb[4*i]   = imin;
            aadata->jindex_gb[4*i+1] = imin+max_excl_offset;
        }
    }

    /* Now the hard part, loop over it all again to calculate the actual contents of the prologue masks */
    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
    {
        for (i = ibase; i < ibase+UNROLLI; i++)
        {
            nj   = aadata->jindex_gb[4*i+1] - aadata->jindex_gb[4*i];
            imin = aadata->jindex_gb[4*i];

            /* Allocate aligned memory */
            snew(pi, 2*(nj+2*SIMD_WIDTH));
            aadata->prologue_mask_gb[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));

            max_offset = calc_maxoffset(i, natoms);

            /* Include interactions i+1 <= j < i+maxoffset */
            for (k = 0; k < nj; k++)
            {
                j = k + imin;

                if ( (j > i) && (j <= i+max_offset) )
                {
                    aadata->prologue_mask_gb[i][2*k]   = 0xFFFFFFFF;
                    aadata->prologue_mask_gb[i][2*k+1] = 0xFFFFFFFF;
                }
                else
                {
                    aadata->prologue_mask_gb[i][2*k]   = 0;
                    aadata->prologue_mask_gb[i][2*k+1] = 0;
                }
            }

            /* Clear out the explicit exclusions */

            for (j = 0; j < ilist[F_GB12].nr; j += 3)
            {
                a1 = ilist[F_GB12].iatoms[j+1];
                a2 = ilist[F_GB12].iatoms[j+2];

                if (k > i+max_offset)

                if (k+natoms <= max_offset)

                    aadata->prologue_mask_gb[i][2*k]   = 0;
                    aadata->prologue_mask_gb[i][2*k+1] = 0;

            for (j = 0; j < ilist[F_GB13].nr; j += 3)
            {
                a1 = ilist[F_GB13].iatoms[j+1];
                a2 = ilist[F_GB13].iatoms[j+2];

                if (k > i+max_offset)

                if (k+natoms <= max_offset)

                    aadata->prologue_mask_gb[i][2*k]   = 0;
                    aadata->prologue_mask_gb[i][2*k+1] = 0;

            for (j = 0; j < ilist[F_GB14].nr; j += 3)
            {
                a1 = ilist[F_GB14].iatoms[j+1];
                a2 = ilist[F_GB14].iatoms[j+2];

                if (k > i+max_offset)

                if (k+natoms <= max_offset)

                    aadata->prologue_mask_gb[i][2*k]   = 0;
                    aadata->prologue_mask_gb[i][2*k+1] = 0;
        }
    }

    /* Construct the epilogue mask - this just contains the check for maxoffset */
    snew(aadata->epilogue_mask, natoms+UNROLLI);

    /* First zero everything to avoid uninitialized data */
    for (i = 0; i < natoms+UNROLLI; i++)
    {
        aadata->jindex_gb[4*i+2] = aadata->jindex_gb[4*i+1];
        aadata->jindex_gb[4*i+3] = aadata->jindex_gb[4*i+1];
        aadata->epilogue_mask[i] = NULL;
    }

    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
    {
        /* Find the lowest index for which we need to use the epilogue */
        imin       = ibase;
        max_offset = calc_maxoffset(imin, natoms);

        imin = imin + 1 + max_offset;

        /* Find largest index for which we need to use the epilogue */
        imax = ibase + UNROLLI-1;
        imax = (imax < end) ? imax : end;

        max_offset = calc_maxoffset(imax, natoms);
        imax       = imax + 1 + max_offset + UNROLLJ - 1;

        for (i = ibase; i < ibase+UNROLLI; i++)
        {
            /* Start of epilogue - round down to j tile limit */
            aadata->jindex_gb[4*i+2] = (imin/UNROLLJ)*UNROLLJ;
            /* Make sure we don't overlap - for small systems everything is done in the prologue */
            aadata->jindex_gb[4*i+2] = (aadata->jindex_gb[4*i+1] > aadata->jindex_gb[4*i+2]) ? aadata->jindex_gb[4*i+1] : aadata->jindex_gb[4*i+2];
            /* Round upwards to j tile limit */
            aadata->jindex_gb[4*i+3] = (imax/UNROLLJ)*UNROLLJ;
            /* Make sure we don't have a negative range for the epilogue */
            aadata->jindex_gb[4*i+3] = (aadata->jindex_gb[4*i+2] > aadata->jindex_gb[4*i+3]) ? aadata->jindex_gb[4*i+2] : aadata->jindex_gb[4*i+3];
        }
    }

    /* And fill it with data... */

    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
    {
        for (i = ibase; i < ibase+UNROLLI; i++)
        {
            nj = aadata->jindex_gb[4*i+3] - aadata->jindex_gb[4*i+2];

            /* Allocate aligned memory */
            snew(pi, 2*(nj+2*SIMD_WIDTH));
            aadata->epilogue_mask[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));

            max_offset = calc_maxoffset(i, natoms);

            for (k = 0; k < nj; k++)
            {
                j = aadata->jindex_gb[4*i+2] + k;
                aadata->epilogue_mask[i][2*k]   = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
                aadata->epilogue_mask[i][2*k+1] = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
            }
        }
    }
}

static void
genborn_allvsall_setup(gmx_allvsallgb2_data_t     **p_aadata,
                       gmx_localtop_t              *top,
                       gmx_genborn_t               *born,
                       t_mdatoms                   *mdatoms,
                       double                       radius_offset,
                       int                          gb_algorithm,
                       gmx_bool                     bInclude12,
                       gmx_bool                     bInclude13,
                       gmx_bool                     bInclude14)
{
    gmx_allvsallgb2_data_t *aadata;
    double                 *p;
    int                     i, natoms;

    natoms = mdatoms->nr;

    snew(aadata, 1);
    *p_aadata = aadata;

    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->x_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->y_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->z_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->fx_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->fy_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->fz_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->gb_radius = (double *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->workparam = (double *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->work = (double *) (((size_t) p + 16) & (~((size_t) 15)));
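    /* The arrays above are sized for 2*natoms entries (plus SIMD padding):
     * the second half holds a wrapped-around copy of the first, so the j
     * loops in the kernels can stream past atom natoms-1 without a modulo
     * in the inner loop.
     */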

    for (i = 0; i < mdatoms->nr; i++)
    {
        aadata->gb_radius[i] = top->atomtypes.gb_radius[mdatoms->typeA[i]] - radius_offset;
        if (gb_algorithm == egbSTILL)
        {
            aadata->workparam[i] = born->vsolv[i];
        }
        else if (gb_algorithm == egbOBC)
        {
            aadata->workparam[i] = born->param[i];
        }
        aadata->work[i] = 0.0;
    }

    for (i = 0; i < mdatoms->nr; i++)
    {
        aadata->gb_radius[natoms+i] = aadata->gb_radius[i];
        aadata->workparam[natoms+i] = aadata->workparam[i];
        aadata->work[natoms+i]      = aadata->work[i];
    }

    for (i = 0; i < 2*natoms+SIMD_WIDTH; i++)
    {
        aadata->x_align[i]  = 0.0;
        aadata->y_align[i]  = 0.0;
        aadata->z_align[i]  = 0.0;
        aadata->fx_align[i] = 0.0;
        aadata->fy_align[i] = 0.0;
        aadata->fz_align[i] = 0.0;
    }

    setup_gb_exclusions_and_indices(aadata, top->idef.il, 0, mdatoms->homenr, mdatoms->nr,
                                    bInclude12, bInclude13, bInclude14);
}

/*
 * This routine apparently hits a compiler bug Visual Studio has had 'forever'.
 * It is present both in VS2005 and VS2008, and the only way around it is to
 * decrease optimization. We do that with a pragma, and only for MSVC, so it
 * will not hurt any of the well-behaving and supported compilers out there.
 * MS: Fix your compiler, it sucks like a black hole!
 */
#ifdef _MSC_VER
#pragma optimize("t", off)
#endif

int
genborn_allvsall_calc_still_radii_sse2_double(t_forcerec        *fr,
                                              t_mdatoms         *mdatoms,
                                              gmx_genborn_t     *born,
                                              gmx_localtop_t    *top,
                                              double            *x,
                                              t_commrec         *cr,
                                              void              *paadata)
{
    gmx_allvsallgb2_data_t *aadata;
    double                 *x_align;
    double                 *y_align;
    double                 *z_align;
    int                    *jindex;
    double                 *dadx;
    double                 *gb_radius;
    double                 *vsolv;
    double                 *work;
    int                     i, j, k;
    int                     natoms;
    int                     ni0, ni1;
    int                     nj0, nj1, nj2, nj3;
    int                    *pmask0, *pmask1;
    int                    *emask0, *emask1;
    double                  gpi, rai, vai;
    double                  irsq, idr4, idr6;
    double                  raj, rvdw, ratio;
    double                  vaj, ccf, dccf, theta, cosq;
    double                  term, prod, icf4, icf6, gpi2, factor, sinq;

    __m128d                 ix_SSE0, iy_SSE0, iz_SSE0;
    __m128d                 ix_SSE1, iy_SSE1, iz_SSE1;
    __m128d                 gpi_SSE0, rai_SSE0, prod_ai_SSE0;
    __m128d                 gpi_SSE1, rai_SSE1, prod_ai_SSE1;
    __m128d                 imask_SSE0, jmask_SSE0;
    __m128d                 imask_SSE1, jmask_SSE1;
    __m128d                 jx_SSE, jy_SSE, jz_SSE;
    __m128d                 dx_SSE0, dy_SSE0, dz_SSE0;
    __m128d                 dx_SSE1, dy_SSE1, dz_SSE1;
    __m128d                 rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0;
    __m128d                 rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1;
    __m128d                 raj_SSE, vaj_SSE, prod_SSE;
    __m128d                 rvdw_SSE0, ratio_SSE0;
    __m128d                 rvdw_SSE1, ratio_SSE1;
    __m128d                 theta_SSE0, sinq_SSE0, cosq_SSE0, term_SSE0;
    __m128d                 theta_SSE1, sinq_SSE1, cosq_SSE1, term_SSE1;
    __m128d                 ccf_SSE0, dccf_SSE0;
    __m128d                 ccf_SSE1, dccf_SSE1;
    __m128d                 icf4_SSE0, icf6_SSE0;
    __m128d                 icf4_SSE1, icf6_SSE1;
    __m128d                 half_SSE, one_SSE, two_SSE, four_SSE;
    __m128d                 still_p4_SSE, still_p5inv_SSE, still_pip5_SSE;

    natoms = mdatoms->nr;
    ni0    = 0;
    ni1    = mdatoms->homenr;

    aadata = *((gmx_allvsallgb2_data_t **)paadata);

    if (aadata == NULL)
    {
        genborn_allvsall_setup(&aadata, top, born, mdatoms, 0.0,
                               egbSTILL, FALSE, FALSE, TRUE);
        *((gmx_allvsallgb2_data_t **)paadata) = aadata;
    }

    x_align = aadata->x_align;
    y_align = aadata->y_align;
    z_align = aadata->z_align;

    gb_radius = aadata->gb_radius;
    vsolv     = aadata->workparam;
    work      = aadata->work;
    jindex    = aadata->jindex_gb;
    dadx      = fr->dadx;

    still_p4_SSE    = _mm_set1_pd(STILL_P4);
    still_p5inv_SSE = _mm_set1_pd(STILL_P5INV);
    still_pip5_SSE  = _mm_set1_pd(STILL_PIP5);
    half_SSE        = _mm_set1_pd(0.5);
    one_SSE         = _mm_set1_pd(1.0);
    two_SSE         = _mm_set1_pd(2.0);
    four_SSE        = _mm_set1_pd(4.0);

    /* This will be summed, so it has to extend to natoms + buffer */
    for (i = 0; i < natoms+1+natoms/2; i++)
    {
        work[i] = 0;
    }

    for (i = ni0; i < ni1+1+natoms/2; i++)
    {
        k          = i % natoms;
        x_align[i] = x[3*k];
        y_align[i] = x[3*k+1];
        z_align[i] = x[3*k+2];
    }
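    /* Because of the modulo above (k = i % natoms), coordinates are
     * duplicated past the end of the array, so the j tiles below can read up
     * to natoms/2 atoms beyond the last one without any wraparound logic in
     * the inner kernel.
     */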

    for (i = ni0; i < ni1; i += UNROLLI)
    {
        /* We assume shifts are NOT used for all-vs-all interactions */

        /* Load i atom data */
        ix_SSE0 = _mm_load1_pd(x_align+i);
        iy_SSE0 = _mm_load1_pd(y_align+i);
        iz_SSE0 = _mm_load1_pd(z_align+i);
        ix_SSE1 = _mm_load1_pd(x_align+i+1);
        iy_SSE1 = _mm_load1_pd(y_align+i+1);
        iz_SSE1 = _mm_load1_pd(z_align+i+1);

        gpi_SSE0 = _mm_setzero_pd();
        gpi_SSE1 = _mm_setzero_pd();

        rai_SSE0 = _mm_load1_pd(gb_radius+i);
        rai_SSE1 = _mm_load1_pd(gb_radius+i+1);

        prod_ai_SSE0 = _mm_set1_pd(STILL_P4*vsolv[i]);
        prod_ai_SSE1 = _mm_set1_pd(STILL_P4*vsolv[i+1]);

        /* Load limits for loop over neighbors */
        nj0 = jindex[4*i];
        nj1 = jindex[4*i+1];
        nj2 = jindex[4*i+2];
        nj3 = jindex[4*i+3];

        pmask0 = aadata->prologue_mask_gb[i];
        pmask1 = aadata->prologue_mask_gb[i+1];
        emask0 = aadata->epilogue_mask[i];
        emask1 = aadata->epilogue_mask[i+1];

        imask_SSE0 = _mm_load1_pd((double *)(aadata->imask+2*i));
        imask_SSE1 = _mm_load1_pd((double *)(aadata->imask+2*i+2));

        /* Prologue part, including exclusion mask */
        for (j = nj0; j < nj1; j += UNROLLJ)
        {
            jmask_SSE0 = _mm_load_pd((double *)pmask0);
            jmask_SSE1 = _mm_load_pd((double *)pmask1);
            pmask0    += 2*UNROLLJ;
            pmask1    += 2*UNROLLJ;

            /* load j atom coordinates */
            jx_SSE = _mm_load_pd(x_align+j);
            jy_SSE = _mm_load_pd(y_align+j);
            jz_SSE = _mm_load_pd(z_align+j);

            /* Calculate distance */
            dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE);
            dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE);
            dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE);
            dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE);
            dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE);
            dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);

            /* Combine masks */
            jmask_SSE0 = _mm_and_pd(jmask_SSE0, imask_SSE0);
            jmask_SSE1 = _mm_and_pd(jmask_SSE1, imask_SSE1);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
            rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);

            /* Apply mask */
            rinv_SSE0 = _mm_and_pd(rinv_SSE0, jmask_SSE0);
            rinv_SSE1 = _mm_and_pd(rinv_SSE1, jmask_SSE1);

            irsq_SSE0 = _mm_mul_pd(rinv_SSE0, rinv_SSE0);
            irsq_SSE1 = _mm_mul_pd(rinv_SSE1, rinv_SSE1);
            idr4_SSE0 = _mm_mul_pd(irsq_SSE0, irsq_SSE0);
            idr4_SSE1 = _mm_mul_pd(irsq_SSE1, irsq_SSE1);
            idr6_SSE0 = _mm_mul_pd(idr4_SSE0, irsq_SSE0);
            idr6_SSE1 = _mm_mul_pd(idr4_SSE1, irsq_SSE1);

            raj_SSE = _mm_load_pd(gb_radius+j);
            vaj_SSE = _mm_load_pd(vsolv+j);

            rvdw_SSE0 = _mm_add_pd(rai_SSE0, raj_SSE);
            rvdw_SSE1 = _mm_add_pd(rai_SSE1, raj_SSE);

            ratio_SSE0 = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0, rvdw_SSE0)));
            ratio_SSE1 = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1, rvdw_SSE1)));

            ratio_SSE0 = _mm_min_pd(ratio_SSE0, still_p5inv_SSE);
            ratio_SSE1 = _mm_min_pd(ratio_SSE1, still_p5inv_SSE);
            theta_SSE0 = _mm_mul_pd(ratio_SSE0, still_pip5_SSE);
            theta_SSE1 = _mm_mul_pd(ratio_SSE1, still_pip5_SSE);
            gmx_mm_sincos_pd(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
            gmx_mm_sincos_pd(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
            term_SSE0  = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE0));
            term_SSE1  = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE1));
            ccf_SSE0   = _mm_mul_pd(term_SSE0, term_SSE0);
            ccf_SSE1   = _mm_mul_pd(term_SSE1, term_SSE1);
            dccf_SSE0  = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE0),
                                    _mm_mul_pd(sinq_SSE0, theta_SSE0));
            dccf_SSE1  = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE1),
                                    _mm_mul_pd(sinq_SSE1, theta_SSE1));
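            /* Still "close contact function", as in the scalar Still kernel:
             * ratio = r^2/(rai+raj)^2 is clamped at 1/P5, theta = pi*P5*ratio,
             * ccf = ((1-cos(theta))/2)^2, and dccf collects the corresponding
             * derivative factor used in the chain-rule terms below.
             */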

            prod_SSE  = _mm_mul_pd(still_p4_SSE, vaj_SSE);
            icf4_SSE0 = _mm_mul_pd(ccf_SSE0, idr4_SSE0);
            icf4_SSE1 = _mm_mul_pd(ccf_SSE1, idr4_SSE1);
            icf6_SSE0 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
            icf6_SSE1 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);

            _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
                                            _mm_add_pd(_mm_mul_pd(prod_ai_SSE0, icf4_SSE0),
                                                       _mm_mul_pd(prod_ai_SSE1, icf4_SSE1))));

            gpi_SSE0 = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE, icf4_SSE0));
            gpi_SSE1 = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE, icf4_SSE1));

            /* Save ai->aj and aj->ai chain rule terms */
            _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE1));
            dadx += 2;

            _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE0, icf6_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE1, icf6_SSE1));
            dadx += 2;
        }

        /* Main part, no exclusions */
        for (j = nj1; j < nj2; j += UNROLLJ)
        {
            /* load j atom coordinates */
            jx_SSE = _mm_load_pd(x_align+j);
            jy_SSE = _mm_load_pd(y_align+j);
            jz_SSE = _mm_load_pd(z_align+j);

            /* Calculate distance */
            dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE);
            dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE);
            dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE);
            dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE);
            dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE);
            dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
            rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);

            /* Apply mask */
            rinv_SSE0 = _mm_and_pd(rinv_SSE0, imask_SSE0);
            rinv_SSE1 = _mm_and_pd(rinv_SSE1, imask_SSE1);

            irsq_SSE0 = _mm_mul_pd(rinv_SSE0, rinv_SSE0);
            irsq_SSE1 = _mm_mul_pd(rinv_SSE1, rinv_SSE1);
            idr4_SSE0 = _mm_mul_pd(irsq_SSE0, irsq_SSE0);
            idr4_SSE1 = _mm_mul_pd(irsq_SSE1, irsq_SSE1);
            idr6_SSE0 = _mm_mul_pd(idr4_SSE0, irsq_SSE0);
            idr6_SSE1 = _mm_mul_pd(idr4_SSE1, irsq_SSE1);

            raj_SSE = _mm_load_pd(gb_radius+j);

            rvdw_SSE0 = _mm_add_pd(rai_SSE0, raj_SSE);
            rvdw_SSE1 = _mm_add_pd(rai_SSE1, raj_SSE);
            vaj_SSE   = _mm_load_pd(vsolv+j);

            ratio_SSE0 = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0, rvdw_SSE0)));
            ratio_SSE1 = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1, rvdw_SSE1)));

            ratio_SSE0 = _mm_min_pd(ratio_SSE0, still_p5inv_SSE);
            ratio_SSE1 = _mm_min_pd(ratio_SSE1, still_p5inv_SSE);
            theta_SSE0 = _mm_mul_pd(ratio_SSE0, still_pip5_SSE);
            theta_SSE1 = _mm_mul_pd(ratio_SSE1, still_pip5_SSE);
            gmx_mm_sincos_pd(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
            gmx_mm_sincos_pd(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
            term_SSE0  = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE0));
            term_SSE1  = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE1));
            ccf_SSE0   = _mm_mul_pd(term_SSE0, term_SSE0);
            ccf_SSE1   = _mm_mul_pd(term_SSE1, term_SSE1);
            dccf_SSE0  = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE0),
                                    _mm_mul_pd(sinq_SSE0, theta_SSE0));
            dccf_SSE1  = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE1),
                                    _mm_mul_pd(sinq_SSE1, theta_SSE1));

            prod_SSE  = _mm_mul_pd(still_p4_SSE, vaj_SSE);
            icf4_SSE0 = _mm_mul_pd(ccf_SSE0, idr4_SSE0);
            icf4_SSE1 = _mm_mul_pd(ccf_SSE1, idr4_SSE1);
            icf6_SSE0 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
            icf6_SSE1 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);

            _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
                                            _mm_add_pd(_mm_mul_pd(prod_ai_SSE0, icf4_SSE0),
                                                       _mm_mul_pd(prod_ai_SSE1, icf4_SSE1))));

            gpi_SSE0 = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE, icf4_SSE0));
            gpi_SSE1 = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE, icf4_SSE1));

            /* Save ai->aj and aj->ai chain rule terms */
            _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE1));
            dadx += 2;

            _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE0, icf6_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE1, icf6_SSE1));
            dadx += 2;
        }

        /* Epilogue part, including exclusion mask */
        for (j = nj2; j < nj3; j += UNROLLJ)
        {
            jmask_SSE0 = _mm_load_pd((double *)emask0);
            jmask_SSE1 = _mm_load_pd((double *)emask1);
            emask0    += 2*UNROLLJ;
            emask1    += 2*UNROLLJ;

            /* load j atom coordinates */
            jx_SSE = _mm_load_pd(x_align+j);
            jy_SSE = _mm_load_pd(y_align+j);
            jz_SSE = _mm_load_pd(z_align+j);

            /* Calculate distance */
            dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE);
            dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE);
            dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE);
            dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE);
            dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE);
            dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);

            /* Combine masks */
            jmask_SSE0 = _mm_and_pd(jmask_SSE0, imask_SSE0);
            jmask_SSE1 = _mm_and_pd(jmask_SSE1, imask_SSE1);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
            rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);

            /* Apply mask */
            rinv_SSE0 = _mm_and_pd(rinv_SSE0, jmask_SSE0);
            rinv_SSE1 = _mm_and_pd(rinv_SSE1, jmask_SSE1);

            irsq_SSE0 = _mm_mul_pd(rinv_SSE0, rinv_SSE0);
            irsq_SSE1 = _mm_mul_pd(rinv_SSE1, rinv_SSE1);
            idr4_SSE0 = _mm_mul_pd(irsq_SSE0, irsq_SSE0);
            idr4_SSE1 = _mm_mul_pd(irsq_SSE1, irsq_SSE1);
            idr6_SSE0 = _mm_mul_pd(idr4_SSE0, irsq_SSE0);
            idr6_SSE1 = _mm_mul_pd(idr4_SSE1, irsq_SSE1);

            raj_SSE = _mm_load_pd(gb_radius+j);
            vaj_SSE = _mm_load_pd(vsolv+j);

            rvdw_SSE0 = _mm_add_pd(rai_SSE0, raj_SSE);
            rvdw_SSE1 = _mm_add_pd(rai_SSE1, raj_SSE);

            ratio_SSE0 = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0, rvdw_SSE0)));
            ratio_SSE1 = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1, rvdw_SSE1)));

            ratio_SSE0 = _mm_min_pd(ratio_SSE0, still_p5inv_SSE);
            ratio_SSE1 = _mm_min_pd(ratio_SSE1, still_p5inv_SSE);
            theta_SSE0 = _mm_mul_pd(ratio_SSE0, still_pip5_SSE);
            theta_SSE1 = _mm_mul_pd(ratio_SSE1, still_pip5_SSE);
            gmx_mm_sincos_pd(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
            gmx_mm_sincos_pd(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
            term_SSE0  = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE0));
            term_SSE1  = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE1));
            ccf_SSE0   = _mm_mul_pd(term_SSE0, term_SSE0);
            ccf_SSE1   = _mm_mul_pd(term_SSE1, term_SSE1);
            dccf_SSE0  = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE0),
                                    _mm_mul_pd(sinq_SSE0, theta_SSE0));
            dccf_SSE1  = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE1),
                                    _mm_mul_pd(sinq_SSE1, theta_SSE1));

            prod_SSE  = _mm_mul_pd(still_p4_SSE, vaj_SSE);
            icf4_SSE0 = _mm_mul_pd(ccf_SSE0, idr4_SSE0);
            icf4_SSE1 = _mm_mul_pd(ccf_SSE1, idr4_SSE1);
            icf6_SSE0 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
            icf6_SSE1 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);

            _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
                                            _mm_add_pd(_mm_mul_pd(prod_ai_SSE0, icf4_SSE0),
                                                       _mm_mul_pd(prod_ai_SSE1, icf4_SSE1))));

            gpi_SSE0 = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE, icf4_SSE0));
            gpi_SSE1 = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE, icf4_SSE1));

            /* Save ai->aj and aj->ai chain rule terms */
            _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE1));
            dadx += 2;

            _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE0, icf6_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE1, icf6_SSE1));
            dadx += 2;
        }

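        /* Horizontal sum: transposing the two accumulators leaves each lane
         * holding one i atom's partial terms; adding them and accumulating
         * into work[i] and work[i+1] completes the Born sums for this pair
         * of i atoms.
         */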
        GMX_MM_TRANSPOSE2_PD(gpi_SSE0, gpi_SSE1);
        gpi_SSE0 = _mm_add_pd(gpi_SSE0, gpi_SSE1);
        _mm_store_pd(work+i, _mm_add_pd(gpi_SSE0, _mm_load_pd(work+i)));
    }

    /* In case we have written anything beyond natoms, move it back.
     * Never mind that we leave stuff above natoms; that will not
     * be accessed later in the routine.
     * In principle this should be a move rather than a sum, but this
     * way we don't have to worry about even/odd offsets...
     */
    for (i = natoms; i < ni1+1+natoms/2; i++)
    {
        work[i-natoms] += work[i];
    }

    /* Parallel summations would go here if ever implemented with DD */

    factor = 0.5 * ONE_4PI_EPS0;
    /* Calculate the radii - should we do all atoms, or just our local ones? */
    for (i = 0; i < natoms; i++)
    {
        if (born->use[i] != 0)
        {
            gpi             = born->gpol[i]+work[i];
            gpi2            = gpi * gpi;
            born->bRad[i]   = factor*gmx_invsqrt(gpi2);
            fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
        }
    }
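    /* With factor = ONE_4PI_EPS0/2, the loop above evaluates
     * bRad_i = factor/|gpi|, i.e. the effective Born radius from the
     * accumulated Still polarization sum; 1/sqrt(bRad) is cached in
     * fr->invsqrta for the GB force kernels.
     */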

    return 0;
}

/* Reinstate MSVC optimization */
#ifdef _MSC_VER
#pragma optimize("", on)
#endif

int
genborn_allvsall_calc_hct_obc_radii_sse2_double(t_forcerec        *fr,
                                                t_mdatoms         *mdatoms,
                                                gmx_genborn_t     *born,
                                                int                gb_algorithm,
                                                gmx_localtop_t    *top,
                                                double            *x,
                                                t_commrec         *cr,
                                                void              *paadata)
{
    gmx_allvsallgb2_data_t *aadata;
    double                 *x_align;
    double                 *y_align;
    double                 *z_align;
    int                    *jindex;
    double                 *dadx;
    double                 *obc_param;
    double                 *gb_radius;
    double                 *work;
    int                     i, j, k;
    int                     natoms;
    int                     ni0, ni1;
    int                     nj0, nj1, nj2, nj3;
    int                    *pmask0, *pmask1;
    int                    *emask0, *emask1;
    double                  rad, min_rad;
    double                  rai, rai_inv, rai_inv2, sum_ai, sum_ai2, sum_ai3, tsum, tchain;

    __m128d                 ix_SSE0, iy_SSE0, iz_SSE0;
    __m128d                 ix_SSE1, iy_SSE1, iz_SSE1;
    __m128d                 gpi_SSE0, rai_SSE0, prod_ai_SSE0;
    __m128d                 gpi_SSE1, rai_SSE1, prod_ai_SSE1;
    __m128d                 imask_SSE0, jmask_SSE0;
    __m128d                 imask_SSE1, jmask_SSE1;
    __m128d                 jx_SSE, jy_SSE, jz_SSE;
    __m128d                 dx_SSE0, dy_SSE0, dz_SSE0;
    __m128d                 dx_SSE1, dy_SSE1, dz_SSE1;
    __m128d                 rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0;
    __m128d                 rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1;
    __m128d                 dr_SSE0, dr_SSE1;
    __m128d                 raj_SSE, raj_inv_SSE, sk_aj_SSE, sk2_aj_SSE;
    __m128d                 ccf_SSE0, dccf_SSE0, prod_SSE0;
    __m128d                 ccf_SSE1, dccf_SSE1, prod_SSE1;
    __m128d                 icf4_SSE0, icf6_SSE0;
    __m128d                 icf4_SSE1, icf6_SSE1;
    __m128d                 oneeighth_SSE, onefourth_SSE, half_SSE, one_SSE, two_SSE, four_SSE;
    __m128d                 still_p4_SSE, still_p5inv_SSE, still_pip5_SSE;
    __m128d                 rai_inv_SSE0;
    __m128d                 rai_inv_SSE1;
    __m128d                 sk_ai_SSE0, sk2_ai_SSE0, sum_ai_SSE0;
    __m128d                 sk_ai_SSE1, sk2_ai_SSE1, sum_ai_SSE1;
    __m128d                 lij_inv_SSE0, sk2_rinv_SSE0;
    __m128d                 lij_inv_SSE1, sk2_rinv_SSE1;
    __m128d                 t1_SSE0, t2_SSE0, t3_SSE0, t4_SSE0;
    __m128d                 t1_SSE1, t2_SSE1, t3_SSE1, t4_SSE1;
    __m128d                 obc_mask1_SSE0, obc_mask2_SSE0, obc_mask3_SSE0;
    __m128d                 obc_mask1_SSE1, obc_mask2_SSE1, obc_mask3_SSE1;
    __m128d                 uij_SSE0, uij2_SSE0, uij3_SSE0;
    __m128d                 uij_SSE1, uij2_SSE1, uij3_SSE1;
    __m128d                 lij_SSE0, lij2_SSE0, lij3_SSE0;
    __m128d                 lij_SSE1, lij2_SSE1, lij3_SSE1;
    __m128d                 dlij_SSE0, diff2_SSE0, logterm_SSE0;
    __m128d                 dlij_SSE1, diff2_SSE1, logterm_SSE1;
    __m128d                 doffset_SSE, tmpSSE;

    natoms = mdatoms->nr;
    ni0    = 0;
    ni1    = mdatoms->homenr;

    aadata = *((gmx_allvsallgb2_data_t **)paadata);

    if (aadata == NULL)
    {
        genborn_allvsall_setup(&aadata, top, born, mdatoms, born->gb_doffset,
                               egbOBC, TRUE, TRUE, TRUE);
        *((gmx_allvsallgb2_data_t **)paadata) = aadata;
    }

    x_align = aadata->x_align;
    y_align = aadata->y_align;
    z_align = aadata->z_align;

    gb_radius = aadata->gb_radius;
    work      = aadata->work;
    jindex    = aadata->jindex_gb;
    dadx      = fr->dadx;
    obc_param = aadata->workparam;

    oneeighth_SSE = _mm_set1_pd(0.125);
    onefourth_SSE = _mm_set1_pd(0.25);
    half_SSE      = _mm_set1_pd(0.5);
    one_SSE       = _mm_set1_pd(1.0);
    two_SSE       = _mm_set1_pd(2.0);
    four_SSE      = _mm_set1_pd(4.0);
    doffset_SSE   = _mm_set1_pd(born->gb_doffset);

    for (i = 0; i < natoms; i++)
    {
        x_align[i] = x[3*i];
        y_align[i] = x[3*i+1];
        z_align[i] = x[3*i+2];
    }

    for (i = 0; i < natoms/2+1; i++)
    {
        x_align[natoms+i] = x_align[i];
        y_align[natoms+i] = y_align[i];
        z_align[natoms+i] = z_align[i];
    }

    for (i = 0; i < natoms+natoms/2+1; i++)
    {
        work[i] = 0;
    }

    for (i = ni0; i < ni1; i += UNROLLI)
    {
        /* We assume shifts are NOT used for all-vs-all interactions */

        /* Load i atom data */
        ix_SSE0 = _mm_load1_pd(x_align+i);
        iy_SSE0 = _mm_load1_pd(y_align+i);
        iz_SSE0 = _mm_load1_pd(z_align+i);
        ix_SSE1 = _mm_load1_pd(x_align+i+1);
        iy_SSE1 = _mm_load1_pd(y_align+i+1);
        iz_SSE1 = _mm_load1_pd(z_align+i+1);

        rai_SSE0     = _mm_load1_pd(gb_radius+i);
        rai_SSE1     = _mm_load1_pd(gb_radius+i+1);
        rai_inv_SSE0 = gmx_mm_inv_pd(rai_SSE0);
        rai_inv_SSE1 = gmx_mm_inv_pd(rai_SSE1);

        sk_ai_SSE0  = _mm_load1_pd(obc_param+i);
        sk_ai_SSE1  = _mm_load1_pd(obc_param+i+1);
        sk2_ai_SSE0 = _mm_mul_pd(sk_ai_SSE0, sk_ai_SSE0);
        sk2_ai_SSE1 = _mm_mul_pd(sk_ai_SSE1, sk_ai_SSE1);

        sum_ai_SSE0 = _mm_setzero_pd();
        sum_ai_SSE1 = _mm_setzero_pd();

        /* Load limits for loop over neighbors */
        nj0 = jindex[4*i];
        nj1 = jindex[4*i+1];
        nj2 = jindex[4*i+2];
        nj3 = jindex[4*i+3];

        pmask0 = aadata->prologue_mask_gb[i];
        pmask1 = aadata->prologue_mask_gb[i+1];
        emask0 = aadata->epilogue_mask[i];
        emask1 = aadata->epilogue_mask[i+1];

        imask_SSE0 = _mm_load1_pd((double *)(aadata->imask+2*i));
        imask_SSE1 = _mm_load1_pd((double *)(aadata->imask+2*i+2));

        /* Prologue part, including exclusion mask */
        for (j = nj0; j < nj1; j += UNROLLJ)
        {
            jmask_SSE0 = _mm_load_pd((double *)pmask0);
            jmask_SSE1 = _mm_load_pd((double *)pmask1);
            pmask0    += 2*UNROLLJ;
            pmask1    += 2*UNROLLJ;

            /* load j atom coordinates */
            jx_SSE = _mm_load_pd(x_align+j);
            jy_SSE = _mm_load_pd(y_align+j);
            jz_SSE = _mm_load_pd(z_align+j);

            /* Calculate distance */
            dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE);
            dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE);
            dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE);
            dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE);
            dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE);
            dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);

            /* Combine masks */
            jmask_SSE0 = _mm_and_pd(jmask_SSE0, imask_SSE0);
            jmask_SSE1 = _mm_and_pd(jmask_SSE1, imask_SSE1);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
            rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);

            /* Apply mask */
            rinv_SSE0 = _mm_and_pd(rinv_SSE0, jmask_SSE0);
            rinv_SSE1 = _mm_and_pd(rinv_SSE1, jmask_SSE1);

            dr_SSE0 = _mm_mul_pd(rsq_SSE0, rinv_SSE0);
            dr_SSE1 = _mm_mul_pd(rsq_SSE1, rinv_SSE1);

            sk_aj_SSE   = _mm_load_pd(obc_param+j);
            raj_SSE     = _mm_load_pd(gb_radius+j);
            raj_inv_SSE = gmx_mm_inv_pd(raj_SSE);
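            /* HCT/OBC pairwise descreening: below, t1 = r+sk_aj, t2 = r-sk_aj
             * and t3 = sk_aj-r are compared against rai. obc_mask1 selects
             * pairs whose descreening sphere reaches atom i at all, obc_mask2
             * the normal case with i outside the sphere (lij = 1/(r-sk_aj)),
             * and obc_mask3 the engulfed case, which adds the 2*(1/rai - lij)
             * correction term (t4 below).
             */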

            /* Evaluate influence of atom aj -> ai */
            t1_SSE0 = _mm_add_pd(dr_SSE0, sk_aj_SSE);
            t1_SSE1 = _mm_add_pd(dr_SSE1, sk_aj_SSE);
            t2_SSE0 = _mm_sub_pd(dr_SSE0, sk_aj_SSE);
            t2_SSE1 = _mm_sub_pd(dr_SSE1, sk_aj_SSE);
            t3_SSE0 = _mm_sub_pd(sk_aj_SSE, dr_SSE0);
            t3_SSE1 = _mm_sub_pd(sk_aj_SSE, dr_SSE1);

            obc_mask1_SSE0 = _mm_cmplt_pd(rai_SSE0, t1_SSE0);
            obc_mask1_SSE1 = _mm_cmplt_pd(rai_SSE1, t1_SSE1);
            obc_mask2_SSE0 = _mm_cmplt_pd(rai_SSE0, t2_SSE0);
            obc_mask2_SSE1 = _mm_cmplt_pd(rai_SSE1, t2_SSE1);
            obc_mask3_SSE0 = _mm_cmplt_pd(rai_SSE0, t3_SSE0);
            obc_mask3_SSE1 = _mm_cmplt_pd(rai_SSE1, t3_SSE1);
            obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0, jmask_SSE0);
            obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1, jmask_SSE1);

            uij_SSE0  = gmx_mm_inv_pd(t1_SSE0);
            uij_SSE1  = gmx_mm_inv_pd(t1_SSE1);
            lij_SSE0  = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
                                   _mm_andnot_pd(obc_mask2_SSE0, rai_inv_SSE0));
            lij_SSE1  = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
                                   _mm_andnot_pd(obc_mask2_SSE1, rai_inv_SSE1));
            dlij_SSE0 = _mm_and_pd(one_SSE, obc_mask2_SSE0);
            dlij_SSE1 = _mm_and_pd(one_SSE, obc_mask2_SSE1);

            uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
            uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
            uij3_SSE0 = _mm_mul_pd(uij2_SSE0, uij_SSE0);
            uij3_SSE1 = _mm_mul_pd(uij2_SSE1, uij_SSE1);
            lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
            lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
            lij3_SSE0 = _mm_mul_pd(lij2_SSE0, lij_SSE0);
            lij3_SSE1 = _mm_mul_pd(lij2_SSE1, lij_SSE1);

            diff2_SSE0    = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
            diff2_SSE1    = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
            lij_inv_SSE0  = gmx_mm_invsqrt_pd(lij2_SSE0);
            lij_inv_SSE1  = gmx_mm_invsqrt_pd(lij2_SSE1);
            sk2_aj_SSE    = _mm_mul_pd(sk_aj_SSE, sk_aj_SSE);
            sk2_rinv_SSE0 = _mm_mul_pd(sk2_aj_SSE, rinv_SSE0);
            sk2_rinv_SSE1 = _mm_mul_pd(sk2_aj_SSE, rinv_SSE1);
            prod_SSE0     = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
            prod_SSE1     = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);

            logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
            logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));

            t1_SSE0 = _mm_sub_pd(lij_SSE0, uij_SSE0);
            t1_SSE1 = _mm_sub_pd(lij_SSE1, uij_SSE1);
            t2_SSE0 = _mm_mul_pd(diff2_SSE0,
                                 _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
                                            prod_SSE0));
            t2_SSE1 = _mm_mul_pd(diff2_SSE1,
                                 _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
                                            prod_SSE1));

            t3_SSE0 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
            t3_SSE1 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
            t1_SSE0 = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
            t1_SSE1 = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
            t4_SSE0 = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE0, lij_SSE0));
            t4_SSE1 = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE1, lij_SSE1));
            t4_SSE0 = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
            t4_SSE1 = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
            t1_SSE0 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
            t1_SSE1 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));

            sum_ai_SSE0 = _mm_add_pd(sum_ai_SSE0, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
            sum_ai_SSE1 = _mm_add_pd(sum_ai_SSE1, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
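            /* The masked t1 just added to sum_ai is this pair's contribution
             * to atom i's descreening sum; the t1/t2/t3 terms recomputed
             * below assemble d(sum)/dr, which is streamed into dadx for the
             * chain-rule force evaluation.
             */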
1476 t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
1477 _mm_mul_pd(prod_SSE0, lij3_SSE0));
1478 t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
1479 _mm_mul_pd(prod_SSE1, lij3_SSE1));
1480 t1_SSE0 = _mm_sub_pd(t1_SSE0,
1481 _mm_mul_pd(onefourth_SSE,
1482 _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
1483 _mm_mul_pd(lij3_SSE0, dr_SSE0))));
1484 t1_SSE1 = _mm_sub_pd(t1_SSE1,
1485 _mm_mul_pd(onefourth_SSE,
1486 _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
1487 _mm_mul_pd(lij3_SSE1, dr_SSE1))));
1489 t2_SSE0 = _mm_mul_pd(onefourth_SSE,
1490 _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
1491 _mm_mul_pd(uij3_SSE0, dr_SSE0)));
1492 t2_SSE1 = _mm_mul_pd(onefourth_SSE,
1493 _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
1494 _mm_mul_pd(uij3_SSE1, dr_SSE1)));
1495 t2_SSE0 = _mm_sub_pd(t2_SSE0,
1496 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
1497 _mm_mul_pd(prod_SSE0, uij3_SSE0)));
1498 t2_SSE1 = _mm_sub_pd(t2_SSE1,
1499 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
1500 _mm_mul_pd(prod_SSE1, uij3_SSE1)));
1501 t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
1502 _mm_mul_pd(rinv_SSE0, rinv_SSE0));
1503 t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
1504 _mm_mul_pd(rinv_SSE1, rinv_SSE1));
1505 t3_SSE0 = _mm_sub_pd(t3_SSE0,
1506 _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
1508 _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
1509 t3_SSE1 = _mm_sub_pd(t3_SSE1,
1510 _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
1512 _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));
1514 t1_SSE0 = _mm_mul_pd(rinv_SSE0,
1515 _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
1516 _mm_add_pd(t2_SSE0, t3_SSE0)));
1517 t1_SSE1 = _mm_mul_pd(rinv_SSE1,
1518 _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
1519 _mm_add_pd(t2_SSE1, t3_SSE1)));
1521 _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
1523 _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
1526 /* Evaluate influence of atom ai -> aj */
1527 t1_SSE0 = _mm_add_pd(dr_SSE0, sk_ai_SSE0);
1528 t1_SSE1 = _mm_add_pd(dr_SSE1, sk_ai_SSE1);
1529 t2_SSE0 = _mm_sub_pd(dr_SSE0, sk_ai_SSE0);
1530 t2_SSE1 = _mm_sub_pd(dr_SSE1, sk_ai_SSE1);
1531 t3_SSE0 = _mm_sub_pd(sk_ai_SSE0, dr_SSE0);
1532 t3_SSE1 = _mm_sub_pd(sk_ai_SSE1, dr_SSE1);
1534 obc_mask1_SSE0 = _mm_cmplt_pd(raj_SSE, t1_SSE0);
1535 obc_mask1_SSE1 = _mm_cmplt_pd(raj_SSE, t1_SSE1);
1536 obc_mask2_SSE0 = _mm_cmplt_pd(raj_SSE, t2_SSE0);
1537 obc_mask2_SSE1 = _mm_cmplt_pd(raj_SSE, t2_SSE1);
1538 obc_mask3_SSE0 = _mm_cmplt_pd(raj_SSE, t3_SSE0);
1539 obc_mask3_SSE1 = _mm_cmplt_pd(raj_SSE, t3_SSE1);
1540 obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0, jmask_SSE0);
1541 obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1, jmask_SSE1);
1543 uij_SSE0 = gmx_mm_inv_pd(t1_SSE0);
1544 uij_SSE1 = gmx_mm_inv_pd(t1_SSE1);
1545 lij_SSE0 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
1546 _mm_andnot_pd(obc_mask2_SSE0, raj_inv_SSE));
1547 lij_SSE1 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
1548 _mm_andnot_pd(obc_mask2_SSE1, raj_inv_SSE));
1549 dlij_SSE0 = _mm_and_pd(one_SSE, obc_mask2_SSE0);
1550 dlij_SSE1 = _mm_and_pd(one_SSE, obc_mask2_SSE1);
1552 uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
1553 uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
1554 uij3_SSE0 = _mm_mul_pd(uij2_SSE0, uij_SSE0);
1555 uij3_SSE1 = _mm_mul_pd(uij2_SSE1, uij_SSE1);
1556 lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
1557 lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
1558 lij3_SSE0 = _mm_mul_pd(lij2_SSE0, lij_SSE0);
1559 lij3_SSE1 = _mm_mul_pd(lij2_SSE1, lij_SSE1);
1561 diff2_SSE0 = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
1562 diff2_SSE1 = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
1563 lij_inv_SSE0 = gmx_mm_invsqrt_pd(lij2_SSE0);
1564 lij_inv_SSE1 = gmx_mm_invsqrt_pd(lij2_SSE1);
1565 sk2_rinv_SSE0 = _mm_mul_pd(sk2_ai_SSE0, rinv_SSE0);
1566 sk2_rinv_SSE1 = _mm_mul_pd(sk2_ai_SSE1, rinv_SSE1);
1567 prod_SSE0 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
1568 prod_SSE1 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);
1570 logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
1571 logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
1572 t1_SSE0 = _mm_sub_pd(lij_SSE0, uij_SSE0);
1573 t1_SSE1 = _mm_sub_pd(lij_SSE1, uij_SSE1);
1574 t2_SSE0 = _mm_mul_pd(diff2_SSE0,
1575 _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
1577 t2_SSE1 = _mm_mul_pd(diff2_SSE1,
1578 _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
1580 t3_SSE0 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
1581 t3_SSE1 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
1582 t1_SSE0 = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
1583 t1_SSE1 = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
1584 t4_SSE0 = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE0));
1585 t4_SSE1 = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE1));
1586 t4_SSE0 = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
1587 t4_SSE1 = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
1588 t1_SSE0 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
1589 t1_SSE1 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));
1591 _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
1592 _mm_add_pd(_mm_and_pd(t1_SSE0, obc_mask1_SSE0),
1593 _mm_and_pd(t1_SSE1, obc_mask1_SSE1))));
1595 t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
1596 _mm_mul_pd(prod_SSE0, lij3_SSE0));
1597 t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
1598 _mm_mul_pd(prod_SSE1, lij3_SSE1));
1599 t1_SSE0 = _mm_sub_pd(t1_SSE0,
1600 _mm_mul_pd(onefourth_SSE,
1601 _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
1602 _mm_mul_pd(lij3_SSE0, dr_SSE0))));
1603 t1_SSE1 = _mm_sub_pd(t1_SSE1,
1604 _mm_mul_pd(onefourth_SSE,
1605 _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
1606 _mm_mul_pd(lij3_SSE1, dr_SSE1))));
1607 t2_SSE0 = _mm_mul_pd(onefourth_SSE,
1608 _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
1609 _mm_mul_pd(uij3_SSE0, dr_SSE0)));
1610 t2_SSE1 = _mm_mul_pd(onefourth_SSE,
1611 _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
1612 _mm_mul_pd(uij3_SSE1, dr_SSE1)));
1613 t2_SSE0 = _mm_sub_pd(t2_SSE0,
1614 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
1615 _mm_mul_pd(prod_SSE0, uij3_SSE0)));
1616 t2_SSE1 = _mm_sub_pd(t2_SSE1,
1617 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
1618 _mm_mul_pd(prod_SSE1, uij3_SSE1)));
1620 t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
1621 _mm_mul_pd(rinv_SSE0, rinv_SSE0));
1622 t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
1623 _mm_mul_pd(rinv_SSE1, rinv_SSE1));
1625 t3_SSE0 = _mm_sub_pd(t3_SSE0,
1626 _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
1628 _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
1629 t3_SSE1 = _mm_sub_pd(t3_SSE1,
1630 _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
1632 _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));
1635 t1_SSE0 = _mm_mul_pd(rinv_SSE0,
1636 _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
1637 _mm_add_pd(t2_SSE0, t3_SSE0)));
1638 t1_SSE1 = _mm_mul_pd(rinv_SSE1,
1639 _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
1640 _mm_add_pd(t2_SSE1, t3_SSE1)));
1642 _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
1644 _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
1648 /* Main part, no exclusions */
1649 for (j = nj1; j < nj2; j += UNROLLJ)
1651 /* load j atom coordinates */
1652 jx_SSE = _mm_load_pd(x_align+j);
1653 jy_SSE = _mm_load_pd(y_align+j);
1654 jz_SSE = _mm_load_pd(z_align+j);
1656 /* Calculate distance */
1657 dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE);
1658 dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE);
1659 dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE);
1660 dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE);
1661 dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE);
1662 dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE);
1664 /* rsq = dx*dx+dy*dy+dz*dz */
1665 rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
1666 rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);
1668 /* Calculate 1/r and 1/r2 */
1669 rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
1670 rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);
1673 rinv_SSE0 = _mm_and_pd(rinv_SSE0, imask_SSE0);
1674 rinv_SSE1 = _mm_and_pd(rinv_SSE1, imask_SSE1);
1676 dr_SSE0 = _mm_mul_pd(rsq_SSE0, rinv_SSE0);
1677 dr_SSE1 = _mm_mul_pd(rsq_SSE1, rinv_SSE1);
1679 sk_aj_SSE = _mm_load_pd(obc_param+j);
1680 raj_SSE = _mm_load_pd(gb_radius+j);
1682 raj_inv_SSE = gmx_mm_inv_pd(raj_SSE);
1684 /* Evaluate influence of atom aj -> ai */
1685 t1_SSE0 = _mm_add_pd(dr_SSE0, sk_aj_SSE);
1686 t1_SSE1 = _mm_add_pd(dr_SSE1, sk_aj_SSE);
1687 t2_SSE0 = _mm_sub_pd(dr_SSE0, sk_aj_SSE);
1688 t2_SSE1 = _mm_sub_pd(dr_SSE1, sk_aj_SSE);
1689 t3_SSE0 = _mm_sub_pd(sk_aj_SSE, dr_SSE0);
1690 t3_SSE1 = _mm_sub_pd(sk_aj_SSE, dr_SSE1);
1692 obc_mask1_SSE0 = _mm_cmplt_pd(rai_SSE0, t1_SSE0);
1693 obc_mask1_SSE1 = _mm_cmplt_pd(rai_SSE1, t1_SSE1);
1694 obc_mask2_SSE0 = _mm_cmplt_pd(rai_SSE0, t2_SSE0);
1695 obc_mask2_SSE1 = _mm_cmplt_pd(rai_SSE1, t2_SSE1);
1696 obc_mask3_SSE0 = _mm_cmplt_pd(rai_SSE0, t3_SSE0);
1697 obc_mask3_SSE1 = _mm_cmplt_pd(rai_SSE1, t3_SSE1);
1698 obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0, imask_SSE0);
1699 obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1, imask_SSE1);
1701 uij_SSE0 = gmx_mm_inv_pd(t1_SSE0);
1702 uij_SSE1 = gmx_mm_inv_pd(t1_SSE1);
1703 lij_SSE0 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
1704 _mm_andnot_pd(obc_mask2_SSE0, rai_inv_SSE0));
1705 lij_SSE1 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
1706 _mm_andnot_pd(obc_mask2_SSE1, rai_inv_SSE1));
1707 dlij_SSE0 = _mm_and_pd(one_SSE, obc_mask2_SSE0);
1708 dlij_SSE1 = _mm_and_pd(one_SSE, obc_mask2_SSE1);
1710 uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
1711 uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
1712 uij3_SSE0 = _mm_mul_pd(uij2_SSE0, uij_SSE0);
1713 uij3_SSE1 = _mm_mul_pd(uij2_SSE1, uij_SSE1);
1714 lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
1715 lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
1716 lij3_SSE0 = _mm_mul_pd(lij2_SSE0, lij_SSE0);
1717 lij3_SSE1 = _mm_mul_pd(lij2_SSE1, lij_SSE1);
1719 diff2_SSE0 = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
1720 diff2_SSE1 = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
1721 lij_inv_SSE0 = gmx_mm_invsqrt_pd(lij2_SSE0);
1722 lij_inv_SSE1 = gmx_mm_invsqrt_pd(lij2_SSE1);
1723 sk2_aj_SSE = _mm_mul_pd(sk_aj_SSE, sk_aj_SSE);
1724 sk2_rinv_SSE0 = _mm_mul_pd(sk2_aj_SSE, rinv_SSE0);
1725 sk2_rinv_SSE1 = _mm_mul_pd(sk2_aj_SSE, rinv_SSE1);
1726 prod_SSE0 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
1727 prod_SSE1 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);
1729 logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
1730 logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
1732 t1_SSE0 = _mm_sub_pd(lij_SSE0, uij_SSE0);
1733 t1_SSE1 = _mm_sub_pd(lij_SSE1, uij_SSE1);
1734 t2_SSE0 = _mm_mul_pd(diff2_SSE0,
1735 _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
1737 t2_SSE1 = _mm_mul_pd(diff2_SSE1,
1738 _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
1741 t3_SSE0 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
1742 t3_SSE1 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
1743 t1_SSE0 = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
1744 t1_SSE1 = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
1745 t4_SSE0 = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE0, lij_SSE0));
1746 t4_SSE1 = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE1, lij_SSE1));
1747 t4_SSE0 = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
1748 t4_SSE1 = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
1749 t1_SSE0 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
1750 t1_SSE1 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));
1752 sum_ai_SSE0 = _mm_add_pd(sum_ai_SSE0, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
1753 sum_ai_SSE1 = _mm_add_pd(sum_ai_SSE1, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
1755 t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
1756 _mm_mul_pd(prod_SSE0, lij3_SSE0));
1757 t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
1758 _mm_mul_pd(prod_SSE1, lij3_SSE1));
1760 t1_SSE0 = _mm_sub_pd(t1_SSE0,
1761 _mm_mul_pd(onefourth_SSE,
1762 _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
1763 _mm_mul_pd(lij3_SSE0, dr_SSE0))));
1764 t1_SSE1 = _mm_sub_pd(t1_SSE1,
1765 _mm_mul_pd(onefourth_SSE,
1766 _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
1767 _mm_mul_pd(lij3_SSE1, dr_SSE1))));
1769 t2_SSE0 = _mm_mul_pd(onefourth_SSE,
1770 _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
1771 _mm_mul_pd(uij3_SSE0, dr_SSE0)));
1772 t2_SSE1 = _mm_mul_pd(onefourth_SSE,
1773 _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
1774 _mm_mul_pd(uij3_SSE1, dr_SSE1)));
1775 t2_SSE0 = _mm_sub_pd(t2_SSE0,
1776 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
1777 _mm_mul_pd(prod_SSE0, uij3_SSE0)));
1778 t2_SSE1 = _mm_sub_pd(t2_SSE1,
1779 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
1780 _mm_mul_pd(prod_SSE1, uij3_SSE1)));
1781 t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
1782 _mm_mul_pd(rinv_SSE0, rinv_SSE0));
1783 t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
1784 _mm_mul_pd(rinv_SSE1, rinv_SSE1));
1785 t3_SSE0 = _mm_sub_pd(t3_SSE0,
1786 _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
1788 _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
1789 t3_SSE1 = _mm_sub_pd(t3_SSE1,
1790 _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
1792 _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));
1794 t1_SSE0 = _mm_mul_pd(rinv_SSE0,
1795 _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
1796 _mm_add_pd(t2_SSE0, t3_SSE0)));
1797 t1_SSE1 = _mm_mul_pd(rinv_SSE1,
1798 _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
1799 _mm_add_pd(t2_SSE1, t3_SSE1)));
1801 _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
1803 _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
1806 /* Evaluate influence of atom ai -> aj */
1807 t1_SSE0 = _mm_add_pd(dr_SSE0, sk_ai_SSE0);
1808 t1_SSE1 = _mm_add_pd(dr_SSE1, sk_ai_SSE1);
1809 t2_SSE0 = _mm_sub_pd(dr_SSE0, sk_ai_SSE0);
1810 t2_SSE1 = _mm_sub_pd(dr_SSE1, sk_ai_SSE1);
1811 t3_SSE0 = _mm_sub_pd(sk_ai_SSE0, dr_SSE0);
1812 t3_SSE1 = _mm_sub_pd(sk_ai_SSE1, dr_SSE1);
1814 obc_mask1_SSE0 = _mm_cmplt_pd(raj_SSE, t1_SSE0);
1815 obc_mask1_SSE1 = _mm_cmplt_pd(raj_SSE, t1_SSE1);
1816 obc_mask2_SSE0 = _mm_cmplt_pd(raj_SSE, t2_SSE0);
1817 obc_mask2_SSE1 = _mm_cmplt_pd(raj_SSE, t2_SSE1);
1818 obc_mask3_SSE0 = _mm_cmplt_pd(raj_SSE, t3_SSE0);
1819 obc_mask3_SSE1 = _mm_cmplt_pd(raj_SSE, t3_SSE1);
1820 obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0, imask_SSE0);
1821 obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1, imask_SSE1);
1823 uij_SSE0 = gmx_mm_inv_pd(t1_SSE0);
1824 uij_SSE1 = gmx_mm_inv_pd(t1_SSE1);
1825 lij_SSE0 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
1826 _mm_andnot_pd(obc_mask2_SSE0, raj_inv_SSE));
1827 lij_SSE1 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
1828 _mm_andnot_pd(obc_mask2_SSE1, raj_inv_SSE));
1829 dlij_SSE0 = _mm_and_pd(one_SSE, obc_mask2_SSE0);
1830 dlij_SSE1 = _mm_and_pd(one_SSE, obc_mask2_SSE1);
1832 uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
1833 uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
1834 uij3_SSE0 = _mm_mul_pd(uij2_SSE0, uij_SSE0);
1835 uij3_SSE1 = _mm_mul_pd(uij2_SSE1, uij_SSE1);
1836 lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
1837 lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
1838 lij3_SSE0 = _mm_mul_pd(lij2_SSE0, lij_SSE0);
1839 lij3_SSE1 = _mm_mul_pd(lij2_SSE1, lij_SSE1);
1841 diff2_SSE0 = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
1842 diff2_SSE1 = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
1843 lij_inv_SSE0 = gmx_mm_invsqrt_pd(lij2_SSE0);
1844 lij_inv_SSE1 = gmx_mm_invsqrt_pd(lij2_SSE1);
1845 sk2_rinv_SSE0 = _mm_mul_pd(sk2_ai_SSE0, rinv_SSE0);
1846 sk2_rinv_SSE1 = _mm_mul_pd(sk2_ai_SSE1, rinv_SSE1);
1847 prod_SSE0 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
1848 prod_SSE1 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);
1850 logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
1851 logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
1852 t1_SSE0 = _mm_sub_pd(lij_SSE0, uij_SSE0);
1853 t1_SSE1 = _mm_sub_pd(lij_SSE1, uij_SSE1);
1854 t2_SSE0 = _mm_mul_pd(diff2_SSE0,
1855 _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
1857 t2_SSE1 = _mm_mul_pd(diff2_SSE1,
1858 _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
1860 t3_SSE0 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
1861 t3_SSE1 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
1862 t1_SSE0 = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
1863 t1_SSE1 = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
1864 t4_SSE0 = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE0));
1865 t4_SSE1 = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE1));
1866 t4_SSE0 = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
1867 t4_SSE1 = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
1868 t1_SSE0 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
1869 t1_SSE1 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));
1871 _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
1872 _mm_add_pd(_mm_and_pd(t1_SSE0, obc_mask1_SSE0),
1873 _mm_and_pd(t1_SSE1, obc_mask1_SSE1))));
1875 t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
1876 _mm_mul_pd(prod_SSE0, lij3_SSE0));
1877 t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
1878 _mm_mul_pd(prod_SSE1, lij3_SSE1));
1879 t1_SSE0 = _mm_sub_pd(t1_SSE0,
1880 _mm_mul_pd(onefourth_SSE,
1881 _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
1882 _mm_mul_pd(lij3_SSE0, dr_SSE0))));
1883 t1_SSE1 = _mm_sub_pd(t1_SSE1,
1884 _mm_mul_pd(onefourth_SSE,
1885 _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
1886 _mm_mul_pd(lij3_SSE1, dr_SSE1))));
1887 t2_SSE0 = _mm_mul_pd(onefourth_SSE,
1888 _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
1889 _mm_mul_pd(uij3_SSE0, dr_SSE0)));
1890 t2_SSE1 = _mm_mul_pd(onefourth_SSE,
1891 _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
1892 _mm_mul_pd(uij3_SSE1, dr_SSE1)));
1893 t2_SSE0 = _mm_sub_pd(t2_SSE0,
1894 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
1895 _mm_mul_pd(prod_SSE0, uij3_SSE0)));
1896 t2_SSE1 = _mm_sub_pd(t2_SSE1,
1897 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
1898 _mm_mul_pd(prod_SSE1, uij3_SSE1)));
1900 t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
1901 _mm_mul_pd(rinv_SSE0, rinv_SSE0));
1902 t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
1903 _mm_mul_pd(rinv_SSE1, rinv_SSE1));
1905 t3_SSE0 = _mm_sub_pd(t3_SSE0,
1906 _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
1908 _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
1909 t3_SSE1 = _mm_sub_pd(t3_SSE1,
1910 _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
1912 _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));
1914 t1_SSE0 = _mm_mul_pd(rinv_SSE0,
1915 _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
1916 _mm_add_pd(t2_SSE0, t3_SSE0)));
1917 t1_SSE1 = _mm_mul_pd(rinv_SSE1,
1918 _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
1919 _mm_add_pd(t2_SSE1, t3_SSE1)));
1921 _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
1923 _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
1927 /* Epilogue part, including exclusion mask */
1928 for (j = nj2; j < nj3; j += UNROLLJ)
1930 jmask_SSE0 = _mm_load_pd((double *)emask0);
1931 jmask_SSE1 = _mm_load_pd((double *)emask1);
1932 emask0 += 2*UNROLLJ;
1933 emask1 += 2*UNROLLJ;
1935 /* load j atom coordinates */
1936 jx_SSE = _mm_load_pd(x_align+j);
1937 jy_SSE = _mm_load_pd(y_align+j);
1938 jz_SSE = _mm_load_pd(z_align+j);
1940 /* Calculate distance */
1941 dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE);
1942 dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE);
1943 dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE);
1944 dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE);
1945 dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE);
1946 dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE);
1948 /* rsq = dx*dx+dy*dy+dz*dz */
1949 rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
1950 rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);
1953 jmask_SSE0 = _mm_and_pd(jmask_SSE0, imask_SSE0);
1954 jmask_SSE1 = _mm_and_pd(jmask_SSE1, imask_SSE1);
1956 /* Calculate 1/r and 1/r2 */
1957 rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
1958 rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);
1961 rinv_SSE0 = _mm_and_pd(rinv_SSE0, jmask_SSE0);
1962 rinv_SSE1 = _mm_and_pd(rinv_SSE1, jmask_SSE1);
            dr_SSE0            = _mm_mul_pd(rsq_SSE0, rinv_SSE0);
            dr_SSE1            = _mm_mul_pd(rsq_SSE1, rinv_SSE1);

            sk_aj_SSE          = _mm_load_pd(obc_param+j);
            raj_SSE            = _mm_load_pd(gb_radius+j);

            raj_inv_SSE        = gmx_mm_inv_pd(raj_SSE);

            /* Evaluate influence of atom aj -> ai */
            t1_SSE0            = _mm_add_pd(dr_SSE0, sk_aj_SSE);
            t1_SSE1            = _mm_add_pd(dr_SSE1, sk_aj_SSE);
            t2_SSE0            = _mm_sub_pd(dr_SSE0, sk_aj_SSE);
            t2_SSE1            = _mm_sub_pd(dr_SSE1, sk_aj_SSE);
            t3_SSE0            = _mm_sub_pd(sk_aj_SSE, dr_SSE0);
            t3_SSE1            = _mm_sub_pd(sk_aj_SSE, dr_SSE1);

            obc_mask1_SSE0     = _mm_cmplt_pd(rai_SSE0, t1_SSE0);
            obc_mask1_SSE1     = _mm_cmplt_pd(rai_SSE1, t1_SSE1);
            obc_mask2_SSE0     = _mm_cmplt_pd(rai_SSE0, t2_SSE0);
            obc_mask2_SSE1     = _mm_cmplt_pd(rai_SSE1, t2_SSE1);
            obc_mask3_SSE0     = _mm_cmplt_pd(rai_SSE0, t3_SSE0);
            obc_mask3_SSE1     = _mm_cmplt_pd(rai_SSE1, t3_SSE1);
            obc_mask1_SSE0     = _mm_and_pd(obc_mask1_SSE0, jmask_SSE0);
            obc_mask1_SSE1     = _mm_and_pd(obc_mask1_SSE1, jmask_SSE1);

            uij_SSE0           = gmx_mm_inv_pd(t1_SSE0);
            uij_SSE1           = gmx_mm_inv_pd(t1_SSE1);
            lij_SSE0           = _mm_or_pd(_mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
                                           _mm_andnot_pd(obc_mask2_SSE0, rai_inv_SSE0));
            lij_SSE1           = _mm_or_pd(_mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
                                           _mm_andnot_pd(obc_mask2_SSE1, rai_inv_SSE1));

            dlij_SSE0          = _mm_and_pd(one_SSE, obc_mask2_SSE0);
            dlij_SSE1          = _mm_and_pd(one_SSE, obc_mask2_SSE1);
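
            /* SSE2 lacks a blend instruction, so lij is selected with the
             * and/andnot/or idiom: 1/(dr-sk_aj) where rai < dr-sk_aj, and
             * 1/rai otherwise. dlij is the matching 0/1 switch that later
             * enables or disables the lij-dependent derivative terms.
             */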

            uij2_SSE0          = _mm_mul_pd(uij_SSE0, uij_SSE0);
            uij2_SSE1          = _mm_mul_pd(uij_SSE1, uij_SSE1);
            uij3_SSE0          = _mm_mul_pd(uij2_SSE0, uij_SSE0);
            uij3_SSE1          = _mm_mul_pd(uij2_SSE1, uij_SSE1);
            lij2_SSE0          = _mm_mul_pd(lij_SSE0, lij_SSE0);
            lij2_SSE1          = _mm_mul_pd(lij_SSE1, lij_SSE1);
            lij3_SSE0          = _mm_mul_pd(lij2_SSE0, lij_SSE0);
            lij3_SSE1          = _mm_mul_pd(lij2_SSE1, lij_SSE1);

            diff2_SSE0         = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
            diff2_SSE1         = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
            lij_inv_SSE0       = gmx_mm_invsqrt_pd(lij2_SSE0);
            lij_inv_SSE1       = gmx_mm_invsqrt_pd(lij2_SSE1);
            sk2_aj_SSE         = _mm_mul_pd(sk_aj_SSE, sk_aj_SSE);
            sk2_rinv_SSE0      = _mm_mul_pd(sk2_aj_SSE, rinv_SSE0);
            sk2_rinv_SSE1      = _mm_mul_pd(sk2_aj_SSE, rinv_SSE1);
            prod_SSE0          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
            prod_SSE1          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);

            logterm_SSE0       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
            logterm_SSE1       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));

            t1_SSE0            = _mm_sub_pd(lij_SSE0, uij_SSE0);
            t1_SSE1            = _mm_sub_pd(lij_SSE1, uij_SSE1);
            t2_SSE0            = _mm_mul_pd(diff2_SSE0,
                                            _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
                                                       prod_SSE0));
            t2_SSE1            = _mm_mul_pd(diff2_SSE1,
                                            _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
                                                       prod_SSE1));
2031 t3_SSE1 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
2032 t1_SSE0 = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
2033 t1_SSE1 = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
2034 t4_SSE0 = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE0, lij_SSE0));
2035 t4_SSE1 = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE1, lij_SSE1));
2036 t4_SSE0 = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
2037 t4_SSE1 = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
2038 t1_SSE0 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
2039 t1_SSE1 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));
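
            /* t4 above is the correction for the case where atom i sits
             * inside the descreening sphere of j (obc_mask3, rai < sk_aj-dr),
             * where the lower integration limit becomes 1/rai instead of lij.
             */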
            sum_ai_SSE0        = _mm_add_pd(sum_ai_SSE0, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
            sum_ai_SSE1        = _mm_add_pd(sum_ai_SSE1, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));

            t1_SSE0            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
                                            _mm_mul_pd(prod_SSE0, lij3_SSE0));
            t1_SSE1            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
                                            _mm_mul_pd(prod_SSE1, lij3_SSE1));
            t1_SSE0            = _mm_sub_pd(t1_SSE0,
                                            _mm_mul_pd(onefourth_SSE,
                                                       _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
                                                                  _mm_mul_pd(lij3_SSE0, dr_SSE0))));
            t1_SSE1            = _mm_sub_pd(t1_SSE1,
                                            _mm_mul_pd(onefourth_SSE,
                                                       _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
                                                                  _mm_mul_pd(lij3_SSE1, dr_SSE1))));
            t2_SSE0            = _mm_mul_pd(onefourth_SSE,
                                            _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
                                                       _mm_mul_pd(uij3_SSE0, dr_SSE0)));
            t2_SSE1            = _mm_mul_pd(onefourth_SSE,
                                            _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
                                                       _mm_mul_pd(uij3_SSE1, dr_SSE1)));
            t2_SSE0            = _mm_sub_pd(t2_SSE0,
                                            _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
                                                       _mm_mul_pd(prod_SSE0, uij3_SSE0)));
            t2_SSE1            = _mm_sub_pd(t2_SSE1,
                                            _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
                                                       _mm_mul_pd(prod_SSE1, uij3_SSE1)));
            t3_SSE0            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
                                            _mm_mul_pd(rinv_SSE0, rinv_SSE0));
            t3_SSE1            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
                                            _mm_mul_pd(rinv_SSE1, rinv_SSE1));
            t3_SSE0            = _mm_sub_pd(t3_SSE0,
                                            _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
                                                       _mm_add_pd(one_SSE,
                                                                  _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
            t3_SSE1            = _mm_sub_pd(t3_SSE1,
                                            _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
                                                       _mm_add_pd(one_SSE,
                                                                  _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));

            t1_SSE0            = _mm_mul_pd(rinv_SSE0,
                                            _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
                                                       _mm_add_pd(t2_SSE0, t3_SSE0)));
            t1_SSE1            = _mm_mul_pd(rinv_SSE1,
                                            _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
                                                       _mm_add_pd(t2_SSE1, t3_SSE1)));

            _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
            dadx += 2;

            /* Evaluate influence of atom ai -> aj */
            t1_SSE0            = _mm_add_pd(dr_SSE0, sk_ai_SSE0);
            t1_SSE1            = _mm_add_pd(dr_SSE1, sk_ai_SSE1);
            t2_SSE0            = _mm_sub_pd(dr_SSE0, sk_ai_SSE0);
            t2_SSE1            = _mm_sub_pd(dr_SSE1, sk_ai_SSE1);
            t3_SSE0            = _mm_sub_pd(sk_ai_SSE0, dr_SSE0);
            t3_SSE1            = _mm_sub_pd(sk_ai_SSE1, dr_SSE1);

            obc_mask1_SSE0     = _mm_cmplt_pd(raj_SSE, t1_SSE0);
            obc_mask1_SSE1     = _mm_cmplt_pd(raj_SSE, t1_SSE1);
            obc_mask2_SSE0     = _mm_cmplt_pd(raj_SSE, t2_SSE0);
            obc_mask2_SSE1     = _mm_cmplt_pd(raj_SSE, t2_SSE1);
            obc_mask3_SSE0     = _mm_cmplt_pd(raj_SSE, t3_SSE0);
            obc_mask3_SSE1     = _mm_cmplt_pd(raj_SSE, t3_SSE1);
            obc_mask1_SSE0     = _mm_and_pd(obc_mask1_SSE0, jmask_SSE0);
            obc_mask1_SSE1     = _mm_and_pd(obc_mask1_SSE1, jmask_SSE1);

            uij_SSE0           = gmx_mm_inv_pd(t1_SSE0);
            uij_SSE1           = gmx_mm_inv_pd(t1_SSE1);
            lij_SSE0           = _mm_or_pd(_mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
                                           _mm_andnot_pd(obc_mask2_SSE0, raj_inv_SSE));
            lij_SSE1           = _mm_or_pd(_mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
                                           _mm_andnot_pd(obc_mask2_SSE1, raj_inv_SSE));

            dlij_SSE0          = _mm_and_pd(one_SSE, obc_mask2_SSE0);
            dlij_SSE1          = _mm_and_pd(one_SSE, obc_mask2_SSE1);

            uij2_SSE0          = _mm_mul_pd(uij_SSE0, uij_SSE0);
            uij2_SSE1          = _mm_mul_pd(uij_SSE1, uij_SSE1);
            uij3_SSE0          = _mm_mul_pd(uij2_SSE0, uij_SSE0);
            uij3_SSE1          = _mm_mul_pd(uij2_SSE1, uij_SSE1);
            lij2_SSE0          = _mm_mul_pd(lij_SSE0, lij_SSE0);
            lij2_SSE1          = _mm_mul_pd(lij_SSE1, lij_SSE1);
            lij3_SSE0          = _mm_mul_pd(lij2_SSE0, lij_SSE0);
            lij3_SSE1          = _mm_mul_pd(lij2_SSE1, lij_SSE1);

            diff2_SSE0         = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
            diff2_SSE1         = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
            lij_inv_SSE0       = gmx_mm_invsqrt_pd(lij2_SSE0);
            lij_inv_SSE1       = gmx_mm_invsqrt_pd(lij2_SSE1);
            sk2_rinv_SSE0      = _mm_mul_pd(sk2_ai_SSE0, rinv_SSE0);
            sk2_rinv_SSE1      = _mm_mul_pd(sk2_ai_SSE1, rinv_SSE1);
            prod_SSE0          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
            prod_SSE1          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);

            logterm_SSE0       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
            logterm_SSE1       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
            t1_SSE0            = _mm_sub_pd(lij_SSE0, uij_SSE0);
            t1_SSE1            = _mm_sub_pd(lij_SSE1, uij_SSE1);
            t2_SSE0            = _mm_mul_pd(diff2_SSE0,
                                            _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
                                                       prod_SSE0));
            t2_SSE1            = _mm_mul_pd(diff2_SSE1,
                                            _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
                                                       prod_SSE1));
            t3_SSE0            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
            t3_SSE1            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
            t1_SSE0            = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
            t1_SSE1            = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
            t4_SSE0            = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE0));
            t4_SSE1            = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE1));
            t4_SSE0            = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
            t4_SSE1            = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
            t1_SSE0            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
            t1_SSE1            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));

            _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
                                            _mm_add_pd(_mm_and_pd(t1_SSE0, obc_mask1_SSE0),
                                                       _mm_and_pd(t1_SSE1, obc_mask1_SSE1))));
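
            /* The symmetric ai->aj contribution is scattered straight to
             * work[j], since the j atoms change every iteration; atom i's
             * own sum stays in the sum_ai_SSE0/1 registers until the j loop
             * is done.
             */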

            t1_SSE0            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
                                            _mm_mul_pd(prod_SSE0, lij3_SSE0));
            t1_SSE1            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
                                            _mm_mul_pd(prod_SSE1, lij3_SSE1));

            t1_SSE0            = _mm_sub_pd(t1_SSE0,
                                            _mm_mul_pd(onefourth_SSE,
                                                       _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
                                                                  _mm_mul_pd(lij3_SSE0, dr_SSE0))));
            t1_SSE1            = _mm_sub_pd(t1_SSE1,
                                            _mm_mul_pd(onefourth_SSE,
                                                       _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
                                                                  _mm_mul_pd(lij3_SSE1, dr_SSE1))));
            t2_SSE0            = _mm_mul_pd(onefourth_SSE,
                                            _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
                                                       _mm_mul_pd(uij3_SSE0, dr_SSE0)));
            t2_SSE1            = _mm_mul_pd(onefourth_SSE,
                                            _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
                                                       _mm_mul_pd(uij3_SSE1, dr_SSE1)));
            t2_SSE0            = _mm_sub_pd(t2_SSE0,
                                            _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
                                                       _mm_mul_pd(prod_SSE0, uij3_SSE0)));
            t2_SSE1            = _mm_sub_pd(t2_SSE1,
                                            _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
                                                       _mm_mul_pd(prod_SSE1, uij3_SSE1)));

            t3_SSE0            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
                                            _mm_mul_pd(rinv_SSE0, rinv_SSE0));
            t3_SSE1            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
                                            _mm_mul_pd(rinv_SSE1, rinv_SSE1));

            t3_SSE0            = _mm_sub_pd(t3_SSE0,
                                            _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
                                                       _mm_add_pd(one_SSE,
                                                                  _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
            t3_SSE1            = _mm_sub_pd(t3_SSE1,
                                            _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
                                                       _mm_add_pd(one_SSE,
                                                                  _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));

            t1_SSE0            = _mm_mul_pd(rinv_SSE0,
                                            _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
                                                       _mm_add_pd(t2_SSE0, t3_SSE0)));
            t1_SSE1            = _mm_mul_pd(rinv_SSE1,
                                            _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
                                                       _mm_add_pd(t2_SSE1, t3_SSE1)));

            _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
            dadx += 2;
        }

        GMX_MM_TRANSPOSE2_PD(sum_ai_SSE0, sum_ai_SSE1);
        sum_ai_SSE0 = _mm_add_pd(sum_ai_SSE0, sum_ai_SSE1);
        _mm_store_pd(work+i, _mm_add_pd(sum_ai_SSE0, _mm_load_pd(work+i)));
    }

    for (i = 0; i < natoms/2+1; i++)
    {
        work[i] += work[natoms+i];
    }
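
    /* The kernels address atoms modulo natoms, so a given atom's
     * contributions may land in either half of the 2*natoms work array;
     * the fold above combines the two halves before the radii are formed.
     */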

    /* Parallel summations would go here if ever implemented in DD */

    if (gb_algorithm == egbHCT)
    {
        /* HCT */
        for (i = 0; i < natoms; i++)
        {
            if (born->use[i] != 0)
            {
                rai     = top->atomtypes.gb_radius[mdatoms->typeA[i]]-born->gb_doffset;
                sum_ai  = 1.0/rai - work[i];
                min_rad = rai + born->gb_doffset;
                rad     = 1.0/sum_ai;

                born->bRad[i]   = rad > min_rad ? rad : min_rad;
                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
            }
        }
    }
    else
    {
        /* OBC */
        /* Calculate the radii */
        for (i = 0; i < natoms; i++)
        {
            if (born->use[i] != 0)
            {
                rai      = top->atomtypes.gb_radius[mdatoms->typeA[i]];
                rai_inv2 = 1.0/rai;
                rai      = rai-born->gb_doffset;
                rai_inv  = 1.0/rai;
                sum_ai   = rai * work[i];
                sum_ai2  = sum_ai  * sum_ai;
                sum_ai3  = sum_ai2 * sum_ai;

                tsum          = tanh(born->obc_alpha*sum_ai-born->obc_beta*sum_ai2+born->obc_gamma*sum_ai3);
                born->bRad[i] = rai_inv - tsum*rai_inv2;
                born->bRad[i] = 1.0 / born->bRad[i];

                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);

                tchain         = rai * (born->obc_alpha-2*born->obc_beta*sum_ai+3*born->obc_gamma*sum_ai2);
                born->drobc[i] = (1.0-tsum*tsum)*tchain*rai_inv2;
            }
        }
    }

    return 0;
}
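
/* The chain-rule kernel below consumes the dadx stream written by the radius
 * kernel above, so it must read the entries back in exactly the order they
 * were stored: for each i pair and each j chunk, first the two lanes that
 * descreen the i atoms, then the two lanes that descreen the j atoms.
 */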
int
genborn_allvsall_calc_chainrule_sse2_double(t_forcerec *            fr,
                                            t_mdatoms *             mdatoms,
                                            gmx_genborn_t *         born,
                                            double *                x,
                                            double *                f,
                                            int                     gb_algorithm,
                                            void *                  paadata)
{
    gmx_allvsallgb2_data_t *aadata;
    int                     natoms;
    int                     ni0, ni1;
    int                     nj0, nj1, nj2, nj3;
    int                     i, j, idx;
    int                    *jindex;
    double                  fix, fiy, fiz;
    double                  rbai, rbaj, fgb, fgb_ai, rbi;
    double                 *rb;
    double                 *dadx;
    double                 *x_align;
    double                 *y_align;
    double                 *z_align;
    double                 *fx_align;
    double                 *fy_align;
    double                 *fz_align;
    __m128d                 jmask_SSE0, jmask_SSE1;
    __m128d                 ix_SSE0, iy_SSE0, iz_SSE0;
    __m128d                 ix_SSE1, iy_SSE1, iz_SSE1;
    __m128d                 fix_SSE0, fiy_SSE0, fiz_SSE0;
    __m128d                 fix_SSE1, fiy_SSE1, fiz_SSE1;
    __m128d                 rbai_SSE0, rbai_SSE1;
    __m128d                 imask_SSE0, imask_SSE1;
    __m128d                 jx_SSE, jy_SSE, jz_SSE, rbaj_SSE;
    __m128d                 dx_SSE0, dy_SSE0, dz_SSE0;
    __m128d                 dx_SSE1, dy_SSE1, dz_SSE1;
    __m128d                 fgb_SSE0, fgb_ai_SSE0;
    __m128d                 fgb_SSE1, fgb_ai_SSE1;
    __m128d                 tx_SSE0, ty_SSE0, tz_SSE0;
    __m128d                 tx_SSE1, ty_SSE1, tz_SSE1;
    __m128d                 t1, t2, tmpSSE;

    natoms   = mdatoms->nr;
    ni0      = 0;
    ni1      = mdatoms->homenr;

    aadata   = (gmx_allvsallgb2_data_t *)paadata;

    x_align  = aadata->x_align;
    y_align  = aadata->y_align;
    z_align  = aadata->z_align;
    fx_align = aadata->fx_align;
    fy_align = aadata->fy_align;
    fz_align = aadata->fz_align;

    jindex   = aadata->jindex_gb;
    dadx     = fr->dadx;
    rb       = aadata->work;

    /* Loop to get the proper form for the Born radius term */
    if (gb_algorithm == egbSTILL)
    {
        for (i = 0; i < natoms; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = (2 * rbi * rbi * fr->dvda[i])/ONE_4PI_EPS0;
        }
    }
    else if (gb_algorithm == egbHCT)
    {
        for (i = 0; i < natoms; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * fr->dvda[i];
        }
    }
    else if (gb_algorithm == egbOBC)
    {
        for (idx = 0; idx < natoms; idx++)
        {
            rbi     = born->bRad[idx];
            rb[idx] = rbi * rbi * born->drobc[idx] * fr->dvda[idx];
        }
    }
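
    /* In effect, rb[] now combines fr->dvda (the energy derivative with
     * respect to the Born radius) with the algorithm-specific derivative of
     * the radius with respect to the descreening sum, so the pair force
     * below reduces to (rb[i]*dadx_ij + rb[j]*dadx_ji)*dx.
     */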

    for (i = 0; i < 2*natoms; i++)
    {
        fx_align[i] = 0;
        fy_align[i] = 0;
        fz_align[i] = 0;
    }

    /* Mirror rb into the second half, matching the duplicated coordinates */
    for (i = 0; i < natoms; i++)
    {
        rb[i+natoms] = rb[i];
    }

    for (i = ni0; i < ni1; i += UNROLLI)
    {
        /* We assume shifts are NOT used for all-vs-all interactions */

        /* Load i atom data */
        ix_SSE0   = _mm_load1_pd(x_align+i);
        iy_SSE0   = _mm_load1_pd(y_align+i);
        iz_SSE0   = _mm_load1_pd(z_align+i);
        ix_SSE1   = _mm_load1_pd(x_align+i+1);
        iy_SSE1   = _mm_load1_pd(y_align+i+1);
        iz_SSE1   = _mm_load1_pd(z_align+i+1);

        fix_SSE0  = _mm_setzero_pd();
        fiy_SSE0  = _mm_setzero_pd();
        fiz_SSE0  = _mm_setzero_pd();
        fix_SSE1  = _mm_setzero_pd();
        fiy_SSE1  = _mm_setzero_pd();
        fiz_SSE1  = _mm_setzero_pd();

        rbai_SSE0 = _mm_load1_pd(rb+i);
        rbai_SSE1 = _mm_load1_pd(rb+i+1);

        /* Load limits for loop over neighbors */
        nj0       = jindex[4*i];
        nj3       = jindex[4*i+3];

        /* No masks necessary, since the stored chain rule derivatives will be zero in those cases! */
        for (j = nj0; j < nj3; j += UNROLLJ)
        {
            /* load j atom coordinates */
            jx_SSE      = _mm_load_pd(x_align+j);
            jy_SSE      = _mm_load_pd(y_align+j);
            jz_SSE      = _mm_load_pd(z_align+j);

            /* Calculate distance */
            dx_SSE0     = _mm_sub_pd(ix_SSE0, jx_SSE);
            dy_SSE0     = _mm_sub_pd(iy_SSE0, jy_SSE);
            dz_SSE0     = _mm_sub_pd(iz_SSE0, jz_SSE);
            dx_SSE1     = _mm_sub_pd(ix_SSE1, jx_SSE);
            dy_SSE1     = _mm_sub_pd(iy_SSE1, jy_SSE);
            dz_SSE1     = _mm_sub_pd(iz_SSE1, jz_SSE);

            rbaj_SSE    = _mm_load_pd(rb+j);

            fgb_SSE0    = _mm_mul_pd(rbai_SSE0, _mm_load_pd(dadx));
            dadx       += 2;
            fgb_SSE1    = _mm_mul_pd(rbai_SSE1, _mm_load_pd(dadx));
            dadx       += 2;

            fgb_ai_SSE0 = _mm_mul_pd(rbaj_SSE, _mm_load_pd(dadx));
            dadx       += 2;
            fgb_ai_SSE1 = _mm_mul_pd(rbaj_SSE, _mm_load_pd(dadx));
            dadx       += 2;
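
            /* These four loads must mirror the radius kernel's store order
             * exactly; exclusion masking happened at store time, so excluded
             * pairs simply contribute zero force here.
             */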

            /* Total force between ai and aj is the sum of ai->aj and aj->ai */
            fgb_SSE0 = _mm_add_pd(fgb_SSE0, fgb_ai_SSE0);
            fgb_SSE1 = _mm_add_pd(fgb_SSE1, fgb_ai_SSE1);

            /* Calculate temporary vectorial force */
            tx_SSE0  = _mm_mul_pd(fgb_SSE0, dx_SSE0);
            ty_SSE0  = _mm_mul_pd(fgb_SSE0, dy_SSE0);
            tz_SSE0  = _mm_mul_pd(fgb_SSE0, dz_SSE0);
            tx_SSE1  = _mm_mul_pd(fgb_SSE1, dx_SSE1);
            ty_SSE1  = _mm_mul_pd(fgb_SSE1, dy_SSE1);
            tz_SSE1  = _mm_mul_pd(fgb_SSE1, dz_SSE1);

            /* Increment i atom force */
            fix_SSE0 = _mm_add_pd(fix_SSE0, tx_SSE0);
            fiy_SSE0 = _mm_add_pd(fiy_SSE0, ty_SSE0);
            fiz_SSE0 = _mm_add_pd(fiz_SSE0, tz_SSE0);
            fix_SSE1 = _mm_add_pd(fix_SSE1, tx_SSE1);
            fiy_SSE1 = _mm_add_pd(fiy_SSE1, ty_SSE1);
            fiz_SSE1 = _mm_add_pd(fiz_SSE1, tz_SSE1);

            /* Decrement j atom force */
            _mm_store_pd(fx_align+j,
                         _mm_sub_pd( _mm_load_pd(fx_align+j), _mm_add_pd(tx_SSE0, tx_SSE1) ));
            _mm_store_pd(fy_align+j,
                         _mm_sub_pd( _mm_load_pd(fy_align+j), _mm_add_pd(ty_SSE0, ty_SSE1) ));
            _mm_store_pd(fz_align+j,
                         _mm_sub_pd( _mm_load_pd(fz_align+j), _mm_add_pd(tz_SSE0, tz_SSE1) ));
        }

        /* Add i forces to mem */
        GMX_MM_TRANSPOSE2_PD(fix_SSE0, fix_SSE1);
        fix_SSE0 = _mm_add_pd(fix_SSE0, fix_SSE1);
        _mm_store_pd(fx_align+i, _mm_add_pd(fix_SSE0, _mm_load_pd(fx_align+i)));

        GMX_MM_TRANSPOSE2_PD(fiy_SSE0, fiy_SSE1);
        fiy_SSE0 = _mm_add_pd(fiy_SSE0, fiy_SSE1);
        _mm_store_pd(fy_align+i, _mm_add_pd(fiy_SSE0, _mm_load_pd(fy_align+i)));

        GMX_MM_TRANSPOSE2_PD(fiz_SSE0, fiz_SSE1);
        fiz_SSE0 = _mm_add_pd(fiz_SSE0, fiz_SSE1);
        _mm_store_pd(fz_align+i, _mm_add_pd(fiz_SSE0, _mm_load_pd(fz_align+i)));
    }

    for (i = 0; i < natoms; i++)
    {
        f[3*i]   += fx_align[i] + fx_align[natoms+i];
        f[3*i+1] += fy_align[i] + fy_align[natoms+i];
        f[3*i+2] += fz_align[i] + fz_align[natoms+i];
    }

    return 0;
}
#else

/* dummy variable when not using SSE */
int genborn_allvsall_sse2_double_dummy;

#endif