/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2009, The GROMACS Development Team.
 * Copyright (c) 2012, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "types/simple.h"

#include "genborn_allvsall.h"

#if 0 && defined (GMX_X86_SSE2)

#include <gmx_sse2_double.h>
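/* Note: the leading "0 &&" in the #if condition above compiles this entire
 * SSE2 double-precision all-vs-all implementation out, even when
 * GMX_X86_SSE2 is defined; removing that term re-enables the kernels below.
 */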
    int           **prologue_mask_gb;
}
gmx_allvsallgb2_data_t;
static gmx_inline int
calc_maxoffset(int i, int natoms)
{
    int maxoffset;

    if ((natoms % 2) == 1)
    {
        /* Odd number of atoms, easy */
        maxoffset = natoms/2;
    }
    else if ((natoms % 4) == 0)
    {
        /* Multiple of four is hard */
        maxoffset = (i < natoms/2) ? (((i % 2) == 0) ? natoms/2 : natoms/2-1)
                                   : (((i % 2) == 1) ? natoms/2 : natoms/2-1);
    }
    else
    {
        /* natoms/2 is odd */
        maxoffset = ((i % 2) == 0) ? natoms/2 : natoms/2-1;
    }

    return maxoffset;
}
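/* Descriptive note: maxoffset implements the usual "half-shell" pairing for
 * all-vs-all kernels. Atom i interacts only with the maxoffset atoms that
 * follow it cyclically, so every pair (i,j) is computed exactly once; the
 * natoms/2 versus natoms/2-1 alternation keeps the pairing unique when
 * natoms is even.
 */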
setup_gb_exclusions_and_indices(gmx_allvsallgb2_data_t *aadata,
    int ni0, ni1, nj0, nj1, nj;
    int imin, imax, iexcl;
    int firstinteraction;

    /* This routine can appear to be a bit complex, but it is mostly book-keeping.
     * To enable the fast all-vs-all kernel we need to be able to stream through all
     * coordinates, whether they should interact or not.
     *
     * To avoid looping over the exclusions, we create a simple mask that is 1 if the
     * interaction should be present, otherwise 0. Since exclusions typically only occur
     * when i & j are close, we create a jindex array with four elements per i atom: the
     * starting point, the end of the prologue part (up to which we need to check
     * exclusions), the start of the epilogue part, and the end point.
     * This way we only have to allocate a short exclusion mask per i atom.
     */
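    /* Sketch of the resulting jindex_gb layout (four entries per i atom, as
     * read from the code below):
     *
     *   jindex_gb[4*i+0]  first j atom of the prologue (exclusion masks apply)
     *   jindex_gb[4*i+1]  end of prologue / start of the unmasked main loop
     *   jindex_gb[4*i+2]  end of main loop / start of the epilogue (maxoffset mask)
     *   jindex_gb[4*i+3]  end of the epilogue
     *
     * All four values are multiples of UNROLLJ, so every kernel loop can step
     * in full SIMD tiles without remainder handling.
     */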
    ni0 = (start/UNROLLI)*UNROLLI;
    ni1 = ((end+UNROLLI-1)/UNROLLI)*UNROLLI;

    /* Set the interaction mask to only enable the i atoms we want to include */
    snew(pi, 2*(natoms+UNROLLI+2*SIMD_WIDTH));
    aadata->imask = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
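    /* The pointer arithmetic above is the 16-byte alignment idiom used for
     * every SIMD array in this file: snew() makes no alignment promise, so
     * we over-allocate and round the raw pointer up to the next 16-byte
     * boundary, which the aligned _mm_load_pd/_mm_store_pd accesses require.
     */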
    for (i = 0; i < natoms+UNROLLI; i++)
        aadata->imask[2*i]   = (i >= start && i < end) ? 0xFFFFFFFF : 0;
        aadata->imask[2*i+1] = (i >= start && i < end) ? 0xFFFFFFFF : 0;

    /* Allocate memory for our modified jindex array */
    snew(aadata->jindex_gb, 4*(natoms+UNROLLI));
    for (i = 0; i < 4*(natoms+UNROLLI); i++)
        aadata->jindex_gb[i] = 0;

    /* Create the exclusion masks for the prologue part */
    snew(aadata->prologue_mask_gb, natoms+UNROLLI); /* list of pointers */

    /* First zero everything to avoid uninitialized data */
    for (i = 0; i < natoms+UNROLLI; i++)
        aadata->prologue_mask_gb[i] = NULL;

    /* Calculate the largest exclusion range we need for each UNROLLI-tuplet of i atoms. */
    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
        max_excl_offset = -1;

        /* First find maxoffset for the next 4 atoms (or fewer if we are close to end) */
        imax = ((ibase+UNROLLI) < end) ? (ibase+UNROLLI) : end;

        /* Which atom is the first we (might) interact with? */
        imin = natoms; /* Guaranteed to be overwritten below by a 'firstinteraction' value */
        for (i = ibase; i < imax; i++)
            /* Before exclusions, which atom is the first we (might) interact with? */
            firstinteraction = i+1;
            max_offset       = calc_maxoffset(i, natoms);

            for (j = 0; j < ilist[F_GB12].nr; j += 3)
                a1 = ilist[F_GB12].iatoms[j+1];
                a2 = ilist[F_GB12].iatoms[j+2];
                if (k == firstinteraction)

            for (j = 0; j < ilist[F_GB13].nr; j += 3)
                a1 = ilist[F_GB13].iatoms[j+1];
                a2 = ilist[F_GB13].iatoms[j+2];
                if (k == firstinteraction)

            for (j = 0; j < ilist[F_GB14].nr; j += 3)
                a1 = ilist[F_GB14].iatoms[j+1];
                a2 = ilist[F_GB14].iatoms[j+2];
                if (k == firstinteraction)

            imin = (firstinteraction < imin) ? firstinteraction : imin;

        /* round down to j unrolling factor */
        imin = (imin/UNROLLJ)*UNROLLJ;

        for (i = ibase; i < imax; i++)
            max_offset = calc_maxoffset(i, natoms);

            for (j = 0; j < ilist[F_GB12].nr; j += 3)
                a1 = ilist[F_GB12].iatoms[j+1];
                a2 = ilist[F_GB12].iatoms[j+2];
                if (k > i+max_offset)
                if (k+natoms <= max_offset)
                max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;

            for (j = 0; j < ilist[F_GB13].nr; j += 3)
                a1 = ilist[F_GB13].iatoms[j+1];
                a2 = ilist[F_GB13].iatoms[j+2];
                if (k > i+max_offset)
                if (k+natoms <= max_offset)
                max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;

            for (j = 0; j < ilist[F_GB14].nr; j += 3)
                a1 = ilist[F_GB14].iatoms[j+1];
                a2 = ilist[F_GB14].iatoms[j+2];
                if (k > i+max_offset)
                if (k+natoms <= max_offset)
                max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;

        /* The offset specifies the last atom to be excluded, so add one unit to get an upper loop limit */
        max_excl_offset++;
        /* round up to j unrolling factor */
        max_excl_offset = (max_excl_offset/UNROLLJ+1)*UNROLLJ;
        /* Set all the prologue mask lengths to this value (even for i > end) */
        for (i = ibase; i < ibase+UNROLLI; i++)
            aadata->jindex_gb[4*i]   = imin;
            aadata->jindex_gb[4*i+1] = imin+max_excl_offset;
    /* Now the hard part: loop over it all again to calculate the actual contents of the prologue masks */
    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
        for (i = ibase; i < ibase+UNROLLI; i++)
            nj   = aadata->jindex_gb[4*i+1] - aadata->jindex_gb[4*i];
            imin = aadata->jindex_gb[4*i];

            /* Allocate aligned memory */
            snew(pi, 2*(nj+2*SIMD_WIDTH));
            aadata->prologue_mask_gb[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));

            max_offset = calc_maxoffset(i, natoms);

            /* Include interactions i+1 <= j < i+maxoffset */
            for (k = 0; k < nj; k++)
                j = imin + k;
                if ( (j > i) && (j <= i+max_offset) )
                    aadata->prologue_mask_gb[i][2*k]   = 0xFFFFFFFF;
                    aadata->prologue_mask_gb[i][2*k+1] = 0xFFFFFFFF;
                else
                    aadata->prologue_mask_gb[i][2*k]   = 0;
                    aadata->prologue_mask_gb[i][2*k+1] = 0;

            /* Clear out the explicit exclusions */
            for (j = 0; j < ilist[F_GB12].nr; j += 3)
                a1 = ilist[F_GB12].iatoms[j+1];
                a2 = ilist[F_GB12].iatoms[j+2];
                if (k > i+max_offset)
                if (k+natoms <= max_offset)
                    aadata->prologue_mask_gb[i][2*k]   = 0;
                    aadata->prologue_mask_gb[i][2*k+1] = 0;

            for (j = 0; j < ilist[F_GB13].nr; j += 3)
                a1 = ilist[F_GB13].iatoms[j+1];
                a2 = ilist[F_GB13].iatoms[j+2];
                if (k > i+max_offset)
                if (k+natoms <= max_offset)
                    aadata->prologue_mask_gb[i][2*k]   = 0;
                    aadata->prologue_mask_gb[i][2*k+1] = 0;

            for (j = 0; j < ilist[F_GB14].nr; j += 3)
                a1 = ilist[F_GB14].iatoms[j+1];
                a2 = ilist[F_GB14].iatoms[j+2];
                if (k > i+max_offset)
                if (k+natoms <= max_offset)
                    aadata->prologue_mask_gb[i][2*k]   = 0;
                    aadata->prologue_mask_gb[i][2*k+1] = 0;

    /* Construct the epilogue mask - this just contains the check for maxoffset */
    snew(aadata->epilogue_mask, natoms+UNROLLI);

    /* First zero everything to avoid uninitialized data */
    for (i = 0; i < natoms+UNROLLI; i++)
        aadata->jindex_gb[4*i+2] = aadata->jindex_gb[4*i+1];
        aadata->jindex_gb[4*i+3] = aadata->jindex_gb[4*i+1];
        aadata->epilogue_mask[i] = NULL;

    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
        /* Find the lowest index for which we need to use the epilogue */
        imin       = ibase;
        max_offset = calc_maxoffset(imin, natoms);
        imin       = imin + 1 + max_offset;

        /* Find largest index for which we need to use the epilogue */
        imax = ibase + UNROLLI-1;
        imax = (imax < end) ? imax : end;

        max_offset = calc_maxoffset(imax, natoms);
        imax       = imax + 1 + max_offset + UNROLLJ - 1;

        for (i = ibase; i < ibase+UNROLLI; i++)
            /* Start of epilogue - round down to j tile limit */
            aadata->jindex_gb[4*i+2] = (imin/UNROLLJ)*UNROLLJ;
            /* Make sure we don't overlap - for small systems everything is done in the prologue */
            aadata->jindex_gb[4*i+2] = (aadata->jindex_gb[4*i+1] > aadata->jindex_gb[4*i+2]) ? aadata->jindex_gb[4*i+1] : aadata->jindex_gb[4*i+2];
            /* Round upwards to j tile limit */
            aadata->jindex_gb[4*i+3] = (imax/UNROLLJ)*UNROLLJ;
            /* Make sure we don't have a negative range for the epilogue */
            aadata->jindex_gb[4*i+3] = (aadata->jindex_gb[4*i+2] > aadata->jindex_gb[4*i+3]) ? aadata->jindex_gb[4*i+2] : aadata->jindex_gb[4*i+3];
    /* And fill it with data... */
    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
        for (i = ibase; i < ibase+UNROLLI; i++)
            nj = aadata->jindex_gb[4*i+3] - aadata->jindex_gb[4*i+2];

            /* Allocate aligned memory */
            snew(pi, 2*(nj+2*SIMD_WIDTH));
            aadata->epilogue_mask[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));

            max_offset = calc_maxoffset(i, natoms);

            for (k = 0; k < nj; k++)
                j = aadata->jindex_gb[4*i+2] + k;
                aadata->epilogue_mask[i][2*k]   = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
                aadata->epilogue_mask[i][2*k+1] = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
genborn_allvsall_setup(gmx_allvsallgb2_data_t **p_aadata,
                       gmx_localtop_t          *top,
                       gmx_genborn_t           *born,
                       double                   radius_offset,

    gmx_allvsallgb2_data_t *aadata;

    natoms = mdatoms->nr;

    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->x_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->y_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->z_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->fx_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->fy_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->fz_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->gb_radius = (double *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->workparam = (double *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->work = (double *) (((size_t) p + 16) & (~((size_t) 15)));
    for (i = 0; i < mdatoms->nr; i++)
        aadata->gb_radius[i] = top->atomtypes.gb_radius[mdatoms->typeA[i]] - radius_offset;
        if (gb_algorithm == egbSTILL)
            aadata->workparam[i] = born->vsolv[i];
        else if (gb_algorithm == egbOBC)
            aadata->workparam[i] = born->param[i];
        aadata->work[i] = 0.0;

    for (i = 0; i < mdatoms->nr; i++)
        aadata->gb_radius[natoms+i] = aadata->gb_radius[i];
        aadata->workparam[natoms+i] = aadata->workparam[i];
        aadata->work[natoms+i]      = aadata->work[i];
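    /* Descriptive note: the [natoms+i] copies above let the kernels address
     * j atoms past the end of the real atom range without wrapping. The
     * cyclic half-shell range i+1 .. i+maxoffset can reach roughly
     * natoms + natoms/2, and the duplicated tail makes those loads valid.
     */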
    for (i = 0; i < 2*natoms+SIMD_WIDTH; i++)
        aadata->x_align[i]  = 0.0;
        aadata->y_align[i]  = 0.0;
        aadata->z_align[i]  = 0.0;
        aadata->fx_align[i] = 0.0;
        aadata->fy_align[i] = 0.0;
        aadata->fz_align[i] = 0.0;

    setup_gb_exclusions_and_indices(aadata, top->idef.il, mdatoms->start, mdatoms->start+mdatoms->homenr, mdatoms->nr,
                                    bInclude12, bInclude13, bInclude14);
/*
 * This routine apparently hits a compiler bug Visual Studio has had 'forever'.
 * It is present in both VS2005 and VS2008, and the only way around it is to
 * decrease optimization. We do that with a pragma, and only for MSVC, so it
 * will not hurt any of the well-behaving and supported compilers out there.
 * MS: Fix your compiler, it sucks like a black hole!
 */
#ifdef _MSC_VER
#pragma optimize("t", off)
#endif
genborn_allvsall_calc_still_radii_sse2_double(t_forcerec     *fr,
                                              gmx_genborn_t  *born,
                                              gmx_localtop_t *top,

    gmx_allvsallgb2_data_t *aadata;
    int                     nj0, nj1, nj2, nj3;
    double                  gpi, rai, vai;
    double                  irsq, idr4, idr6;
    double                  raj, rvdw, ratio;
    double                  vaj, ccf, dccf, theta, cosq;
    double                  term, prod, icf4, icf6, gpi2, factor, sinq;

    __m128d                 ix_SSE0, iy_SSE0, iz_SSE0;
    __m128d                 ix_SSE1, iy_SSE1, iz_SSE1;
    __m128d                 gpi_SSE0, rai_SSE0, prod_ai_SSE0;
    __m128d                 gpi_SSE1, rai_SSE1, prod_ai_SSE1;
    __m128d                 imask_SSE0, jmask_SSE0;
    __m128d                 imask_SSE1, jmask_SSE1;
    __m128d                 jx_SSE, jy_SSE, jz_SSE;
    __m128d                 dx_SSE0, dy_SSE0, dz_SSE0;
    __m128d                 dx_SSE1, dy_SSE1, dz_SSE1;
    __m128d                 rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0;
    __m128d                 rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1;
    __m128d                 raj_SSE, vaj_SSE, prod_SSE;
    __m128d                 rvdw_SSE0, ratio_SSE0;
    __m128d                 rvdw_SSE1, ratio_SSE1;
    __m128d                 theta_SSE0, sinq_SSE0, cosq_SSE0, term_SSE0;
    __m128d                 theta_SSE1, sinq_SSE1, cosq_SSE1, term_SSE1;
    __m128d                 ccf_SSE0, dccf_SSE0;
    __m128d                 ccf_SSE1, dccf_SSE1;
    __m128d                 icf4_SSE0, icf6_SSE0;
    __m128d                 icf4_SSE1, icf6_SSE1;
    __m128d                 half_SSE, one_SSE, two_SSE, four_SSE;
    __m128d                 still_p4_SSE, still_p5inv_SSE, still_pip5_SSE;

    natoms = mdatoms->nr;
    ni0    = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
    ni1    = mdatoms->start+mdatoms->homenr;

    aadata = *((gmx_allvsallgb2_data_t **)paadata);

    if (aadata == NULL)
        genborn_allvsall_setup(&aadata, top, born, mdatoms, 0.0,
                               egbSTILL, FALSE, FALSE, TRUE);
        *((gmx_allvsallgb2_data_t **)paadata) = aadata;

    x_align = aadata->x_align;
    y_align = aadata->y_align;
    z_align = aadata->z_align;

    gb_radius = aadata->gb_radius;
    vsolv     = aadata->workparam;

    jindex = aadata->jindex_gb;

    still_p4_SSE    = _mm_set1_pd(STILL_P4);
    still_p5inv_SSE = _mm_set1_pd(STILL_P5INV);
    still_pip5_SSE  = _mm_set1_pd(STILL_PIP5);
    half_SSE        = _mm_set1_pd(0.5);
    one_SSE         = _mm_set1_pd(1.0);
    two_SSE         = _mm_set1_pd(2.0);
    four_SSE        = _mm_set1_pd(4.0);
    /* This will be summed, so it has to extend to natoms + buffer */
    for (i = 0; i < natoms+1+natoms/2; i++)
        work[i] = 0;

    for (i = ni0; i < ni1+1+natoms/2; i++)
        k          = i % natoms;
        x_align[i] = x[3*k];
        y_align[i] = x[3*k+1];
        z_align[i] = x[3*k+2];
    for (i = ni0; i < ni1; i += UNROLLI)
        /* We assume shifts are NOT used for all-vs-all interactions */

        /* Load i atom data */
        ix_SSE0 = _mm_load1_pd(x_align+i);
        iy_SSE0 = _mm_load1_pd(y_align+i);
        iz_SSE0 = _mm_load1_pd(z_align+i);
        ix_SSE1 = _mm_load1_pd(x_align+i+1);
        iy_SSE1 = _mm_load1_pd(y_align+i+1);
        iz_SSE1 = _mm_load1_pd(z_align+i+1);

        gpi_SSE0 = _mm_setzero_pd();
        gpi_SSE1 = _mm_setzero_pd();

        rai_SSE0 = _mm_load1_pd(gb_radius+i);
        rai_SSE1 = _mm_load1_pd(gb_radius+i+1);

        prod_ai_SSE0 = _mm_set1_pd(STILL_P4*vsolv[i]);
        prod_ai_SSE1 = _mm_set1_pd(STILL_P4*vsolv[i+1]);

        /* Load limits for loop over neighbors */
        nj0 = jindex[4*i];
        nj1 = jindex[4*i+1];
        nj2 = jindex[4*i+2];
        nj3 = jindex[4*i+3];

        pmask0 = aadata->prologue_mask_gb[i];
        pmask1 = aadata->prologue_mask_gb[i+1];
        emask0 = aadata->epilogue_mask[i];
        emask1 = aadata->epilogue_mask[i+1];

        imask_SSE0 = _mm_load1_pd((double *)(aadata->imask+2*i));
        imask_SSE1 = _mm_load1_pd((double *)(aadata->imask+2*i+2));
        /* Prologue part, including exclusion mask */
        for (j = nj0; j < nj1; j += UNROLLJ)
            jmask_SSE0 = _mm_load_pd((double *)pmask0);
            jmask_SSE1 = _mm_load_pd((double *)pmask1);
            pmask0    += 2*UNROLLJ;
            pmask1    += 2*UNROLLJ;

            /* load j atom coordinates */
            jx_SSE = _mm_load_pd(x_align+j);
            jy_SSE = _mm_load_pd(y_align+j);
            jz_SSE = _mm_load_pd(z_align+j);

            /* Calculate distance */
            dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE);
            dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE);
            dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE);
            dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE);
            dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE);
            dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);

            jmask_SSE0 = _mm_and_pd(jmask_SSE0, imask_SSE0);
            jmask_SSE1 = _mm_and_pd(jmask_SSE1, imask_SSE1);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
            rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);

            rinv_SSE0 = _mm_and_pd(rinv_SSE0, jmask_SSE0);
            rinv_SSE1 = _mm_and_pd(rinv_SSE1, jmask_SSE1);

            irsq_SSE0 = _mm_mul_pd(rinv_SSE0, rinv_SSE0);
            irsq_SSE1 = _mm_mul_pd(rinv_SSE1, rinv_SSE1);
            idr4_SSE0 = _mm_mul_pd(irsq_SSE0, irsq_SSE0);
            idr4_SSE1 = _mm_mul_pd(irsq_SSE1, irsq_SSE1);
            idr6_SSE0 = _mm_mul_pd(idr4_SSE0, irsq_SSE0);
            idr6_SSE1 = _mm_mul_pd(idr4_SSE1, irsq_SSE1);

            raj_SSE = _mm_load_pd(gb_radius+j);
            vaj_SSE = _mm_load_pd(vsolv+j);

            rvdw_SSE0 = _mm_add_pd(rai_SSE0, raj_SSE);
            rvdw_SSE1 = _mm_add_pd(rai_SSE1, raj_SSE);

            ratio_SSE0 = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0, rvdw_SSE0)));
            ratio_SSE1 = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1, rvdw_SSE1)));

            ratio_SSE0 = _mm_min_pd(ratio_SSE0, still_p5inv_SSE);
            ratio_SSE1 = _mm_min_pd(ratio_SSE1, still_p5inv_SSE);
            theta_SSE0 = _mm_mul_pd(ratio_SSE0, still_pip5_SSE);
            theta_SSE1 = _mm_mul_pd(ratio_SSE1, still_pip5_SSE);
            gmx_mm_sincos_pd(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
            gmx_mm_sincos_pd(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
            term_SSE0 = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE0));
            term_SSE1 = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE1));
            ccf_SSE0  = _mm_mul_pd(term_SSE0, term_SSE0);
            ccf_SSE1  = _mm_mul_pd(term_SSE1, term_SSE1);
            dccf_SSE0 = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE0),
                                   _mm_mul_pd(sinq_SSE0, theta_SSE0));
            dccf_SSE1 = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE1),
                                   _mm_mul_pd(sinq_SSE1, theta_SSE1));
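            /* Scalar reference for the close-contact function (CCF) block
             * above (reader's note):
             *
             *   theta = ratio*STILL_PIP5;
             *   term  = 0.5*(1.0 - cos(theta));
             *   ccf   = term*term;
             *   dccf  = 2.0*term*sin(theta)*theta;
             *
             * The earlier min() clamp of ratio to STILL_P5INV drives theta to
             * its maximum, where ccf = 1 and dccf = 0, reproducing the cutoff
             * branch of the serial Still code without branching.
             */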
            prod_SSE  = _mm_mul_pd(still_p4_SSE, vaj_SSE);
            icf4_SSE0 = _mm_mul_pd(ccf_SSE0, idr4_SSE0);
            icf4_SSE1 = _mm_mul_pd(ccf_SSE1, idr4_SSE1);
            icf6_SSE0 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
            icf6_SSE1 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);

            _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
                                            _mm_add_pd(_mm_mul_pd(prod_ai_SSE0, icf4_SSE0),
                                                       _mm_mul_pd(prod_ai_SSE1, icf4_SSE1))));

            gpi_SSE0 = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE, icf4_SSE0));
            gpi_SSE1 = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE, icf4_SSE1));

            /* Save ai->aj and aj->ai chain rule terms */
            _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE1));
            dadx += 2;

            _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE0, icf6_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE1, icf6_SSE1));
            dadx += 2;
        /* Main part, no exclusions */
        for (j = nj1; j < nj2; j += UNROLLJ)
            /* load j atom coordinates */
            jx_SSE = _mm_load_pd(x_align+j);
            jy_SSE = _mm_load_pd(y_align+j);
            jz_SSE = _mm_load_pd(z_align+j);

            /* Calculate distance */
            dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE);
            dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE);
            dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE);
            dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE);
            dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE);
            dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
            rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);

            rinv_SSE0 = _mm_and_pd(rinv_SSE0, imask_SSE0);
            rinv_SSE1 = _mm_and_pd(rinv_SSE1, imask_SSE1);

            irsq_SSE0 = _mm_mul_pd(rinv_SSE0, rinv_SSE0);
            irsq_SSE1 = _mm_mul_pd(rinv_SSE1, rinv_SSE1);
            idr4_SSE0 = _mm_mul_pd(irsq_SSE0, irsq_SSE0);
            idr4_SSE1 = _mm_mul_pd(irsq_SSE1, irsq_SSE1);
            idr6_SSE0 = _mm_mul_pd(idr4_SSE0, irsq_SSE0);
            idr6_SSE1 = _mm_mul_pd(idr4_SSE1, irsq_SSE1);

            raj_SSE = _mm_load_pd(gb_radius+j);

            rvdw_SSE0 = _mm_add_pd(rai_SSE0, raj_SSE);
            rvdw_SSE1 = _mm_add_pd(rai_SSE1, raj_SSE);
            vaj_SSE   = _mm_load_pd(vsolv+j);

            ratio_SSE0 = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0, rvdw_SSE0)));
            ratio_SSE1 = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1, rvdw_SSE1)));

            ratio_SSE0 = _mm_min_pd(ratio_SSE0, still_p5inv_SSE);
            ratio_SSE1 = _mm_min_pd(ratio_SSE1, still_p5inv_SSE);
            theta_SSE0 = _mm_mul_pd(ratio_SSE0, still_pip5_SSE);
            theta_SSE1 = _mm_mul_pd(ratio_SSE1, still_pip5_SSE);
            gmx_mm_sincos_pd(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
            gmx_mm_sincos_pd(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
            term_SSE0 = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE0));
            term_SSE1 = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE1));
            ccf_SSE0  = _mm_mul_pd(term_SSE0, term_SSE0);
            ccf_SSE1  = _mm_mul_pd(term_SSE1, term_SSE1);
            dccf_SSE0 = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE0),
                                   _mm_mul_pd(sinq_SSE0, theta_SSE0));
            dccf_SSE1 = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE1),
                                   _mm_mul_pd(sinq_SSE1, theta_SSE1));

            prod_SSE  = _mm_mul_pd(still_p4_SSE, vaj_SSE);
            icf4_SSE0 = _mm_mul_pd(ccf_SSE0, idr4_SSE0);
            icf4_SSE1 = _mm_mul_pd(ccf_SSE1, idr4_SSE1);
            icf6_SSE0 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
            icf6_SSE1 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);

            _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
                                            _mm_add_pd(_mm_mul_pd(prod_ai_SSE0, icf4_SSE0),
                                                       _mm_mul_pd(prod_ai_SSE1, icf4_SSE1))));

            gpi_SSE0 = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE, icf4_SSE0));
            gpi_SSE1 = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE, icf4_SSE1));

            /* Save ai->aj and aj->ai chain rule terms */
            _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE1));
            dadx += 2;

            _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE0, icf6_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE1, icf6_SSE1));
            dadx += 2;
        /* Epilogue part, including exclusion mask */
        for (j = nj2; j < nj3; j += UNROLLJ)
            jmask_SSE0 = _mm_load_pd((double *)emask0);
            jmask_SSE1 = _mm_load_pd((double *)emask1);
            emask0    += 2*UNROLLJ;
            emask1    += 2*UNROLLJ;

            /* load j atom coordinates */
            jx_SSE = _mm_load_pd(x_align+j);
            jy_SSE = _mm_load_pd(y_align+j);
            jz_SSE = _mm_load_pd(z_align+j);

            /* Calculate distance */
            dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE);
            dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE);
            dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE);
            dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE);
            dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE);
            dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);

            jmask_SSE0 = _mm_and_pd(jmask_SSE0, imask_SSE0);
            jmask_SSE1 = _mm_and_pd(jmask_SSE1, imask_SSE1);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
            rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);

            rinv_SSE0 = _mm_and_pd(rinv_SSE0, jmask_SSE0);
            rinv_SSE1 = _mm_and_pd(rinv_SSE1, jmask_SSE1);

            irsq_SSE0 = _mm_mul_pd(rinv_SSE0, rinv_SSE0);
            irsq_SSE1 = _mm_mul_pd(rinv_SSE1, rinv_SSE1);
            idr4_SSE0 = _mm_mul_pd(irsq_SSE0, irsq_SSE0);
            idr4_SSE1 = _mm_mul_pd(irsq_SSE1, irsq_SSE1);
            idr6_SSE0 = _mm_mul_pd(idr4_SSE0, irsq_SSE0);
            idr6_SSE1 = _mm_mul_pd(idr4_SSE1, irsq_SSE1);

            raj_SSE = _mm_load_pd(gb_radius+j);
            vaj_SSE = _mm_load_pd(vsolv+j);

            rvdw_SSE0 = _mm_add_pd(rai_SSE0, raj_SSE);
            rvdw_SSE1 = _mm_add_pd(rai_SSE1, raj_SSE);

            ratio_SSE0 = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0, rvdw_SSE0)));
            ratio_SSE1 = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1, rvdw_SSE1)));

            ratio_SSE0 = _mm_min_pd(ratio_SSE0, still_p5inv_SSE);
            ratio_SSE1 = _mm_min_pd(ratio_SSE1, still_p5inv_SSE);
            theta_SSE0 = _mm_mul_pd(ratio_SSE0, still_pip5_SSE);
            theta_SSE1 = _mm_mul_pd(ratio_SSE1, still_pip5_SSE);
            gmx_mm_sincos_pd(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
            gmx_mm_sincos_pd(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
            term_SSE0 = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE0));
            term_SSE1 = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE1));
            ccf_SSE0  = _mm_mul_pd(term_SSE0, term_SSE0);
            ccf_SSE1  = _mm_mul_pd(term_SSE1, term_SSE1);
            dccf_SSE0 = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE0),
                                   _mm_mul_pd(sinq_SSE0, theta_SSE0));
            dccf_SSE1 = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE1),
                                   _mm_mul_pd(sinq_SSE1, theta_SSE1));

            prod_SSE  = _mm_mul_pd(still_p4_SSE, vaj_SSE);
            icf4_SSE0 = _mm_mul_pd(ccf_SSE0, idr4_SSE0);
            icf4_SSE1 = _mm_mul_pd(ccf_SSE1, idr4_SSE1);
            icf6_SSE0 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
            icf6_SSE1 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);

            _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
                                            _mm_add_pd(_mm_mul_pd(prod_ai_SSE0, icf4_SSE0),
                                                       _mm_mul_pd(prod_ai_SSE1, icf4_SSE1))));

            gpi_SSE0 = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE, icf4_SSE0));
            gpi_SSE1 = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE, icf4_SSE1));

            /* Save ai->aj and aj->ai chain rule terms */
            _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE1));
            dadx += 2;

            _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE0, icf6_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE1, icf6_SSE1));
            dadx += 2;
        GMX_MM_TRANSPOSE2_PD(gpi_SSE0, gpi_SSE1);
        gpi_SSE0 = _mm_add_pd(gpi_SSE0, gpi_SSE1);
        _mm_store_pd(work+i, _mm_add_pd(gpi_SSE0, _mm_load_pd(work+i)));
    /* In case we have written anything beyond natoms, move it back.
     * Never mind that we leave stuff above natoms; that will not
     * be accessed later in the routine.
     * In principle this should be a move rather than a sum, but this
     * way we don't have to worry about even/odd offsets...
     */
    for (i = natoms; i < ni1+1+natoms/2; i++)
        work[i-natoms] += work[i];
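    /* Worked example of the fold-back above (reader's note): with natoms = 5,
     * the kernels may have accumulated into work[5]..work[7], the wrapped
     * images of atoms 0..2; the loop adds those tail entries back onto the
     * real atoms.
     */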
    /* Parallel summations */
    if (PARTDECOMP(cr))
        gmx_sum(natoms, work, cr);
    factor = 0.5 * ONE_4PI_EPS0;
    /* Calculate the radii - should we do all atoms, or just our local ones? */
    for (i = 0; i < natoms; i++)
        if (born->use[i] != 0)
            gpi             = born->gpol[i]+work[i];
            gpi2            = gpi*gpi;
            born->bRad[i]   = factor*gmx_invsqrt(gpi2);
            fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
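    /* Reader's note on the finalization above: bRad = factor/|gpi| with
     * factor = 0.5*ONE_4PI_EPS0, i.e. the accumulated Still polarization sum
     * is converted directly into an inverse Born radius, and fr->invsqrta
     * caches 1/sqrt(bRad) for the generalized-Born interaction kernels.
     */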
/* Reinstate MSVC optimization */
#ifdef _MSC_VER
#pragma optimize("", on)
#endif
genborn_allvsall_calc_hct_obc_radii_sse2_double(t_forcerec     *fr,
                                                t_mdatoms      *mdatoms,
                                                gmx_genborn_t  *born,
                                                gmx_localtop_t *top,

    gmx_allvsallgb2_data_t *aadata;
    int                     nj0, nj1, nj2, nj3;
    double                  rad, min_rad;
    double                  rai, rai_inv, rai_inv2, sum_ai, sum_ai2, sum_ai3, tsum, tchain;

    __m128d                 ix_SSE0, iy_SSE0, iz_SSE0;
    __m128d                 ix_SSE1, iy_SSE1, iz_SSE1;
    __m128d                 gpi_SSE0, rai_SSE0, prod_ai_SSE0;
    __m128d                 gpi_SSE1, rai_SSE1, prod_ai_SSE1;
    __m128d                 imask_SSE0, jmask_SSE0;
    __m128d                 imask_SSE1, jmask_SSE1;
    __m128d                 jx_SSE, jy_SSE, jz_SSE;
    __m128d                 dx_SSE0, dy_SSE0, dz_SSE0;
    __m128d                 dx_SSE1, dy_SSE1, dz_SSE1;
    __m128d                 rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0;
    __m128d                 rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1;
    __m128d                 raj_SSE, raj_inv_SSE, sk_aj_SSE, sk2_aj_SSE;
    __m128d                 ccf_SSE0, dccf_SSE0, prod_SSE0;
    __m128d                 ccf_SSE1, dccf_SSE1, prod_SSE1;
    __m128d                 icf4_SSE0, icf6_SSE0;
    __m128d                 icf4_SSE1, icf6_SSE1;
    __m128d                 oneeighth_SSE, onefourth_SSE, half_SSE, one_SSE, two_SSE, four_SSE;
    __m128d                 still_p4_SSE, still_p5inv_SSE, still_pip5_SSE;
    __m128d                 rai_inv_SSE0;
    __m128d                 rai_inv_SSE1;
    __m128d                 sk_ai_SSE0, sk2_ai_SSE0, sum_ai_SSE0;
    __m128d                 sk_ai_SSE1, sk2_ai_SSE1, sum_ai_SSE1;
    __m128d                 lij_inv_SSE0, sk2_rinv_SSE0;
    __m128d                 lij_inv_SSE1, sk2_rinv_SSE1;
    __m128d                 t1_SSE0, t2_SSE0, t3_SSE0, t4_SSE0;
    __m128d                 t1_SSE1, t2_SSE1, t3_SSE1, t4_SSE1;
    __m128d                 obc_mask1_SSE0, obc_mask2_SSE0, obc_mask3_SSE0;
    __m128d                 obc_mask1_SSE1, obc_mask2_SSE1, obc_mask3_SSE1;
    __m128d                 uij_SSE0, uij2_SSE0, uij3_SSE0;
    __m128d                 uij_SSE1, uij2_SSE1, uij3_SSE1;
    __m128d                 lij_SSE0, lij2_SSE0, lij3_SSE0;
    __m128d                 lij_SSE1, lij2_SSE1, lij3_SSE1;
    __m128d                 dlij_SSE0, diff2_SSE0, logterm_SSE0;
    __m128d                 dlij_SSE1, diff2_SSE1, logterm_SSE1;
    __m128d                 doffset_SSE, tmpSSE;

    natoms = mdatoms->nr;
    ni0    = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
    ni1    = mdatoms->start+mdatoms->homenr;

    aadata = *((gmx_allvsallgb2_data_t **)paadata);

    if (aadata == NULL)
        genborn_allvsall_setup(&aadata, top, born, mdatoms, born->gb_doffset,
                               egbOBC, TRUE, TRUE, TRUE);
        *((gmx_allvsallgb2_data_t **)paadata) = aadata;

    x_align = aadata->x_align;
    y_align = aadata->y_align;
    z_align = aadata->z_align;

    gb_radius = aadata->gb_radius;
    work      = aadata->work;
    jindex    = aadata->jindex_gb;

    obc_param = aadata->workparam;

    oneeighth_SSE = _mm_set1_pd(0.125);
    onefourth_SSE = _mm_set1_pd(0.25);
    half_SSE      = _mm_set1_pd(0.5);
    one_SSE       = _mm_set1_pd(1.0);
    two_SSE       = _mm_set1_pd(2.0);
    four_SSE      = _mm_set1_pd(4.0);
    doffset_SSE   = _mm_set1_pd(born->gb_doffset);
    for (i = 0; i < natoms; i++)
        x_align[i] = x[3*i];
        y_align[i] = x[3*i+1];
        z_align[i] = x[3*i+2];

    for (i = 0; i < natoms/2+1; i++)
        x_align[natoms+i] = x_align[i];
        y_align[natoms+i] = y_align[i];
        z_align[natoms+i] = z_align[i];

    for (i = 0; i < natoms+natoms/2+1; i++)
        work[i] = 0;
    for (i = ni0; i < ni1; i += UNROLLI)
        /* We assume shifts are NOT used for all-vs-all interactions */

        /* Load i atom data */
        ix_SSE0 = _mm_load1_pd(x_align+i);
        iy_SSE0 = _mm_load1_pd(y_align+i);
        iz_SSE0 = _mm_load1_pd(z_align+i);
        ix_SSE1 = _mm_load1_pd(x_align+i+1);
        iy_SSE1 = _mm_load1_pd(y_align+i+1);
        iz_SSE1 = _mm_load1_pd(z_align+i+1);

        rai_SSE0     = _mm_load1_pd(gb_radius+i);
        rai_SSE1     = _mm_load1_pd(gb_radius+i+1);
        rai_inv_SSE0 = gmx_mm_inv_pd(rai_SSE0);
        rai_inv_SSE1 = gmx_mm_inv_pd(rai_SSE1);

        sk_ai_SSE0  = _mm_load1_pd(obc_param+i);
        sk_ai_SSE1  = _mm_load1_pd(obc_param+i+1);
        sk2_ai_SSE0 = _mm_mul_pd(sk_ai_SSE0, sk_ai_SSE0);
        sk2_ai_SSE1 = _mm_mul_pd(sk_ai_SSE1, sk_ai_SSE1);

        sum_ai_SSE0 = _mm_setzero_pd();
        sum_ai_SSE1 = _mm_setzero_pd();

        /* Load limits for loop over neighbors */
        nj0 = jindex[4*i];
        nj1 = jindex[4*i+1];
        nj2 = jindex[4*i+2];
        nj3 = jindex[4*i+3];

        pmask0 = aadata->prologue_mask_gb[i];
        pmask1 = aadata->prologue_mask_gb[i+1];
        emask0 = aadata->epilogue_mask[i];
        emask1 = aadata->epilogue_mask[i+1];

        imask_SSE0 = _mm_load1_pd((double *)(aadata->imask+2*i));
        imask_SSE1 = _mm_load1_pd((double *)(aadata->imask+2*i+2));
        /* Prologue part, including exclusion mask */
        for (j = nj0; j < nj1; j += UNROLLJ)
            jmask_SSE0 = _mm_load_pd((double *)pmask0);
            jmask_SSE1 = _mm_load_pd((double *)pmask1);
            pmask0    += 2*UNROLLJ;
            pmask1    += 2*UNROLLJ;

            /* load j atom coordinates */
            jx_SSE = _mm_load_pd(x_align+j);
            jy_SSE = _mm_load_pd(y_align+j);
            jz_SSE = _mm_load_pd(z_align+j);

            /* Calculate distance */
            dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE);
            dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE);
            dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE);
            dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE);
            dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE);
            dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);

            jmask_SSE0 = _mm_and_pd(jmask_SSE0, imask_SSE0);
            jmask_SSE1 = _mm_and_pd(jmask_SSE1, imask_SSE1);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
            rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);

            rinv_SSE0 = _mm_and_pd(rinv_SSE0, jmask_SSE0);
            rinv_SSE1 = _mm_and_pd(rinv_SSE1, jmask_SSE1);

            dr_SSE0 = _mm_mul_pd(rsq_SSE0, rinv_SSE0);
            dr_SSE1 = _mm_mul_pd(rsq_SSE1, rinv_SSE1);

            sk_aj_SSE   = _mm_load_pd(obc_param+j);
            raj_SSE     = _mm_load_pd(gb_radius+j);
            raj_inv_SSE = gmx_mm_inv_pd(raj_SSE);

            /* Evaluate influence of atom aj -> ai */
            t1_SSE0 = _mm_add_pd(dr_SSE0, sk_aj_SSE);
            t1_SSE1 = _mm_add_pd(dr_SSE1, sk_aj_SSE);
            t2_SSE0 = _mm_sub_pd(dr_SSE0, sk_aj_SSE);
            t2_SSE1 = _mm_sub_pd(dr_SSE1, sk_aj_SSE);
            t3_SSE0 = _mm_sub_pd(sk_aj_SSE, dr_SSE0);
            t3_SSE1 = _mm_sub_pd(sk_aj_SSE, dr_SSE1);

            obc_mask1_SSE0 = _mm_cmplt_pd(rai_SSE0, t1_SSE0);
            obc_mask1_SSE1 = _mm_cmplt_pd(rai_SSE1, t1_SSE1);
            obc_mask2_SSE0 = _mm_cmplt_pd(rai_SSE0, t2_SSE0);
            obc_mask2_SSE1 = _mm_cmplt_pd(rai_SSE1, t2_SSE1);
            obc_mask3_SSE0 = _mm_cmplt_pd(rai_SSE0, t3_SSE0);
            obc_mask3_SSE1 = _mm_cmplt_pd(rai_SSE1, t3_SSE1);
            obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0, jmask_SSE0);
            obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1, jmask_SSE1);

            uij_SSE0  = gmx_mm_inv_pd(t1_SSE0);
            uij_SSE1  = gmx_mm_inv_pd(t1_SSE1);
            lij_SSE0  = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
                                   _mm_andnot_pd(obc_mask2_SSE0, rai_inv_SSE0));
            lij_SSE1  = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
                                   _mm_andnot_pd(obc_mask2_SSE1, rai_inv_SSE1));
            dlij_SSE0 = _mm_and_pd(one_SSE, obc_mask2_SSE0);
            dlij_SSE1 = _mm_and_pd(one_SSE, obc_mask2_SSE1);
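            /* The and/andnot pair above is a branchless select on the
             * comparison mask, equivalent to the scalar conditional
             *   lij = (rai < dr - sk_aj) ? 1.0/(dr - sk_aj) : 1.0/rai;
             * dlij likewise becomes 1.0 where the mask is set and 0.0
             * elsewhere, replacing the branch in the serial HCT/OBC code.
             */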
            uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
            uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
            uij3_SSE0 = _mm_mul_pd(uij2_SSE0, uij_SSE0);
            uij3_SSE1 = _mm_mul_pd(uij2_SSE1, uij_SSE1);
            lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
            lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
            lij3_SSE0 = _mm_mul_pd(lij2_SSE0, lij_SSE0);
            lij3_SSE1 = _mm_mul_pd(lij2_SSE1, lij_SSE1);

            diff2_SSE0    = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
            diff2_SSE1    = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
            lij_inv_SSE0  = gmx_mm_invsqrt_pd(lij2_SSE0);
            lij_inv_SSE1  = gmx_mm_invsqrt_pd(lij2_SSE1);
            sk2_aj_SSE    = _mm_mul_pd(sk_aj_SSE, sk_aj_SSE);
            sk2_rinv_SSE0 = _mm_mul_pd(sk2_aj_SSE, rinv_SSE0);
            sk2_rinv_SSE1 = _mm_mul_pd(sk2_aj_SSE, rinv_SSE1);
            prod_SSE0     = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
            prod_SSE1     = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);

            logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
            logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));

            t1_SSE0 = _mm_sub_pd(lij_SSE0, uij_SSE0);
            t1_SSE1 = _mm_sub_pd(lij_SSE1, uij_SSE1);
            t2_SSE0 = _mm_mul_pd(diff2_SSE0,
                                 _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
                                            prod_SSE0));
            t2_SSE1 = _mm_mul_pd(diff2_SSE1,
                                 _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
                                            prod_SSE1));

            t3_SSE0 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
            t3_SSE1 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
            t1_SSE0 = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
            t1_SSE1 = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
            t4_SSE0 = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE0, lij_SSE0));
            t4_SSE1 = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE1, lij_SSE1));
            t4_SSE0 = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
            t4_SSE1 = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
            t1_SSE0 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
            t1_SSE1 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));

            sum_ai_SSE0 = _mm_add_pd(sum_ai_SSE0, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
            sum_ai_SSE1 = _mm_add_pd(sum_ai_SSE1, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
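            /* Scalar reference for the accumulated HCT/OBC descreening term
             * (reader's note): with u = 1/(r + sk) and l as selected above,
             * the contribution to sum_ai is
             *
             *   0.5 * ( l - u + 0.25*r*(u^2 - l^2) + (0.5/r)*log(u/l)
             *           + 0.25*(sk^2/r)*(l^2 - u^2)
             *           [+ 2*(1/rai - l) when rai < sk - r] )
             *
             * which the t1..t4 terms above assemble lane-wise.
             */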
            t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
                                 _mm_mul_pd(prod_SSE0, lij3_SSE0));
            t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
                                 _mm_mul_pd(prod_SSE1, lij3_SSE1));
            t1_SSE0 = _mm_sub_pd(t1_SSE0,
                                 _mm_mul_pd(onefourth_SSE,
                                            _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
                                                       _mm_mul_pd(lij3_SSE0, dr_SSE0))));
            t1_SSE1 = _mm_sub_pd(t1_SSE1,
                                 _mm_mul_pd(onefourth_SSE,
                                            _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
                                                       _mm_mul_pd(lij3_SSE1, dr_SSE1))));

            t2_SSE0 = _mm_mul_pd(onefourth_SSE,
                                 _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
                                            _mm_mul_pd(uij3_SSE0, dr_SSE0)));
            t2_SSE1 = _mm_mul_pd(onefourth_SSE,
                                 _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
                                            _mm_mul_pd(uij3_SSE1, dr_SSE1)));
            t2_SSE0 = _mm_sub_pd(t2_SSE0,
                                 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
                                            _mm_mul_pd(prod_SSE0, uij3_SSE0)));
            t2_SSE1 = _mm_sub_pd(t2_SSE1,
                                 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
                                            _mm_mul_pd(prod_SSE1, uij3_SSE1)));
            t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
                                 _mm_mul_pd(rinv_SSE0, rinv_SSE0));
            t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
                                 _mm_mul_pd(rinv_SSE1, rinv_SSE1));
            t3_SSE0 = _mm_sub_pd(t3_SSE0,
                                 _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
                                            _mm_add_pd(one_SSE,
                                                       _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
            t3_SSE1 = _mm_sub_pd(t3_SSE1,
                                 _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
                                            _mm_add_pd(one_SSE,
                                                       _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));

            t1_SSE0 = _mm_mul_pd(rinv_SSE0,
                                 _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
                                            _mm_add_pd(t2_SSE0, t3_SSE0)));
            t1_SSE1 = _mm_mul_pd(rinv_SSE1,
                                 _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
                                            _mm_add_pd(t2_SSE1, t3_SSE1)));

            _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
            dadx += 2;
            /* Evaluate influence of atom ai -> aj */
            t1_SSE0 = _mm_add_pd(dr_SSE0, sk_ai_SSE0);
            t1_SSE1 = _mm_add_pd(dr_SSE1, sk_ai_SSE1);
            t2_SSE0 = _mm_sub_pd(dr_SSE0, sk_ai_SSE0);
            t2_SSE1 = _mm_sub_pd(dr_SSE1, sk_ai_SSE1);
            t3_SSE0 = _mm_sub_pd(sk_ai_SSE0, dr_SSE0);
            t3_SSE1 = _mm_sub_pd(sk_ai_SSE1, dr_SSE1);

            obc_mask1_SSE0 = _mm_cmplt_pd(raj_SSE, t1_SSE0);
            obc_mask1_SSE1 = _mm_cmplt_pd(raj_SSE, t1_SSE1);
            obc_mask2_SSE0 = _mm_cmplt_pd(raj_SSE, t2_SSE0);
            obc_mask2_SSE1 = _mm_cmplt_pd(raj_SSE, t2_SSE1);
            obc_mask3_SSE0 = _mm_cmplt_pd(raj_SSE, t3_SSE0);
            obc_mask3_SSE1 = _mm_cmplt_pd(raj_SSE, t3_SSE1);
            obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0, jmask_SSE0);
            obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1, jmask_SSE1);

            uij_SSE0  = gmx_mm_inv_pd(t1_SSE0);
            uij_SSE1  = gmx_mm_inv_pd(t1_SSE1);
            lij_SSE0  = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
                                   _mm_andnot_pd(obc_mask2_SSE0, raj_inv_SSE));
            lij_SSE1  = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
                                   _mm_andnot_pd(obc_mask2_SSE1, raj_inv_SSE));
            dlij_SSE0 = _mm_and_pd(one_SSE, obc_mask2_SSE0);
            dlij_SSE1 = _mm_and_pd(one_SSE, obc_mask2_SSE1);

            uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
            uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
            uij3_SSE0 = _mm_mul_pd(uij2_SSE0, uij_SSE0);
            uij3_SSE1 = _mm_mul_pd(uij2_SSE1, uij_SSE1);
            lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
            lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
            lij3_SSE0 = _mm_mul_pd(lij2_SSE0, lij_SSE0);
            lij3_SSE1 = _mm_mul_pd(lij2_SSE1, lij_SSE1);

            diff2_SSE0    = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
            diff2_SSE1    = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
            lij_inv_SSE0  = gmx_mm_invsqrt_pd(lij2_SSE0);
            lij_inv_SSE1  = gmx_mm_invsqrt_pd(lij2_SSE1);
            sk2_rinv_SSE0 = _mm_mul_pd(sk2_ai_SSE0, rinv_SSE0);
            sk2_rinv_SSE1 = _mm_mul_pd(sk2_ai_SSE1, rinv_SSE1);
            prod_SSE0     = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
            prod_SSE1     = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);

            logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
            logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
            t1_SSE0      = _mm_sub_pd(lij_SSE0, uij_SSE0);
            t1_SSE1      = _mm_sub_pd(lij_SSE1, uij_SSE1);
            t2_SSE0      = _mm_mul_pd(diff2_SSE0,
                                      _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
                                                 prod_SSE0));
            t2_SSE1      = _mm_mul_pd(diff2_SSE1,
                                      _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
                                                 prod_SSE1));
            t3_SSE0 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
            t3_SSE1 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
            t1_SSE0 = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
            t1_SSE1 = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
            t4_SSE0 = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE0));
            t4_SSE1 = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE1));
            t4_SSE0 = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
            t4_SSE1 = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
            t1_SSE0 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
            t1_SSE1 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));

            _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
                                            _mm_add_pd(_mm_and_pd(t1_SSE0, obc_mask1_SSE0),
                                                       _mm_and_pd(t1_SSE1, obc_mask1_SSE1))));

            t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
                                 _mm_mul_pd(prod_SSE0, lij3_SSE0));
            t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
                                 _mm_mul_pd(prod_SSE1, lij3_SSE1));
            t1_SSE0 = _mm_sub_pd(t1_SSE0,
                                 _mm_mul_pd(onefourth_SSE,
                                            _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
                                                       _mm_mul_pd(lij3_SSE0, dr_SSE0))));
            t1_SSE1 = _mm_sub_pd(t1_SSE1,
                                 _mm_mul_pd(onefourth_SSE,
                                            _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
                                                       _mm_mul_pd(lij3_SSE1, dr_SSE1))));
            t2_SSE0 = _mm_mul_pd(onefourth_SSE,
                                 _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
                                            _mm_mul_pd(uij3_SSE0, dr_SSE0)));
            t2_SSE1 = _mm_mul_pd(onefourth_SSE,
                                 _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
                                            _mm_mul_pd(uij3_SSE1, dr_SSE1)));
            t2_SSE0 = _mm_sub_pd(t2_SSE0,
                                 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
                                            _mm_mul_pd(prod_SSE0, uij3_SSE0)));
            t2_SSE1 = _mm_sub_pd(t2_SSE1,
                                 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
                                            _mm_mul_pd(prod_SSE1, uij3_SSE1)));

            t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
                                 _mm_mul_pd(rinv_SSE0, rinv_SSE0));
            t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
                                 _mm_mul_pd(rinv_SSE1, rinv_SSE1));

            t3_SSE0 = _mm_sub_pd(t3_SSE0,
                                 _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
                                            _mm_add_pd(one_SSE,
                                                       _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
            t3_SSE1 = _mm_sub_pd(t3_SSE1,
                                 _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
                                            _mm_add_pd(one_SSE,
                                                       _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));

            t1_SSE0 = _mm_mul_pd(rinv_SSE0,
                                 _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
                                            _mm_add_pd(t2_SSE0, t3_SSE0)));
            t1_SSE1 = _mm_mul_pd(rinv_SSE1,
                                 _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
                                            _mm_add_pd(t2_SSE1, t3_SSE1)));

            _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
            dadx += 2;
        /* Main part, no exclusions */
        for (j = nj1; j < nj2; j += UNROLLJ)
            /* load j atom coordinates */
            jx_SSE = _mm_load_pd(x_align+j);
            jy_SSE = _mm_load_pd(y_align+j);
            jz_SSE = _mm_load_pd(z_align+j);

            /* Calculate distance */
            dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE);
            dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE);
            dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE);
            dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE);
            dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE);
            dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
            rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);

            rinv_SSE0 = _mm_and_pd(rinv_SSE0, imask_SSE0);
            rinv_SSE1 = _mm_and_pd(rinv_SSE1, imask_SSE1);

            dr_SSE0 = _mm_mul_pd(rsq_SSE0, rinv_SSE0);
            dr_SSE1 = _mm_mul_pd(rsq_SSE1, rinv_SSE1);

            sk_aj_SSE   = _mm_load_pd(obc_param+j);
            raj_SSE     = _mm_load_pd(gb_radius+j);
            raj_inv_SSE = gmx_mm_inv_pd(raj_SSE);

            /* Evaluate influence of atom aj -> ai */
            t1_SSE0 = _mm_add_pd(dr_SSE0, sk_aj_SSE);
            t1_SSE1 = _mm_add_pd(dr_SSE1, sk_aj_SSE);
            t2_SSE0 = _mm_sub_pd(dr_SSE0, sk_aj_SSE);
            t2_SSE1 = _mm_sub_pd(dr_SSE1, sk_aj_SSE);
            t3_SSE0 = _mm_sub_pd(sk_aj_SSE, dr_SSE0);
            t3_SSE1 = _mm_sub_pd(sk_aj_SSE, dr_SSE1);

            obc_mask1_SSE0 = _mm_cmplt_pd(rai_SSE0, t1_SSE0);
            obc_mask1_SSE1 = _mm_cmplt_pd(rai_SSE1, t1_SSE1);
            obc_mask2_SSE0 = _mm_cmplt_pd(rai_SSE0, t2_SSE0);
            obc_mask2_SSE1 = _mm_cmplt_pd(rai_SSE1, t2_SSE1);
            obc_mask3_SSE0 = _mm_cmplt_pd(rai_SSE0, t3_SSE0);
            obc_mask3_SSE1 = _mm_cmplt_pd(rai_SSE1, t3_SSE1);
            obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0, imask_SSE0);
            obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1, imask_SSE1);

            uij_SSE0  = gmx_mm_inv_pd(t1_SSE0);
            uij_SSE1  = gmx_mm_inv_pd(t1_SSE1);
            lij_SSE0  = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
                                   _mm_andnot_pd(obc_mask2_SSE0, rai_inv_SSE0));
            lij_SSE1  = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
                                   _mm_andnot_pd(obc_mask2_SSE1, rai_inv_SSE1));
            dlij_SSE0 = _mm_and_pd(one_SSE, obc_mask2_SSE0);
            dlij_SSE1 = _mm_and_pd(one_SSE, obc_mask2_SSE1);
            uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
            uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
            uij3_SSE0 = _mm_mul_pd(uij2_SSE0, uij_SSE0);
            uij3_SSE1 = _mm_mul_pd(uij2_SSE1, uij_SSE1);
            lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
            lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
            lij3_SSE0 = _mm_mul_pd(lij2_SSE0, lij_SSE0);
            lij3_SSE1 = _mm_mul_pd(lij2_SSE1, lij_SSE1);

            diff2_SSE0    = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
            diff2_SSE1    = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
            lij_inv_SSE0  = gmx_mm_invsqrt_pd(lij2_SSE0);
            lij_inv_SSE1  = gmx_mm_invsqrt_pd(lij2_SSE1);
            sk2_aj_SSE    = _mm_mul_pd(sk_aj_SSE, sk_aj_SSE);
            sk2_rinv_SSE0 = _mm_mul_pd(sk2_aj_SSE, rinv_SSE0);
            sk2_rinv_SSE1 = _mm_mul_pd(sk2_aj_SSE, rinv_SSE1);
            prod_SSE0     = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
            prod_SSE1     = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);

            logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
            logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));

            t1_SSE0 = _mm_sub_pd(lij_SSE0, uij_SSE0);
            t1_SSE1 = _mm_sub_pd(lij_SSE1, uij_SSE1);
            t2_SSE0 = _mm_mul_pd(diff2_SSE0,
                                 _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
                                            prod_SSE0));
            t2_SSE1 = _mm_mul_pd(diff2_SSE1,
                                 _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
                                            prod_SSE1));

            t3_SSE0 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
            t3_SSE1 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
            t1_SSE0 = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
            t1_SSE1 = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
            t4_SSE0 = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE0, lij_SSE0));
            t4_SSE1 = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE1, lij_SSE1));
            t4_SSE0 = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
            t4_SSE1 = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
            t1_SSE0 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
            t1_SSE1 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));

            sum_ai_SSE0 = _mm_add_pd(sum_ai_SSE0, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
            sum_ai_SSE1 = _mm_add_pd(sum_ai_SSE1, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));

            t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
                                 _mm_mul_pd(prod_SSE0, lij3_SSE0));
            t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
                                 _mm_mul_pd(prod_SSE1, lij3_SSE1));

            t1_SSE0 = _mm_sub_pd(t1_SSE0,
                                 _mm_mul_pd(onefourth_SSE,
                                            _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
                                                       _mm_mul_pd(lij3_SSE0, dr_SSE0))));
            t1_SSE1 = _mm_sub_pd(t1_SSE1,
                                 _mm_mul_pd(onefourth_SSE,
                                            _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
                                                       _mm_mul_pd(lij3_SSE1, dr_SSE1))));

            t2_SSE0 = _mm_mul_pd(onefourth_SSE,
                                 _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
                                            _mm_mul_pd(uij3_SSE0, dr_SSE0)));
            t2_SSE1 = _mm_mul_pd(onefourth_SSE,
                                 _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
                                            _mm_mul_pd(uij3_SSE1, dr_SSE1)));
            t2_SSE0 = _mm_sub_pd(t2_SSE0,
                                 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
                                            _mm_mul_pd(prod_SSE0, uij3_SSE0)));
            t2_SSE1 = _mm_sub_pd(t2_SSE1,
                                 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
                                            _mm_mul_pd(prod_SSE1, uij3_SSE1)));
            t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
                                 _mm_mul_pd(rinv_SSE0, rinv_SSE0));
            t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
                                 _mm_mul_pd(rinv_SSE1, rinv_SSE1));
            t3_SSE0 = _mm_sub_pd(t3_SSE0,
                                 _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
                                            _mm_add_pd(one_SSE,
                                                       _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
            t3_SSE1 = _mm_sub_pd(t3_SSE1,
                                 _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
                                            _mm_add_pd(one_SSE,
                                                       _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));

            t1_SSE0 = _mm_mul_pd(rinv_SSE0,
                                 _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
                                            _mm_add_pd(t2_SSE0, t3_SSE0)));
            t1_SSE1 = _mm_mul_pd(rinv_SSE1,
                                 _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
                                            _mm_add_pd(t2_SSE1, t3_SSE1)));

            _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
            dadx += 2;
            /* Evaluate influence of atom ai -> aj */
            t1_SSE0 = _mm_add_pd(dr_SSE0, sk_ai_SSE0);
            t1_SSE1 = _mm_add_pd(dr_SSE1, sk_ai_SSE1);
            t2_SSE0 = _mm_sub_pd(dr_SSE0, sk_ai_SSE0);
            t2_SSE1 = _mm_sub_pd(dr_SSE1, sk_ai_SSE1);
            t3_SSE0 = _mm_sub_pd(sk_ai_SSE0, dr_SSE0);
            t3_SSE1 = _mm_sub_pd(sk_ai_SSE1, dr_SSE1);

            obc_mask1_SSE0 = _mm_cmplt_pd(raj_SSE, t1_SSE0);
            obc_mask1_SSE1 = _mm_cmplt_pd(raj_SSE, t1_SSE1);
            obc_mask2_SSE0 = _mm_cmplt_pd(raj_SSE, t2_SSE0);
            obc_mask2_SSE1 = _mm_cmplt_pd(raj_SSE, t2_SSE1);
            obc_mask3_SSE0 = _mm_cmplt_pd(raj_SSE, t3_SSE0);
            obc_mask3_SSE1 = _mm_cmplt_pd(raj_SSE, t3_SSE1);
            obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0, imask_SSE0);
            obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1, imask_SSE1);

            uij_SSE0  = gmx_mm_inv_pd(t1_SSE0);
            uij_SSE1  = gmx_mm_inv_pd(t1_SSE1);
            lij_SSE0  = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
                                   _mm_andnot_pd(obc_mask2_SSE0, raj_inv_SSE));
            lij_SSE1  = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
                                   _mm_andnot_pd(obc_mask2_SSE1, raj_inv_SSE));
            dlij_SSE0 = _mm_and_pd(one_SSE, obc_mask2_SSE0);
            dlij_SSE1 = _mm_and_pd(one_SSE, obc_mask2_SSE1);

            uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
            uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
            uij3_SSE0 = _mm_mul_pd(uij2_SSE0, uij_SSE0);
            uij3_SSE1 = _mm_mul_pd(uij2_SSE1, uij_SSE1);
            lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
            lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
            lij3_SSE0 = _mm_mul_pd(lij2_SSE0, lij_SSE0);
            lij3_SSE1 = _mm_mul_pd(lij2_SSE1, lij_SSE1);

            diff2_SSE0    = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
            diff2_SSE1    = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
            lij_inv_SSE0  = gmx_mm_invsqrt_pd(lij2_SSE0);
            lij_inv_SSE1  = gmx_mm_invsqrt_pd(lij2_SSE1);
            sk2_rinv_SSE0 = _mm_mul_pd(sk2_ai_SSE0, rinv_SSE0);
            sk2_rinv_SSE1 = _mm_mul_pd(sk2_ai_SSE1, rinv_SSE1);
            prod_SSE0     = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
            prod_SSE1     = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);

            logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
            logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
            t1_SSE0      = _mm_sub_pd(lij_SSE0, uij_SSE0);
            t1_SSE1      = _mm_sub_pd(lij_SSE1, uij_SSE1);
            t2_SSE0      = _mm_mul_pd(diff2_SSE0,
                                      _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
                                                 prod_SSE0));
            t2_SSE1      = _mm_mul_pd(diff2_SSE1,
                                      _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
                                                 prod_SSE1));
            t3_SSE0 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
            t3_SSE1 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
            t1_SSE0 = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
            t1_SSE1 = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
            t4_SSE0 = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE0));
            t4_SSE1 = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE1));
            t4_SSE0 = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
            t4_SSE1 = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
            t1_SSE0 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
            t1_SSE1 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));

            _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
                                            _mm_add_pd(_mm_and_pd(t1_SSE0, obc_mask1_SSE0),
                                                       _mm_and_pd(t1_SSE1, obc_mask1_SSE1))));

            t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
                                 _mm_mul_pd(prod_SSE0, lij3_SSE0));
            t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
                                 _mm_mul_pd(prod_SSE1, lij3_SSE1));
            t1_SSE0 = _mm_sub_pd(t1_SSE0,
                                 _mm_mul_pd(onefourth_SSE,
                                            _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
                                                       _mm_mul_pd(lij3_SSE0, dr_SSE0))));
            t1_SSE1 = _mm_sub_pd(t1_SSE1,
                                 _mm_mul_pd(onefourth_SSE,
                                            _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
                                                       _mm_mul_pd(lij3_SSE1, dr_SSE1))));
            t2_SSE0 = _mm_mul_pd(onefourth_SSE,
                                 _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
                                            _mm_mul_pd(uij3_SSE0, dr_SSE0)));
            t2_SSE1 = _mm_mul_pd(onefourth_SSE,
                                 _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
                                            _mm_mul_pd(uij3_SSE1, dr_SSE1)));
            t2_SSE0 = _mm_sub_pd(t2_SSE0,
                                 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
                                            _mm_mul_pd(prod_SSE0, uij3_SSE0)));
            t2_SSE1 = _mm_sub_pd(t2_SSE1,
                                 _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
                                            _mm_mul_pd(prod_SSE1, uij3_SSE1)));

            t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
                                 _mm_mul_pd(rinv_SSE0, rinv_SSE0));
            t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
                                 _mm_mul_pd(rinv_SSE1, rinv_SSE1));

            t3_SSE0 = _mm_sub_pd(t3_SSE0,
                                 _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
                                            _mm_add_pd(one_SSE,
                                                       _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
            t3_SSE1 = _mm_sub_pd(t3_SSE1,
                                 _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
                                            _mm_add_pd(one_SSE,
                                                       _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));

            t1_SSE0 = _mm_mul_pd(rinv_SSE0,
                                 _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
                                            _mm_add_pd(t2_SSE0, t3_SSE0)));
            t1_SSE1 = _mm_mul_pd(rinv_SSE1,
                                 _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
                                            _mm_add_pd(t2_SSE1, t3_SSE1)));

            _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
            dadx += 2;
        /* Epilogue part, including exclusion mask */
        for (j = nj2; j < nj3; j += UNROLLJ)
            jmask_SSE0 = _mm_load_pd((double *)emask0);
            jmask_SSE1 = _mm_load_pd((double *)emask1);
            emask0    += 2*UNROLLJ;
            emask1    += 2*UNROLLJ;

            /* load j atom coordinates */
            jx_SSE = _mm_load_pd(x_align+j);
            jy_SSE = _mm_load_pd(y_align+j);
            jz_SSE = _mm_load_pd(z_align+j);

            /* Calculate distance */
            dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE);
            dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE);
            dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE);
            dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE);
            dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE);
            dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);

            jmask_SSE0 = _mm_and_pd(jmask_SSE0, imask_SSE0);
            jmask_SSE1 = _mm_and_pd(jmask_SSE1, imask_SSE1);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
            rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);

            rinv_SSE0 = _mm_and_pd(rinv_SSE0, jmask_SSE0);
            rinv_SSE1 = _mm_and_pd(rinv_SSE1, jmask_SSE1);

            dr_SSE0 = _mm_mul_pd(rsq_SSE0, rinv_SSE0);
            dr_SSE1 = _mm_mul_pd(rsq_SSE1, rinv_SSE1);

            sk_aj_SSE   = _mm_load_pd(obc_param+j);
            raj_SSE     = _mm_load_pd(gb_radius+j);
            raj_inv_SSE = gmx_mm_inv_pd(raj_SSE);

            /* Evaluate influence of atom aj -> ai */
            t1_SSE0 = _mm_add_pd(dr_SSE0, sk_aj_SSE);
            t1_SSE1 = _mm_add_pd(dr_SSE1, sk_aj_SSE);
            t2_SSE0 = _mm_sub_pd(dr_SSE0, sk_aj_SSE);
            t2_SSE1 = _mm_sub_pd(dr_SSE1, sk_aj_SSE);
            t3_SSE0 = _mm_sub_pd(sk_aj_SSE, dr_SSE0);
            t3_SSE1 = _mm_sub_pd(sk_aj_SSE, dr_SSE1);

            obc_mask1_SSE0 = _mm_cmplt_pd(rai_SSE0, t1_SSE0);
            obc_mask1_SSE1 = _mm_cmplt_pd(rai_SSE1, t1_SSE1);
            obc_mask2_SSE0 = _mm_cmplt_pd(rai_SSE0, t2_SSE0);
            obc_mask2_SSE1 = _mm_cmplt_pd(rai_SSE1, t2_SSE1);
            obc_mask3_SSE0 = _mm_cmplt_pd(rai_SSE0, t3_SSE0);
            obc_mask3_SSE1 = _mm_cmplt_pd(rai_SSE1, t3_SSE1);
            obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0, jmask_SSE0);
            obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1, jmask_SSE1);

            uij_SSE0  = gmx_mm_inv_pd(t1_SSE0);
            uij_SSE1  = gmx_mm_inv_pd(t1_SSE1);
            lij_SSE0  = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
                                   _mm_andnot_pd(obc_mask2_SSE0, rai_inv_SSE0));
            lij_SSE1  = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
                                   _mm_andnot_pd(obc_mask2_SSE1, rai_inv_SSE1));

            dlij_SSE0 = _mm_and_pd(one_SSE, obc_mask2_SSE0);
            dlij_SSE1 = _mm_and_pd(one_SSE, obc_mask2_SSE1);
2004 uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
2005 uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
2006 uij3_SSE0 = _mm_mul_pd(uij2_SSE0, uij_SSE0);
2007 uij3_SSE1 = _mm_mul_pd(uij2_SSE1, uij_SSE1);
2008 lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
2009 lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
2010 lij3_SSE0 = _mm_mul_pd(lij2_SSE0, lij_SSE0);
2011 lij3_SSE1 = _mm_mul_pd(lij2_SSE1, lij_SSE1);
2013 diff2_SSE0 = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
2014 diff2_SSE1 = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
2015 lij_inv_SSE0 = gmx_mm_invsqrt_pd(lij2_SSE0);
2016 lij_inv_SSE1 = gmx_mm_invsqrt_pd(lij2_SSE1);
2017 sk2_aj_SSE = _mm_mul_pd(sk_aj_SSE, sk_aj_SSE);
2018 sk2_rinv_SSE0 = _mm_mul_pd(sk2_aj_SSE, rinv_SSE0);
2019 sk2_rinv_SSE1 = _mm_mul_pd(sk2_aj_SSE, rinv_SSE1);
2020 prod_SSE0 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
2021 prod_SSE1 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);
2023 logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
2024 logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
2026 t1_SSE0 = _mm_sub_pd(lij_SSE0, uij_SSE0);
2027 t1_SSE1 = _mm_sub_pd(lij_SSE1, uij_SSE1);
2028 t2_SSE0 = _mm_mul_pd(diff2_SSE0,
2029 _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
2031 t2_SSE1 = _mm_mul_pd(diff2_SSE1,
2032 _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
2035 t3_SSE0 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
2036 t3_SSE1 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
2037 t1_SSE0 = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
2038 t1_SSE1 = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
2039 t4_SSE0 = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE0, lij_SSE0));
2040 t4_SSE1 = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE1, lij_SSE1));
2041 t4_SSE0 = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
2042 t4_SSE1 = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
2043 t1_SSE0 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
2044 t1_SSE1 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));
2046 sum_ai_SSE0 = _mm_add_pd(sum_ai_SSE0, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
2047 sum_ai_SSE1 = _mm_add_pd(sum_ai_SSE1, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
            t1_SSE0    = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
                                    _mm_mul_pd(prod_SSE0, lij3_SSE0));
            t1_SSE1    = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
                                    _mm_mul_pd(prod_SSE1, lij3_SSE1));
            t1_SSE0    = _mm_sub_pd(t1_SSE0,
                                    _mm_mul_pd(onefourth_SSE,
                                               _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
                                                          _mm_mul_pd(lij3_SSE0, dr_SSE0))));
            t1_SSE1    = _mm_sub_pd(t1_SSE1,
                                    _mm_mul_pd(onefourth_SSE,
                                               _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
                                                          _mm_mul_pd(lij3_SSE1, dr_SSE1))));

            t2_SSE0    = _mm_mul_pd(onefourth_SSE,
                                    _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
                                               _mm_mul_pd(uij3_SSE0, dr_SSE0)));
            t2_SSE1    = _mm_mul_pd(onefourth_SSE,
                                    _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
                                               _mm_mul_pd(uij3_SSE1, dr_SSE1)));
            t2_SSE0    = _mm_sub_pd(t2_SSE0,
                                    _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
                                               _mm_mul_pd(prod_SSE0, uij3_SSE0)));
            t2_SSE1    = _mm_sub_pd(t2_SSE1,
                                    _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
                                               _mm_mul_pd(prod_SSE1, uij3_SSE1)));
            t3_SSE0    = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
                                    _mm_mul_pd(rinv_SSE0, rinv_SSE0));
            t3_SSE1    = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
                                    _mm_mul_pd(rinv_SSE1, rinv_SSE1));

            t3_SSE0    = _mm_sub_pd(t3_SSE0,
                                    _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
                                               _mm_add_pd(one_SSE,
                                                          _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
            t3_SSE1    = _mm_sub_pd(t3_SSE1,
                                    _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
                                               _mm_add_pd(one_SSE,
                                                          _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));

            t1_SSE0    = _mm_mul_pd(rinv_SSE0,
                                    _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
                                               _mm_add_pd(t2_SSE0, t3_SSE0)));
            t1_SSE1    = _mm_mul_pd(rinv_SSE1,
                                    _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
                                               _mm_add_pd(t2_SSE1, t3_SSE1)));

            _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
            dadx += 2;
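            /* Each pair is visited only once in the all-vs-all scheme, so the
             * symmetric descreening of atom aj by atom ai is evaluated here as
             * well and accumulated directly into work[j] below, instead of
             * being picked up again when atom j becomes the outer atom.
             */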
            /* Evaluate influence of atom ai -> aj */
            t1_SSE0    = _mm_add_pd(dr_SSE0, sk_ai_SSE0);
            t1_SSE1    = _mm_add_pd(dr_SSE1, sk_ai_SSE1);
            t2_SSE0    = _mm_sub_pd(dr_SSE0, sk_ai_SSE0);
            t2_SSE1    = _mm_sub_pd(dr_SSE1, sk_ai_SSE1);
            t3_SSE0    = _mm_sub_pd(sk_ai_SSE0, dr_SSE0);
            t3_SSE1    = _mm_sub_pd(sk_ai_SSE1, dr_SSE1);

            obc_mask1_SSE0 = _mm_cmplt_pd(raj_SSE, t1_SSE0);
            obc_mask1_SSE1 = _mm_cmplt_pd(raj_SSE, t1_SSE1);
            obc_mask2_SSE0 = _mm_cmplt_pd(raj_SSE, t2_SSE0);
            obc_mask2_SSE1 = _mm_cmplt_pd(raj_SSE, t2_SSE1);
            obc_mask3_SSE0 = _mm_cmplt_pd(raj_SSE, t3_SSE0);
            obc_mask3_SSE1 = _mm_cmplt_pd(raj_SSE, t3_SSE1);
            obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0, jmask_SSE0);
            obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1, jmask_SSE1);

            uij_SSE0   = gmx_mm_inv_pd(t1_SSE0);
            uij_SSE1   = gmx_mm_inv_pd(t1_SSE1);
            lij_SSE0   = _mm_or_pd(_mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
                                   _mm_andnot_pd(obc_mask2_SSE0, raj_inv_SSE));
            lij_SSE1   = _mm_or_pd(_mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
                                   _mm_andnot_pd(obc_mask2_SSE1, raj_inv_SSE));

            dlij_SSE0  = _mm_and_pd(one_SSE, obc_mask2_SSE0);
            dlij_SSE1  = _mm_and_pd(one_SSE, obc_mask2_SSE1);

            uij2_SSE0  = _mm_mul_pd(uij_SSE0, uij_SSE0);
            uij2_SSE1  = _mm_mul_pd(uij_SSE1, uij_SSE1);
            uij3_SSE0  = _mm_mul_pd(uij2_SSE0, uij_SSE0);
            uij3_SSE1  = _mm_mul_pd(uij2_SSE1, uij_SSE1);
            lij2_SSE0  = _mm_mul_pd(lij_SSE0, lij_SSE0);
            lij2_SSE1  = _mm_mul_pd(lij_SSE1, lij_SSE1);
            lij3_SSE0  = _mm_mul_pd(lij2_SSE0, lij_SSE0);
            lij3_SSE1  = _mm_mul_pd(lij2_SSE1, lij_SSE1);

            diff2_SSE0 = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
            diff2_SSE1 = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
            lij_inv_SSE0  = gmx_mm_invsqrt_pd(lij2_SSE0);
            lij_inv_SSE1  = gmx_mm_invsqrt_pd(lij2_SSE1);
            sk2_rinv_SSE0 = _mm_mul_pd(sk2_ai_SSE0, rinv_SSE0);
            sk2_rinv_SSE1 = _mm_mul_pd(sk2_ai_SSE1, rinv_SSE1);
            prod_SSE0  = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
            prod_SSE1  = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);

            logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
            logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
            t1_SSE0    = _mm_sub_pd(lij_SSE0, uij_SSE0);
            t1_SSE1    = _mm_sub_pd(lij_SSE1, uij_SSE1);
            t2_SSE0    = _mm_mul_pd(diff2_SSE0,
                                    _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
                                               prod_SSE0));
            t2_SSE1    = _mm_mul_pd(diff2_SSE1,
                                    _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
                                               prod_SSE1));
            t3_SSE0    = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
            t3_SSE1    = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
            t1_SSE0    = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
            t1_SSE1    = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
            t4_SSE0    = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE0));
            t4_SSE1    = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE1));
            t4_SSE0    = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
            t4_SSE1    = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
            t1_SSE0    = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
            t1_SSE1    = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));

            _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
                                            _mm_add_pd(_mm_and_pd(t1_SSE0, obc_mask1_SSE0),
                                                       _mm_and_pd(t1_SSE1, obc_mask1_SSE1))));

            t1_SSE0    = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
                                    _mm_mul_pd(prod_SSE0, lij3_SSE0));
            t1_SSE1    = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
                                    _mm_mul_pd(prod_SSE1, lij3_SSE1));

            t1_SSE0    = _mm_sub_pd(t1_SSE0,
                                    _mm_mul_pd(onefourth_SSE,
                                               _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
                                                          _mm_mul_pd(lij3_SSE0, dr_SSE0))));
            t1_SSE1    = _mm_sub_pd(t1_SSE1,
                                    _mm_mul_pd(onefourth_SSE,
                                               _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
                                                          _mm_mul_pd(lij3_SSE1, dr_SSE1))));
            t2_SSE0    = _mm_mul_pd(onefourth_SSE,
                                    _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
                                               _mm_mul_pd(uij3_SSE0, dr_SSE0)));
            t2_SSE1    = _mm_mul_pd(onefourth_SSE,
                                    _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
                                               _mm_mul_pd(uij3_SSE1, dr_SSE1)));
            t2_SSE0    = _mm_sub_pd(t2_SSE0,
                                    _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
                                               _mm_mul_pd(prod_SSE0, uij3_SSE0)));
            t2_SSE1    = _mm_sub_pd(t2_SSE1,
                                    _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
                                               _mm_mul_pd(prod_SSE1, uij3_SSE1)));

            t3_SSE0    = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
                                    _mm_mul_pd(rinv_SSE0, rinv_SSE0));
            t3_SSE1    = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
                                    _mm_mul_pd(rinv_SSE1, rinv_SSE1));

            t3_SSE0    = _mm_sub_pd(t3_SSE0,
                                    _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
                                               _mm_add_pd(one_SSE,
                                                          _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
            t3_SSE1    = _mm_sub_pd(t3_SSE1,
                                    _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
                                               _mm_add_pd(one_SSE,
                                                          _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));

            t1_SSE0    = _mm_mul_pd(rinv_SSE0,
                                    _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
                                               _mm_add_pd(t2_SSE0, t3_SSE0)));
            t1_SSE1    = _mm_mul_pd(rinv_SSE1,
                                    _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
                                               _mm_add_pd(t2_SSE1, t3_SSE1)));

            _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
            dadx += 2;
        }
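        /* Each __m128d accumulator holds the two j-lane partial sums for one of
         * the two i atoms in this UNROLLI block; the 2x2 transpose below lines
         * the lanes up per atom so a single add yields both full Born sums.
         */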
        GMX_MM_TRANSPOSE2_PD(sum_ai_SSE0, sum_ai_SSE1);
        sum_ai_SSE0 = _mm_add_pd(sum_ai_SSE0, sum_ai_SSE1);
        _mm_store_pd(work+i, _mm_add_pd(sum_ai_SSE0, _mm_load_pd(work+i)));
    }

    for (i = 0; i < natoms/2+1; i++)
    {
        work[i] += work[natoms+i];
    }

    /* Parallel summations */
    if (PARTDECOMP(cr))
    {
        gmx_sum(natoms, work, cr);
    }
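    /* Convert the accumulated descreening sums in work[] into effective Born
     * radii: HCT inverts the sum directly, while OBC first rescales it through
     * a tanh() fit (see the comment ahead of the OBC loop below).
     */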
    if (gb_algorithm == egbHCT)
    {
        /* HCT */
        for (i = 0; i < natoms; i++)
        {
            if (born->use[i] != 0)
            {
                rai     = top->atomtypes.gb_radius[mdatoms->typeA[i]]-born->gb_doffset;
                sum_ai  = 1.0/rai - work[i];
                min_rad = rai + born->gb_doffset;
                rad     = 1.0/sum_ai;

                born->bRad[i]   = rad > min_rad ? rad : min_rad;
                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
            }
        }
    }
    else
    {
        /* OBC */
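        /* OBC rescaling (Onufriev-Bashford-Case): per atom the loop below
         * computes
         *     psi   = (rho_i - doffset) * work[i]
         *     1/R_i = 1/(rho_i - doffset)
         *             - tanh(obc_alpha*psi - obc_beta*psi^2 + obc_gamma*psi^3)/rho_i
         * and stores in drobc[i] the tanh derivative factor that the
         * chain-rule kernel needs later.
         */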
        /* Calculate the radii */
        for (i = 0; i < natoms; i++)
        {
            if (born->use[i] != 0)
            {
                rai      = top->atomtypes.gb_radius[mdatoms->typeA[i]];
                rai_inv2 = 1.0/rai;
                rai      = rai-born->gb_doffset;
                rai_inv  = 1.0/rai;
                sum_ai   = rai * work[i];
                sum_ai2  = sum_ai  * sum_ai;
                sum_ai3  = sum_ai2 * sum_ai;

                tsum          = tanh(born->obc_alpha*sum_ai-born->obc_beta*sum_ai2+born->obc_gamma*sum_ai3);
                born->bRad[i] = rai_inv - tsum*rai_inv2;
                born->bRad[i] = 1.0 / born->bRad[i];

                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);

                tchain         = rai * (born->obc_alpha-2*born->obc_beta*sum_ai+3*born->obc_gamma*sum_ai2);
                born->drobc[i] = (1.0-tsum*tsum)*tchain*rai_inv2;
            }
        }
    }

    return 0;
}
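/* Chain-rule part: combines the dA_i/dr derivatives that the radii kernel
 * above streamed into fr->dadx with dV/dA_i (fr->dvda, rescaled per GB
 * algorithm below) to obtain the Born contribution to the forces.
 */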
int
genborn_allvsall_calc_chainrule_sse2_double(t_forcerec              *fr,
                                            t_mdatoms               *mdatoms,
                                            gmx_genborn_t           *born,
                                            double                  *x,
                                            double                  *f,
                                            int                      gb_algorithm,
                                            void                    *paadata)
{
    gmx_allvsallgb2_data_t *aadata;
    int                     natoms;
    int                     ni0, ni1;
    int                     nj0, nj1, nj2, nj3;
    int                     i, j, idx;
    int                    *jindex;
    double                  fix, fiy, fiz;
    double                  rbai, rbaj, fgb, fgb_ai, rbi;
    double                 *rb;
    double                 *dadx;
    double                 *x_align;
    double                 *y_align;
    double                 *z_align;
    double                 *fx_align;
    double                 *fy_align;
    double                 *fz_align;
    __m128d                 jmask_SSE0, jmask_SSE1;
    __m128d                 ix_SSE0, iy_SSE0, iz_SSE0;
    __m128d                 ix_SSE1, iy_SSE1, iz_SSE1;
    __m128d                 fix_SSE0, fiy_SSE0, fiz_SSE0;
    __m128d                 fix_SSE1, fiy_SSE1, fiz_SSE1;
    __m128d                 rbai_SSE0, rbai_SSE1;
    __m128d                 imask_SSE0, imask_SSE1;
    __m128d                 jx_SSE, jy_SSE, jz_SSE, rbaj_SSE;
    __m128d                 dx_SSE0, dy_SSE0, dz_SSE0;
    __m128d                 dx_SSE1, dy_SSE1, dz_SSE1;
    __m128d                 fgb_SSE0, fgb_ai_SSE0;
    __m128d                 fgb_SSE1, fgb_ai_SSE1;
    __m128d                 tx_SSE0, ty_SSE0, tz_SSE0;
    __m128d                 tx_SSE1, ty_SSE1, tz_SSE1;
    __m128d                 t1, t2, tmpSSE;
    natoms   = mdatoms->nr;
    ni0      = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
    ni1      = mdatoms->start+mdatoms->homenr;

    aadata   = (gmx_allvsallgb2_data_t *)paadata;

    x_align  = aadata->x_align;
    y_align  = aadata->y_align;
    z_align  = aadata->z_align;
    fx_align = aadata->fx_align;
    fy_align = aadata->fy_align;
    fz_align = aadata->fz_align;

    jindex   = aadata->jindex_gb;
    dadx     = fr->dadx;
    rb       = aadata->work;
    /* Loop to get the proper form for the Born radius term */
    if (gb_algorithm == egbSTILL)
    {
        for (i = 0; i < natoms; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = (2 * rbi * rbi * fr->dvda[i])/ONE_4PI_EPS0;
        }
    }
    else if (gb_algorithm == egbHCT)
    {
        for (i = 0; i < natoms; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * fr->dvda[i];
        }
    }
    else if (gb_algorithm == egbOBC)
    {
        for (idx = 0; idx < natoms; idx++)
        {
            rbi     = born->bRad[idx];
            rb[idx] = rbi * rbi * born->drobc[idx] * fr->dvda[idx];
        }
    }

    for (i = 0; i < 2*natoms; i++)
    {
        fx_align[i] = 0;
        fy_align[i] = 0;
        fz_align[i] = 0;
    }

    for (i = 0; i < natoms; i++)
    {
        rb[i+natoms] = rb[i];
    }
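    /* The j-loop below replays the flat dadx stream written by the radii
     * kernel: for each pair, fgb = rb[i]*dA_i/dr + rb[j]*dA_j/dr gives the
     * scalar GB force, which is then resolved along the pair vector.
     */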
    for (i = ni0; i < ni1; i += UNROLLI)
    {
        /* We assume shifts are NOT used for all-vs-all interactions */

        /* Load i atom data */
        ix_SSE0 = _mm_load1_pd(x_align+i);
        iy_SSE0 = _mm_load1_pd(y_align+i);
        iz_SSE0 = _mm_load1_pd(z_align+i);
        ix_SSE1 = _mm_load1_pd(x_align+i+1);
        iy_SSE1 = _mm_load1_pd(y_align+i+1);
        iz_SSE1 = _mm_load1_pd(z_align+i+1);

        fix_SSE0 = _mm_setzero_pd();
        fiy_SSE0 = _mm_setzero_pd();
        fiz_SSE0 = _mm_setzero_pd();
        fix_SSE1 = _mm_setzero_pd();
        fiy_SSE1 = _mm_setzero_pd();
        fiz_SSE1 = _mm_setzero_pd();

        rbai_SSE0 = _mm_load1_pd(rb+i);
        rbai_SSE1 = _mm_load1_pd(rb+i+1);

        /* Load limits for loop over neighbors */
        nj0 = jindex[4*i];
        nj3 = jindex[4*i+3];

        /* No masks necessary, since the stored chain rule derivatives will be zero in those cases! */
        for (j = nj0; j < nj3; j += UNROLLJ)
        {
            /* load j atom coordinates */
            jx_SSE = _mm_load_pd(x_align+j);
            jy_SSE = _mm_load_pd(y_align+j);
            jz_SSE = _mm_load_pd(z_align+j);

            /* Calculate distance */
            dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE);
            dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE);
            dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE);
            dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE);
            dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE);
            dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE);

            rbaj_SSE = _mm_load_pd(rb+j);

            fgb_SSE0 = _mm_mul_pd(rbai_SSE0, _mm_load_pd(dadx));
            dadx += 2;
            fgb_SSE1 = _mm_mul_pd(rbai_SSE1, _mm_load_pd(dadx));
            dadx += 2;

            fgb_ai_SSE0 = _mm_mul_pd(rbaj_SSE, _mm_load_pd(dadx));
            dadx += 2;
            fgb_ai_SSE1 = _mm_mul_pd(rbaj_SSE, _mm_load_pd(dadx));
            dadx += 2;

            /* Total force between ai and aj is the sum of ai->aj and aj->ai */
            fgb_SSE0 = _mm_add_pd(fgb_SSE0, fgb_ai_SSE0);
            fgb_SSE1 = _mm_add_pd(fgb_SSE1, fgb_ai_SSE1);
            /* Calculate temporary vectorial force */
            tx_SSE0 = _mm_mul_pd(fgb_SSE0, dx_SSE0);
            ty_SSE0 = _mm_mul_pd(fgb_SSE0, dy_SSE0);
            tz_SSE0 = _mm_mul_pd(fgb_SSE0, dz_SSE0);
            tx_SSE1 = _mm_mul_pd(fgb_SSE1, dx_SSE1);
            ty_SSE1 = _mm_mul_pd(fgb_SSE1, dy_SSE1);
            tz_SSE1 = _mm_mul_pd(fgb_SSE1, dz_SSE1);

            /* Increment i atom force */
            fix_SSE0 = _mm_add_pd(fix_SSE0, tx_SSE0);
            fiy_SSE0 = _mm_add_pd(fiy_SSE0, ty_SSE0);
            fiz_SSE0 = _mm_add_pd(fiz_SSE0, tz_SSE0);
            fix_SSE1 = _mm_add_pd(fix_SSE1, tx_SSE1);
            fiy_SSE1 = _mm_add_pd(fiy_SSE1, ty_SSE1);
            fiz_SSE1 = _mm_add_pd(fiz_SSE1, tz_SSE1);

            /* Decrement j atom force */
            _mm_store_pd(fx_align+j,
                         _mm_sub_pd( _mm_load_pd(fx_align+j), _mm_add_pd(tx_SSE0, tx_SSE1) ));
            _mm_store_pd(fy_align+j,
                         _mm_sub_pd( _mm_load_pd(fy_align+j), _mm_add_pd(ty_SSE0, ty_SSE1) ));
            _mm_store_pd(fz_align+j,
                         _mm_sub_pd( _mm_load_pd(fz_align+j), _mm_add_pd(tz_SSE0, tz_SSE1) ));
        }
        /* Add i forces to mem */
        GMX_MM_TRANSPOSE2_PD(fix_SSE0, fix_SSE1);
        fix_SSE0 = _mm_add_pd(fix_SSE0, fix_SSE1);
        _mm_store_pd(fx_align+i, _mm_add_pd(fix_SSE0, _mm_load_pd(fx_align+i)));

        GMX_MM_TRANSPOSE2_PD(fiy_SSE0, fiy_SSE1);
        fiy_SSE0 = _mm_add_pd(fiy_SSE0, fiy_SSE1);
        _mm_store_pd(fy_align+i, _mm_add_pd(fiy_SSE0, _mm_load_pd(fy_align+i)));

        GMX_MM_TRANSPOSE2_PD(fiz_SSE0, fiz_SSE1);
        fiz_SSE0 = _mm_add_pd(fiz_SSE0, fiz_SSE1);
        _mm_store_pd(fz_align+i, _mm_add_pd(fiz_SSE0, _mm_load_pd(fz_align+i)));
    }
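    /* fx/fy/fz_align hold 2*natoms entries because the wrap-around all-vs-all
     * indexing can address j atoms past natoms; both halves therefore
     * contribute to the final force on each atom.
     */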
    for (i = 0; i < natoms; i++)
    {
        f[3*i]   += fx_align[i] + fx_align[natoms+i];
        f[3*i+1] += fy_align[i] + fy_align[natoms+i];
        f[3*i+2] += fz_align[i] + fz_align[natoms+i];
    }

    return 0;
}

#else

/* dummy variable when not using SSE */
int genborn_allvsall_sse2_double_dummy;

#endif