/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2009, The GROMACS Development Team.
 * Copyright (c) 2012, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "types/simple.h"

#include "genborn_allvsall.h"

#if 0 && defined (GMX_X86_SSE2)

#include <gmx_sse2_single.h>
typedef struct
{
    int      *jindex_gb;
    int     **prologue_mask_gb;
    int     **epilogue_mask;
    int      *imask;
    real     *gb_radius;
    real     *workparam;
    real     *work;
    real     *x_align;
    real     *y_align;
    real     *z_align;
    real     *fx_align;
    real     *fy_align;
    real     *fz_align;
}
gmx_allvsallgb2_data_t;
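
/* Layout notes for the structure above, as it is used later in this file:
 * jindex_gb stores four j-range limits per i atom (prologue start/end and
 * epilogue start/end), prologue_mask_gb and epilogue_mask hold one short
 * interaction mask per i atom for the two masked inner loops, and the real
 * arrays are 16-byte-aligned buffers padded past natoms so that SIMD loads
 * can safely run beyond the last atom.
 */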
static gmx_inline int
calc_maxoffset(int i, int natoms)
{
    int maxoffset;

    if ((natoms % 2) == 1)
    {
        /* Odd number of atoms, easy */
        maxoffset = natoms/2;
    }
    else if ((natoms % 4) == 0)
    {
        /* Multiple of four is hard */
        if (i < natoms/2)
        {
            if ((i % 2) == 0)
            {
                maxoffset = natoms/2;
            }
            else
            {
                maxoffset = natoms/2-1;
            }
        }
        else
        {
            if ((i % 2) == 1)
            {
                maxoffset = natoms/2;
            }
            else
            {
                maxoffset = natoms/2-1;
            }
        }
    }
    else
    {
        /* natoms/2 is odd */
        if ((i % 2) == 0)
        {
            maxoffset = natoms/2;
        }
        else
        {
            maxoffset = natoms/2-1;
        }
    }

    return maxoffset;
}
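
/* Each pair must be computed exactly once in an all-vs-all kernel, so atom i
 * only interacts with the roughly natoms/2 atoms that follow it, with
 * wraparound past the end of the list. For even natoms the half-shells are
 * balanced by alternating between natoms/2 and natoms/2-1; e.g. for
 * natoms = 8 the offsets come out as 4,3,4,3,3,4,3,4, which sum to
 * 28 = 8*7/2 unique pairs.
 */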
static void
setup_gb_exclusions_and_indices(gmx_allvsallgb2_data_t *aadata,
                                t_ilist                *ilist,
                                int                     start,
                                int                     end,
                                int                     natoms,
                                gmx_bool                bInclude12,
                                gmx_bool                bInclude13,
                                gmx_bool                bInclude14)
{
    int  i, j, k;
    int  a1, a2;
    int  ni0, ni1, nj0, nj1, nj;
    int  imin, imax, iexcl;
    int  max_offset;
    int  max_excl_offset;
    int  firstinteraction;
    int  ibase;
    int *pi;
    /* This routine can appear to be a bit complex, but it is mostly book-keeping.
     * To enable the fast all-vs-all kernel we need to be able to stream through all coordinates
     * whether they should interact or not.
     *
     * To avoid looping over the exclusions, we create a simple mask that is 1 if the interaction
     * should be present, otherwise 0. Since exclusions typically only occur when i & j are close,
     * we create a jindex array with four elements per i atom: the starting point, the point up to
     * which we need to check exclusions, and the start and end points of the epilogue.
     * This way we only have to allocate a short exclusion mask per i atom.
     */
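
    /* A sketch of the resulting j-range decomposition for one i atom
     * (the index values here are purely illustrative):
     *
     *   jindex_gb[4*i+0] = 8     prologue start (exclusion masks applied)
     *   jindex_gb[4*i+1] = 16    prologue end   = main-loop start (no masks)
     *   jindex_gb[4*i+2] = 48    main-loop end  = epilogue start
     *   jindex_gb[4*i+3] = 52    epilogue end
     *
     * so the kernel streams j = 8..15 with prologue masks, j = 16..47 with no
     * masks at all, and j = 48..51 with the epilogue mask that enforces
     * j <= i+maxoffset.
     */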
    ni0 = (start/UNROLLI)*UNROLLI;
    ni1 = ((end+UNROLLI-1)/UNROLLI)*UNROLLI;

    /* Set the interaction mask to only enable the i atoms we want to include */
    snew(pi, natoms+UNROLLI+2*SIMD_WIDTH);
    aadata->imask = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
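    /* The cast above rounds the freshly allocated pointer up to the next
     * 16-byte boundary: adding 16 and masking off the four low bits yields an
     * address suitable for aligned SSE loads/stores. The allocation is padded
     * by 2*SIMD_WIDTH elements so the shifted pointer always stays in bounds.
     */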
    for (i = 0; i < natoms+UNROLLI; i++)
    {
        aadata->imask[i] = (i >= start && i < end) ? 0xFFFFFFFF : 0;
    }

    /* Allocate memory for our modified jindex array */
    snew(aadata->jindex_gb, 4*(natoms+UNROLLI));
    for (i = 0; i < 4*(natoms+UNROLLI); i++)
    {
        aadata->jindex_gb[i] = 0;
    }

    /* Create the exclusion masks for the prologue part */
    snew(aadata->prologue_mask_gb, natoms+UNROLLI); /* list of pointers */

    /* First zero everything to avoid uninitialized data */
    for (i = 0; i < natoms+UNROLLI; i++)
    {
        aadata->prologue_mask_gb[i] = NULL;
    }
    /* Calculate the largest exclusion range we need for each UNROLLI-tuplet of i atoms. */
    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
    {
        max_excl_offset = -1;

        /* First find maxoffset for the next 4 atoms (or fewer if we are close to end) */
        imax = ((ibase+UNROLLI) < end) ? (ibase+UNROLLI) : end;

        /* Which atom is the first we (might) interact with? */
        imin = natoms; /* Guaranteed to be overwritten by one of 'firstinteraction' */
        for (i = ibase; i < imax; i++)
        {
            /* Before exclusions, which atom is the first we (might) interact with? */
            firstinteraction = i+1;
            max_offset       = calc_maxoffset(i, natoms);

            if (!bInclude12)
            {
                for (j = 0; j < ilist[F_GB12].nr; j += 3)
                {
                    a1 = ilist[F_GB12].iatoms[j+1];
                    a2 = ilist[F_GB12].iatoms[j+2];

                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k == firstinteraction)
                    {
                        firstinteraction++;
                    }
                }
            }
            if (!bInclude13)
            {
                for (j = 0; j < ilist[F_GB13].nr; j += 3)
                {
                    a1 = ilist[F_GB13].iatoms[j+1];
                    a2 = ilist[F_GB13].iatoms[j+2];

                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k == firstinteraction)
                    {
                        firstinteraction++;
                    }
                }
            }
            if (!bInclude14)
            {
                for (j = 0; j < ilist[F_GB14].nr; j += 3)
                {
                    a1 = ilist[F_GB14].iatoms[j+1];
                    a2 = ilist[F_GB14].iatoms[j+2];

                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k == firstinteraction)
                    {
                        firstinteraction++;
                    }
                }
            }
            imin = (firstinteraction < imin) ? firstinteraction : imin;
        }
        /* round down to j unrolling factor */
        imin = (imin/UNROLLJ)*UNROLLJ;
        for (i = ibase; i < imax; i++)
        {
            max_offset = calc_maxoffset(i, natoms);

            if (!bInclude12)
            {
                for (j = 0; j < ilist[F_GB12].nr; j += 3)
                {
                    a1 = ilist[F_GB12].iatoms[j+1];
                    a2 = ilist[F_GB12].iatoms[j+2];

                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k < i)
                    {
                        k += natoms;
                    }

                    if (k > i+max_offset)
                    {
                        continue;
                    }

                    k = k - i;

                    if (k+natoms <= max_offset)
                    {
                        k += natoms;
                    }
                    max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
                }
            }
            if (!bInclude13)
            {
                for (j = 0; j < ilist[F_GB13].nr; j += 3)
                {
                    a1 = ilist[F_GB13].iatoms[j+1];
                    a2 = ilist[F_GB13].iatoms[j+2];

                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k < i)
                    {
                        k += natoms;
                    }

                    if (k > i+max_offset)
                    {
                        continue;
                    }

                    k = k - i;

                    if (k+natoms <= max_offset)
                    {
                        k += natoms;
                    }
                    max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
                }
            }
            if (!bInclude14)
            {
                for (j = 0; j < ilist[F_GB14].nr; j += 3)
                {
                    a1 = ilist[F_GB14].iatoms[j+1];
                    a2 = ilist[F_GB14].iatoms[j+2];

                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k < i)
                    {
                        k += natoms;
                    }

                    if (k > i+max_offset)
                    {
                        continue;
                    }

                    k = k - i;

                    if (k+natoms <= max_offset)
                    {
                        k += natoms;
                    }
                    max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
                }
            }
        }
        /* The offset specifies the last atom to be excluded, so add one unit to get an upper loop limit */
        max_excl_offset++;
        /* round up to j unrolling factor */
        max_excl_offset = (max_excl_offset/UNROLLJ+1)*UNROLLJ;
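
        /* Example: with UNROLLJ = 4, an exclusion extending 5 atoms past imin
         * gives max_excl_offset = 6 after the increment, which rounds up to
         * (6/4+1)*4 = 8, i.e. two full SIMD j-tiles worth of prologue masks.
         */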
        /* Set all the prologue masks length to this value (even for i>end) */
        for (i = ibase; i < ibase+UNROLLI; i++)
        {
            aadata->jindex_gb[4*i]   = imin;
            aadata->jindex_gb[4*i+1] = imin+max_excl_offset;
        }
    }
    /* Now the hard part, loop over it all again to calculate the actual contents of the prologue masks */
    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
    {
        for (i = ibase; i < ibase+UNROLLI; i++)
        {
            nj   = aadata->jindex_gb[4*i+1] - aadata->jindex_gb[4*i];
            imin = aadata->jindex_gb[4*i];

            /* Allocate aligned memory */
            snew(pi, nj+2*SIMD_WIDTH);
            aadata->prologue_mask_gb[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));

            max_offset = calc_maxoffset(i, natoms);

            /* Include interactions i+1 <= j < i+maxoffset */
            for (k = 0; k < nj; k++)
            {
                j = imin + k;

                if ( (j > i) && (j <= i+max_offset) )
                {
                    aadata->prologue_mask_gb[i][k] = 0xFFFFFFFF;
                }
                else
                {
                    aadata->prologue_mask_gb[i][k] = 0;
                }
            }

            /* Clear out the explicit exclusions */
            if (!bInclude12)
            {
                for (j = 0; j < ilist[F_GB12].nr; j += 3)
                {
                    a1 = ilist[F_GB12].iatoms[j+1];
                    a2 = ilist[F_GB12].iatoms[j+2];

                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k > i+max_offset)
                    {
                        continue;
                    }
                    k = k - i;

                    if (k+natoms <= max_offset)
                    {
                        k += natoms;
                    }

                    k = k+i-imin;
                    if (k >= 0)
                    {
                        aadata->prologue_mask_gb[i][k] = 0;
                    }
                }
            }
            if (!bInclude13)
            {
                for (j = 0; j < ilist[F_GB13].nr; j += 3)
                {
                    a1 = ilist[F_GB13].iatoms[j+1];
                    a2 = ilist[F_GB13].iatoms[j+2];

                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k > i+max_offset)
                    {
                        continue;
                    }
                    k = k - i;

                    if (k+natoms <= max_offset)
                    {
                        k += natoms;
                    }

                    k = k+i-imin;
                    if (k >= 0)
                    {
                        aadata->prologue_mask_gb[i][k] = 0;
                    }
                }
            }
            if (!bInclude14)
            {
                for (j = 0; j < ilist[F_GB14].nr; j += 3)
                {
                    a1 = ilist[F_GB14].iatoms[j+1];
                    a2 = ilist[F_GB14].iatoms[j+2];

                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k > i+max_offset)
                    {
                        continue;
                    }
                    k = k - i;

                    if (k+natoms <= max_offset)
                    {
                        k += natoms;
                    }

                    k = k+i-imin;
                    if (k >= 0)
                    {
                        aadata->prologue_mask_gb[i][k] = 0;
                    }
                }
            }
        }
    }
    /* Construct the epilogue mask - this just contains the check for maxoffset */
    snew(aadata->epilogue_mask, natoms+UNROLLI);

    /* First zero everything to avoid uninitialized data */
    for (i = 0; i < natoms+UNROLLI; i++)
    {
        aadata->jindex_gb[4*i+2] = aadata->jindex_gb[4*i+1];
        aadata->jindex_gb[4*i+3] = aadata->jindex_gb[4*i+1];
        aadata->epilogue_mask[i] = NULL;
    }

    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
    {
        /* Find the lowest index for which we need to use the epilogue */
        imin       = ibase;
        max_offset = calc_maxoffset(imin, natoms);

        imin = imin + 1 + max_offset;

        /* Find largest index for which we need to use the epilogue */
        imax = ibase + UNROLLI-1;
        imax = (imax < end) ? imax : end;

        max_offset = calc_maxoffset(imax, natoms);
        imax       = imax + 1 + max_offset + UNROLLJ - 1;

        for (i = ibase; i < ibase+UNROLLI; i++)
        {
            /* Start of epilogue - round down to j tile limit */
            aadata->jindex_gb[4*i+2] = (imin/UNROLLJ)*UNROLLJ;
            /* Make sure we don't overlap - for small systems everything is done in the prologue */
            aadata->jindex_gb[4*i+2] = (aadata->jindex_gb[4*i+1] > aadata->jindex_gb[4*i+2]) ? aadata->jindex_gb[4*i+1] : aadata->jindex_gb[4*i+2];
            /* Round upwards to j tile limit */
            aadata->jindex_gb[4*i+3] = (imax/UNROLLJ)*UNROLLJ;
            /* Make sure we don't have a negative range for the epilogue */
            aadata->jindex_gb[4*i+3] = (aadata->jindex_gb[4*i+2] > aadata->jindex_gb[4*i+3]) ? aadata->jindex_gb[4*i+2] : aadata->jindex_gb[4*i+3];
        }
    }
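
    /* Note that for small systems the prologue can already cover every j atom
     * an i atom interacts with; the two max() clamps above then collapse the
     * epilogue to an empty range instead of letting it overlap the prologue
     * or run backwards.
     */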
    /* And fill it with data... */

    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
    {
        for (i = ibase; i < ibase+UNROLLI; i++)
        {
            nj = aadata->jindex_gb[4*i+3] - aadata->jindex_gb[4*i+2];

            /* Allocate aligned memory */
            snew(pi, nj+2*SIMD_WIDTH);
            aadata->epilogue_mask[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));

            max_offset = calc_maxoffset(i, natoms);

            for (k = 0; k < nj; k++)
            {
                j = aadata->jindex_gb[4*i+2] + k;
                aadata->epilogue_mask[i][k] = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
            }
        }
    }
}
static void
genborn_allvsall_setup(gmx_allvsallgb2_data_t **p_aadata,
                       gmx_localtop_t          *top,
                       gmx_genborn_t           *born,
                       t_mdatoms               *mdatoms,
                       real                     radius_offset,
                       int                      gb_algorithm,
                       gmx_bool                 bInclude12,
                       gmx_bool                 bInclude13,
                       gmx_bool                 bInclude14)
{
    int                     i, natoms;
    gmx_allvsallgb2_data_t *aadata;
    real                   *p;

    natoms = mdatoms->nr;

    snew(aadata, 1);
    *p_aadata = aadata;

    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->x_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->y_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->z_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->fx_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->fy_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->fz_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->gb_radius = (real *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->workparam = (real *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->work = (real *) (((size_t) p + 16) & (~((size_t) 15)));
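
    /* All per-atom arrays are sized 2*natoms plus padding: the second half
     * holds a copy of the first (filled just below), so the wraparound j loop
     * (j up to i+maxoffset, which can exceed natoms) can read gb_radius[j],
     * workparam[j] and friends without taking j modulo natoms.
     */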
    for (i = 0; i < mdatoms->nr; i++)
    {
        aadata->gb_radius[i] = top->atomtypes.gb_radius[mdatoms->typeA[i]] - radius_offset;
        if (gb_algorithm == egbSTILL)
        {
            aadata->workparam[i] = born->vsolv[i];
        }
        else if (gb_algorithm == egbOBC)
        {
            aadata->workparam[i] = born->param[i];
        }
        aadata->work[i] = 0.0;
    }
    for (i = 0; i < mdatoms->nr; i++)
    {
        aadata->gb_radius[natoms+i] = aadata->gb_radius[i];
        aadata->workparam[natoms+i] = aadata->workparam[i];
        aadata->work[natoms+i]      = aadata->work[i];
    }
    for (i = 0; i < 2*natoms+SIMD_WIDTH; i++)
    {
        aadata->x_align[i]  = 0.0;
        aadata->y_align[i]  = 0.0;
        aadata->z_align[i]  = 0.0;
        aadata->fx_align[i] = 0.0;
        aadata->fy_align[i] = 0.0;
        aadata->fz_align[i] = 0.0;
    }

    setup_gb_exclusions_and_indices(aadata, top->idef.il, mdatoms->start, mdatoms->start+mdatoms->homenr, mdatoms->nr,
                                    bInclude12, bInclude13, bInclude14);
}
int
genborn_allvsall_calc_still_radii_sse2_single(t_forcerec     *fr,
                                              t_mdatoms      *mdatoms,
                                              gmx_genborn_t  *born,
                                              gmx_localtop_t *top,
                                              real           *x,
                                              t_commrec      *cr,
                                              void           *paadata)
{
    gmx_allvsallgb2_data_t *aadata;
    int                     natoms;
    int                     ni0, ni1;
    int                     nj0, nj1, nj2, nj3;
    int                     i, j, k;
    real                   *x_align;
    real                   *y_align;
    real                   *z_align;
    int                    *jindex;
    real                   *dadx;
    real                   *work;
    real                   *gb_radius;
    real                   *vsolv;
    int                    *pmask0, *pmask1, *pmask2, *pmask3;
    int                    *emask0, *emask1, *emask2, *emask3;
    real                    gpi;
    real                    irsq, idr4, idr6;
    real                    raj, rvdw, ratio;
    real                    vaj, ccf, dccf, theta, cosq;
    real                    term, prod, icf4, icf6, gpi2, factor, sinq;
    __m128 ix_SSE0, iy_SSE0, iz_SSE0;
    __m128 ix_SSE1, iy_SSE1, iz_SSE1;
    __m128 ix_SSE2, iy_SSE2, iz_SSE2;
    __m128 ix_SSE3, iy_SSE3, iz_SSE3;
    __m128 gpi_SSE0, rai_SSE0, prod_ai_SSE0;
    __m128 gpi_SSE1, rai_SSE1, prod_ai_SSE1;
    __m128 gpi_SSE2, rai_SSE2, prod_ai_SSE2;
    __m128 gpi_SSE3, rai_SSE3, prod_ai_SSE3;
    __m128 imask_SSE0, jmask_SSE0;
    __m128 imask_SSE1, jmask_SSE1;
    __m128 imask_SSE2, jmask_SSE2;
    __m128 imask_SSE3, jmask_SSE3;
    __m128 jx_SSE, jy_SSE, jz_SSE;
    __m128 dx_SSE0, dy_SSE0, dz_SSE0;
    __m128 dx_SSE1, dy_SSE1, dz_SSE1;
    __m128 dx_SSE2, dy_SSE2, dz_SSE2;
    __m128 dx_SSE3, dy_SSE3, dz_SSE3;
    __m128 rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0;
    __m128 rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1;
    __m128 rsq_SSE2, rinv_SSE2, irsq_SSE2, idr4_SSE2, idr6_SSE2;
    __m128 rsq_SSE3, rinv_SSE3, irsq_SSE3, idr4_SSE3, idr6_SSE3;
    __m128 raj_SSE, vaj_SSE, prod_SSE;
    __m128 rvdw_SSE0, ratio_SSE0;
    __m128 rvdw_SSE1, ratio_SSE1;
    __m128 rvdw_SSE2, ratio_SSE2;
    __m128 rvdw_SSE3, ratio_SSE3;
    __m128 theta_SSE0, sinq_SSE0, cosq_SSE0, term_SSE0;
    __m128 theta_SSE1, sinq_SSE1, cosq_SSE1, term_SSE1;
    __m128 theta_SSE2, sinq_SSE2, cosq_SSE2, term_SSE2;
    __m128 theta_SSE3, sinq_SSE3, cosq_SSE3, term_SSE3;
    __m128 ccf_SSE0, dccf_SSE0;
    __m128 ccf_SSE1, dccf_SSE1;
    __m128 ccf_SSE2, dccf_SSE2;
    __m128 ccf_SSE3, dccf_SSE3;
    __m128 icf4_SSE0, icf6_SSE0;
    __m128 icf4_SSE1, icf6_SSE1;
    __m128 icf4_SSE2, icf6_SSE2;
    __m128 icf4_SSE3, icf6_SSE3;
    __m128 half_SSE, one_SSE, two_SSE, four_SSE;
    __m128 still_p4_SSE, still_p5inv_SSE, still_pip5_SSE;
    natoms = mdatoms->nr;
    ni0    = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
    ni1    = mdatoms->start+mdatoms->homenr;

    aadata = *((gmx_allvsallgb2_data_t **)paadata);

    if (aadata == NULL)
    {
        genborn_allvsall_setup(&aadata, top, born, mdatoms, 0.0,
                               egbSTILL, FALSE, FALSE, TRUE);
        *((gmx_allvsallgb2_data_t **)paadata) = aadata;
    }
    x_align = aadata->x_align;
    y_align = aadata->y_align;
    z_align = aadata->z_align;

    gb_radius = aadata->gb_radius;
    vsolv     = aadata->workparam;
    work      = aadata->work;
    jindex    = aadata->jindex_gb;
    dadx      = fr->dadx;

    still_p4_SSE    = _mm_set1_ps(STILL_P4);
    still_p5inv_SSE = _mm_set1_ps(STILL_P5INV);
    still_pip5_SSE  = _mm_set1_ps(STILL_PIP5);
    half_SSE        = _mm_set1_ps(0.5);
    one_SSE         = _mm_set1_ps(1.0);
    two_SSE         = _mm_set1_ps(2.0);
    four_SSE        = _mm_set1_ps(4.0);

    /* This will be summed, so it has to extend to natoms + buffer */
    for (i = 0; i < natoms+1+natoms/2; i++)
    {
        work[i] = 0;
    }

    for (i = ni0; i < ni1+1+natoms/2; i++)
    {
        k          = i % natoms;
        x_align[i] = x[3*k];
        y_align[i] = x[3*k+1];
        z_align[i] = x[3*k+2];
    }
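
    /* With the duplicated storage, atom j and its wrapped image j+natoms hold
     * identical coordinates, e.g. x_align[natoms+2] == x_align[2], so the
     * loops below can run j = i+1 ... i+maxoffset without a modulo operation.
     */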
    for (i = ni0; i < ni1; i += UNROLLI)
    {
        /* We assume shifts are NOT used for all-vs-all interactions */

        /* Load i atom data */
        ix_SSE0 = _mm_load1_ps(x_align+i);
        iy_SSE0 = _mm_load1_ps(y_align+i);
        iz_SSE0 = _mm_load1_ps(z_align+i);
        ix_SSE1 = _mm_load1_ps(x_align+i+1);
        iy_SSE1 = _mm_load1_ps(y_align+i+1);
        iz_SSE1 = _mm_load1_ps(z_align+i+1);
        ix_SSE2 = _mm_load1_ps(x_align+i+2);
        iy_SSE2 = _mm_load1_ps(y_align+i+2);
        iz_SSE2 = _mm_load1_ps(z_align+i+2);
        ix_SSE3 = _mm_load1_ps(x_align+i+3);
        iy_SSE3 = _mm_load1_ps(y_align+i+3);
        iz_SSE3 = _mm_load1_ps(z_align+i+3);

        gpi_SSE0 = _mm_setzero_ps();
        gpi_SSE1 = _mm_setzero_ps();
        gpi_SSE2 = _mm_setzero_ps();
        gpi_SSE3 = _mm_setzero_ps();

        rai_SSE0 = _mm_load1_ps(gb_radius+i);
        rai_SSE1 = _mm_load1_ps(gb_radius+i+1);
        rai_SSE2 = _mm_load1_ps(gb_radius+i+2);
        rai_SSE3 = _mm_load1_ps(gb_radius+i+3);

        prod_ai_SSE0 = _mm_set1_ps(STILL_P4*vsolv[i]);
        prod_ai_SSE1 = _mm_set1_ps(STILL_P4*vsolv[i+1]);
        prod_ai_SSE2 = _mm_set1_ps(STILL_P4*vsolv[i+2]);
        prod_ai_SSE3 = _mm_set1_ps(STILL_P4*vsolv[i+3]);
        /* Load limits for loop over neighbors */
        nj0 = jindex[4*i];
        nj1 = jindex[4*i+1];
        nj2 = jindex[4*i+2];
        nj3 = jindex[4*i+3];

        pmask0 = aadata->prologue_mask_gb[i];
        pmask1 = aadata->prologue_mask_gb[i+1];
        pmask2 = aadata->prologue_mask_gb[i+2];
        pmask3 = aadata->prologue_mask_gb[i+3];
        emask0 = aadata->epilogue_mask[i];
        emask1 = aadata->epilogue_mask[i+1];
        emask2 = aadata->epilogue_mask[i+2];
        emask3 = aadata->epilogue_mask[i+3];

        imask_SSE0 = _mm_load1_ps((real *)(aadata->imask+i));
        imask_SSE1 = _mm_load1_ps((real *)(aadata->imask+i+1));
        imask_SSE2 = _mm_load1_ps((real *)(aadata->imask+i+2));
        imask_SSE3 = _mm_load1_ps((real *)(aadata->imask+i+3));
        /* Prologue part, including exclusion mask */
        for (j = nj0; j < nj1; j += UNROLLJ)
        {
            jmask_SSE0 = _mm_load_ps((real *)pmask0);
            jmask_SSE1 = _mm_load_ps((real *)pmask1);
            jmask_SSE2 = _mm_load_ps((real *)pmask2);
            jmask_SSE3 = _mm_load_ps((real *)pmask3);
            pmask0    += UNROLLJ;
            pmask1    += UNROLLJ;
            pmask2    += UNROLLJ;
            pmask3    += UNROLLJ;

            /* load j atom coordinates */
            jx_SSE = _mm_load_ps(x_align+j);
            jy_SSE = _mm_load_ps(y_align+j);
            jz_SSE = _mm_load_ps(z_align+j);

            /* Calculate distance */
            dx_SSE0 = _mm_sub_ps(ix_SSE0, jx_SSE);
            dy_SSE0 = _mm_sub_ps(iy_SSE0, jy_SSE);
            dz_SSE0 = _mm_sub_ps(iz_SSE0, jz_SSE);
            dx_SSE1 = _mm_sub_ps(ix_SSE1, jx_SSE);
            dy_SSE1 = _mm_sub_ps(iy_SSE1, jy_SSE);
            dz_SSE1 = _mm_sub_ps(iz_SSE1, jz_SSE);
            dx_SSE2 = _mm_sub_ps(ix_SSE2, jx_SSE);
            dy_SSE2 = _mm_sub_ps(iy_SSE2, jy_SSE);
            dz_SSE2 = _mm_sub_ps(iz_SSE2, jz_SSE);
            dx_SSE3 = _mm_sub_ps(ix_SSE3, jx_SSE);
            dy_SSE3 = _mm_sub_ps(iy_SSE3, jy_SSE);
            dz_SSE3 = _mm_sub_ps(iz_SSE3, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
            rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
            rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);

            /* Combine masks */
            jmask_SSE0 = _mm_and_ps(jmask_SSE0, imask_SSE0);
            jmask_SSE1 = _mm_and_ps(jmask_SSE1, imask_SSE1);
            jmask_SSE2 = _mm_and_ps(jmask_SSE2, imask_SSE2);
            jmask_SSE3 = _mm_and_ps(jmask_SSE3, imask_SSE3);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0);
            rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1);
            rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2);
            rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3);

            /* Apply mask */
            rinv_SSE0 = _mm_and_ps(rinv_SSE0, jmask_SSE0);
            rinv_SSE1 = _mm_and_ps(rinv_SSE1, jmask_SSE1);
            rinv_SSE2 = _mm_and_ps(rinv_SSE2, jmask_SSE2);
            rinv_SSE3 = _mm_and_ps(rinv_SSE3, jmask_SSE3);

            irsq_SSE0 = _mm_mul_ps(rinv_SSE0, rinv_SSE0);
            irsq_SSE1 = _mm_mul_ps(rinv_SSE1, rinv_SSE1);
            irsq_SSE2 = _mm_mul_ps(rinv_SSE2, rinv_SSE2);
            irsq_SSE3 = _mm_mul_ps(rinv_SSE3, rinv_SSE3);
            idr4_SSE0 = _mm_mul_ps(irsq_SSE0, irsq_SSE0);
            idr4_SSE1 = _mm_mul_ps(irsq_SSE1, irsq_SSE1);
            idr4_SSE2 = _mm_mul_ps(irsq_SSE2, irsq_SSE2);
            idr4_SSE3 = _mm_mul_ps(irsq_SSE3, irsq_SSE3);
            idr6_SSE0 = _mm_mul_ps(idr4_SSE0, irsq_SSE0);
            idr6_SSE1 = _mm_mul_ps(idr4_SSE1, irsq_SSE1);
            idr6_SSE2 = _mm_mul_ps(idr4_SSE2, irsq_SSE2);
            idr6_SSE3 = _mm_mul_ps(idr4_SSE3, irsq_SSE3);

            raj_SSE = _mm_load_ps(gb_radius+j);
            vaj_SSE = _mm_load_ps(vsolv+j);

            rvdw_SSE0 = _mm_add_ps(rai_SSE0, raj_SSE);
            rvdw_SSE1 = _mm_add_ps(rai_SSE1, raj_SSE);
            rvdw_SSE2 = _mm_add_ps(rai_SSE2, raj_SSE);
            rvdw_SSE3 = _mm_add_ps(rai_SSE3, raj_SSE);

            ratio_SSE0 = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0, rvdw_SSE0)));
            ratio_SSE1 = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1, rvdw_SSE1)));
            ratio_SSE2 = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2, rvdw_SSE2)));
            ratio_SSE3 = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3, rvdw_SSE3)));

            ratio_SSE0 = _mm_min_ps(ratio_SSE0, still_p5inv_SSE);
            ratio_SSE1 = _mm_min_ps(ratio_SSE1, still_p5inv_SSE);
            ratio_SSE2 = _mm_min_ps(ratio_SSE2, still_p5inv_SSE);
            ratio_SSE3 = _mm_min_ps(ratio_SSE3, still_p5inv_SSE);
            theta_SSE0 = _mm_mul_ps(ratio_SSE0, still_pip5_SSE);
            theta_SSE1 = _mm_mul_ps(ratio_SSE1, still_pip5_SSE);
            theta_SSE2 = _mm_mul_ps(ratio_SSE2, still_pip5_SSE);
            theta_SSE3 = _mm_mul_ps(ratio_SSE3, still_pip5_SSE);
            gmx_mm_sincos_ps(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
            gmx_mm_sincos_ps(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
            gmx_mm_sincos_ps(theta_SSE2, &sinq_SSE2, &cosq_SSE2);
            gmx_mm_sincos_ps(theta_SSE3, &sinq_SSE3, &cosq_SSE3);
            term_SSE0 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE0));
            term_SSE1 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE1));
            term_SSE2 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE2));
            term_SSE3 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE3));
            ccf_SSE0  = _mm_mul_ps(term_SSE0, term_SSE0);
            ccf_SSE1  = _mm_mul_ps(term_SSE1, term_SSE1);
            ccf_SSE2  = _mm_mul_ps(term_SSE2, term_SSE2);
            ccf_SSE3  = _mm_mul_ps(term_SSE3, term_SSE3);
            dccf_SSE0 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE0),
                                   _mm_mul_ps(sinq_SSE0, theta_SSE0));
            dccf_SSE1 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE1),
                                   _mm_mul_ps(sinq_SSE1, theta_SSE1));
            dccf_SSE2 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE2),
                                   _mm_mul_ps(sinq_SSE2, theta_SSE2));
            dccf_SSE3 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE3),
                                   _mm_mul_ps(sinq_SSE3, theta_SSE3));
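
            /* This is the Still close-contact function: with
             * ratio = r^2/(rvdw_i+rvdw_j)^2 capped at 1/P5 and
             * theta = ratio*pi*P5, the blocks above compute
             * ccf  = (0.5*(1 - cos(theta)))^2 and
             * dccf = 2*(0.5*(1 - cos(theta)))*sin(theta)*theta,
             * which scale the 1/r^4 contribution smoothly down towards zero
             * for closely overlapping pairs and are constant beyond the cap.
             */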
            prod_SSE  = _mm_mul_ps(still_p4_SSE, vaj_SSE);
            icf4_SSE0 = _mm_mul_ps(ccf_SSE0, idr4_SSE0);
            icf4_SSE1 = _mm_mul_ps(ccf_SSE1, idr4_SSE1);
            icf4_SSE2 = _mm_mul_ps(ccf_SSE2, idr4_SSE2);
            icf4_SSE3 = _mm_mul_ps(ccf_SSE3, idr4_SSE3);
            icf6_SSE0 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
            icf6_SSE1 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);
            icf6_SSE2 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE2), dccf_SSE2), idr6_SSE2);
            icf6_SSE3 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE3), dccf_SSE3), idr6_SSE3);

            _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
                                            gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0, icf4_SSE0),
                                                           _mm_mul_ps(prod_ai_SSE1, icf4_SSE1),
                                                           _mm_mul_ps(prod_ai_SSE2, icf4_SSE2),
                                                           _mm_mul_ps(prod_ai_SSE3, icf4_SSE3))));

            gpi_SSE0 = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE, icf4_SSE0));
            gpi_SSE1 = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE, icf4_SSE1));
            gpi_SSE2 = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE, icf4_SSE2));
            gpi_SSE3 = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE, icf4_SSE3));
            /* Save ai->aj and aj->ai chain rule terms */
            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE0));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE1));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE2));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE3));
            dadx += 4;

            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE0, icf6_SSE0));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE1, icf6_SSE1));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE2, icf6_SSE2));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE3, icf6_SSE3));
            dadx += 4;
        }
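
        /* dadx is written as a flat stream, eight 4-wide stores per
         * (i-quartet, j-tile) iteration: first the four ai->aj chain rule
         * terms, then the four aj->ai terms. The corresponding force kernel
         * is expected to consume the stream in this exact order with the
         * same loop structure.
         */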
        /* Main part, no exclusions */
        for (j = nj1; j < nj2; j += UNROLLJ)
        {
            /* load j atom coordinates */
            jx_SSE = _mm_load_ps(x_align+j);
            jy_SSE = _mm_load_ps(y_align+j);
            jz_SSE = _mm_load_ps(z_align+j);

            /* Calculate distance */
            dx_SSE0 = _mm_sub_ps(ix_SSE0, jx_SSE);
            dy_SSE0 = _mm_sub_ps(iy_SSE0, jy_SSE);
            dz_SSE0 = _mm_sub_ps(iz_SSE0, jz_SSE);
            dx_SSE1 = _mm_sub_ps(ix_SSE1, jx_SSE);
            dy_SSE1 = _mm_sub_ps(iy_SSE1, jy_SSE);
            dz_SSE1 = _mm_sub_ps(iz_SSE1, jz_SSE);
            dx_SSE2 = _mm_sub_ps(ix_SSE2, jx_SSE);
            dy_SSE2 = _mm_sub_ps(iy_SSE2, jy_SSE);
            dz_SSE2 = _mm_sub_ps(iz_SSE2, jz_SSE);
            dx_SSE3 = _mm_sub_ps(ix_SSE3, jx_SSE);
            dy_SSE3 = _mm_sub_ps(iy_SSE3, jy_SSE);
            dz_SSE3 = _mm_sub_ps(iz_SSE3, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
            rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
            rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0);
            rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1);
            rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2);
            rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3);

            /* Apply mask */
            rinv_SSE0 = _mm_and_ps(rinv_SSE0, imask_SSE0);
            rinv_SSE1 = _mm_and_ps(rinv_SSE1, imask_SSE1);
            rinv_SSE2 = _mm_and_ps(rinv_SSE2, imask_SSE2);
            rinv_SSE3 = _mm_and_ps(rinv_SSE3, imask_SSE3);

            irsq_SSE0 = _mm_mul_ps(rinv_SSE0, rinv_SSE0);
            irsq_SSE1 = _mm_mul_ps(rinv_SSE1, rinv_SSE1);
            irsq_SSE2 = _mm_mul_ps(rinv_SSE2, rinv_SSE2);
            irsq_SSE3 = _mm_mul_ps(rinv_SSE3, rinv_SSE3);
            idr4_SSE0 = _mm_mul_ps(irsq_SSE0, irsq_SSE0);
            idr4_SSE1 = _mm_mul_ps(irsq_SSE1, irsq_SSE1);
            idr4_SSE2 = _mm_mul_ps(irsq_SSE2, irsq_SSE2);
            idr4_SSE3 = _mm_mul_ps(irsq_SSE3, irsq_SSE3);
            idr6_SSE0 = _mm_mul_ps(idr4_SSE0, irsq_SSE0);
            idr6_SSE1 = _mm_mul_ps(idr4_SSE1, irsq_SSE1);
            idr6_SSE2 = _mm_mul_ps(idr4_SSE2, irsq_SSE2);
            idr6_SSE3 = _mm_mul_ps(idr4_SSE3, irsq_SSE3);

            raj_SSE = _mm_load_ps(gb_radius+j);

            rvdw_SSE0 = _mm_add_ps(rai_SSE0, raj_SSE);
            rvdw_SSE1 = _mm_add_ps(rai_SSE1, raj_SSE);
            rvdw_SSE2 = _mm_add_ps(rai_SSE2, raj_SSE);
            rvdw_SSE3 = _mm_add_ps(rai_SSE3, raj_SSE);
            vaj_SSE   = _mm_load_ps(vsolv+j);

            ratio_SSE0 = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0, rvdw_SSE0)));
            ratio_SSE1 = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1, rvdw_SSE1)));
            ratio_SSE2 = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2, rvdw_SSE2)));
            ratio_SSE3 = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3, rvdw_SSE3)));

            ratio_SSE0 = _mm_min_ps(ratio_SSE0, still_p5inv_SSE);
            ratio_SSE1 = _mm_min_ps(ratio_SSE1, still_p5inv_SSE);
            ratio_SSE2 = _mm_min_ps(ratio_SSE2, still_p5inv_SSE);
            ratio_SSE3 = _mm_min_ps(ratio_SSE3, still_p5inv_SSE);
            theta_SSE0 = _mm_mul_ps(ratio_SSE0, still_pip5_SSE);
            theta_SSE1 = _mm_mul_ps(ratio_SSE1, still_pip5_SSE);
            theta_SSE2 = _mm_mul_ps(ratio_SSE2, still_pip5_SSE);
            theta_SSE3 = _mm_mul_ps(ratio_SSE3, still_pip5_SSE);
            gmx_mm_sincos_ps(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
            gmx_mm_sincos_ps(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
            gmx_mm_sincos_ps(theta_SSE2, &sinq_SSE2, &cosq_SSE2);
            gmx_mm_sincos_ps(theta_SSE3, &sinq_SSE3, &cosq_SSE3);
            term_SSE0 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE0));
            term_SSE1 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE1));
            term_SSE2 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE2));
            term_SSE3 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE3));
            ccf_SSE0  = _mm_mul_ps(term_SSE0, term_SSE0);
            ccf_SSE1  = _mm_mul_ps(term_SSE1, term_SSE1);
            ccf_SSE2  = _mm_mul_ps(term_SSE2, term_SSE2);
            ccf_SSE3  = _mm_mul_ps(term_SSE3, term_SSE3);
            dccf_SSE0 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE0),
                                   _mm_mul_ps(sinq_SSE0, theta_SSE0));
            dccf_SSE1 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE1),
                                   _mm_mul_ps(sinq_SSE1, theta_SSE1));
            dccf_SSE2 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE2),
                                   _mm_mul_ps(sinq_SSE2, theta_SSE2));
            dccf_SSE3 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE3),
                                   _mm_mul_ps(sinq_SSE3, theta_SSE3));

            prod_SSE  = _mm_mul_ps(still_p4_SSE, vaj_SSE);
            icf4_SSE0 = _mm_mul_ps(ccf_SSE0, idr4_SSE0);
            icf4_SSE1 = _mm_mul_ps(ccf_SSE1, idr4_SSE1);
            icf4_SSE2 = _mm_mul_ps(ccf_SSE2, idr4_SSE2);
            icf4_SSE3 = _mm_mul_ps(ccf_SSE3, idr4_SSE3);
            icf6_SSE0 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
            icf6_SSE1 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);
            icf6_SSE2 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE2), dccf_SSE2), idr6_SSE2);
            icf6_SSE3 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE3), dccf_SSE3), idr6_SSE3);

            _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
                                            gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0, icf4_SSE0),
                                                           _mm_mul_ps(prod_ai_SSE1, icf4_SSE1),
                                                           _mm_mul_ps(prod_ai_SSE2, icf4_SSE2),
                                                           _mm_mul_ps(prod_ai_SSE3, icf4_SSE3))));

            gpi_SSE0 = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE, icf4_SSE0));
            gpi_SSE1 = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE, icf4_SSE1));
            gpi_SSE2 = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE, icf4_SSE2));
            gpi_SSE3 = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE, icf4_SSE3));

            /* Save ai->aj and aj->ai chain rule terms */
            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE0));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE1));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE2));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE3));
            dadx += 4;

            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE0, icf6_SSE0));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE1, icf6_SSE1));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE2, icf6_SSE2));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE3, icf6_SSE3));
            dadx += 4;
        }
        /* Epilogue part, including exclusion mask */
        for (j = nj2; j < nj3; j += UNROLLJ)
        {
            jmask_SSE0 = _mm_load_ps((real *)emask0);
            jmask_SSE1 = _mm_load_ps((real *)emask1);
            jmask_SSE2 = _mm_load_ps((real *)emask2);
            jmask_SSE3 = _mm_load_ps((real *)emask3);
            emask0    += UNROLLJ;
            emask1    += UNROLLJ;
            emask2    += UNROLLJ;
            emask3    += UNROLLJ;

            /* load j atom coordinates */
            jx_SSE = _mm_load_ps(x_align+j);
            jy_SSE = _mm_load_ps(y_align+j);
            jz_SSE = _mm_load_ps(z_align+j);

            /* Calculate distance */
            dx_SSE0 = _mm_sub_ps(ix_SSE0, jx_SSE);
            dy_SSE0 = _mm_sub_ps(iy_SSE0, jy_SSE);
            dz_SSE0 = _mm_sub_ps(iz_SSE0, jz_SSE);
            dx_SSE1 = _mm_sub_ps(ix_SSE1, jx_SSE);
            dy_SSE1 = _mm_sub_ps(iy_SSE1, jy_SSE);
            dz_SSE1 = _mm_sub_ps(iz_SSE1, jz_SSE);
            dx_SSE2 = _mm_sub_ps(ix_SSE2, jx_SSE);
            dy_SSE2 = _mm_sub_ps(iy_SSE2, jy_SSE);
            dz_SSE2 = _mm_sub_ps(iz_SSE2, jz_SSE);
            dx_SSE3 = _mm_sub_ps(ix_SSE3, jx_SSE);
            dy_SSE3 = _mm_sub_ps(iy_SSE3, jy_SSE);
            dz_SSE3 = _mm_sub_ps(iz_SSE3, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
            rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
            rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);

            /* Combine masks */
            jmask_SSE0 = _mm_and_ps(jmask_SSE0, imask_SSE0);
            jmask_SSE1 = _mm_and_ps(jmask_SSE1, imask_SSE1);
            jmask_SSE2 = _mm_and_ps(jmask_SSE2, imask_SSE2);
            jmask_SSE3 = _mm_and_ps(jmask_SSE3, imask_SSE3);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0);
            rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1);
            rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2);
            rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3);

            /* Apply mask */
            rinv_SSE0 = _mm_and_ps(rinv_SSE0, jmask_SSE0);
            rinv_SSE1 = _mm_and_ps(rinv_SSE1, jmask_SSE1);
            rinv_SSE2 = _mm_and_ps(rinv_SSE2, jmask_SSE2);
            rinv_SSE3 = _mm_and_ps(rinv_SSE3, jmask_SSE3);

            irsq_SSE0 = _mm_mul_ps(rinv_SSE0, rinv_SSE0);
            irsq_SSE1 = _mm_mul_ps(rinv_SSE1, rinv_SSE1);
            irsq_SSE2 = _mm_mul_ps(rinv_SSE2, rinv_SSE2);
            irsq_SSE3 = _mm_mul_ps(rinv_SSE3, rinv_SSE3);
            idr4_SSE0 = _mm_mul_ps(irsq_SSE0, irsq_SSE0);
            idr4_SSE1 = _mm_mul_ps(irsq_SSE1, irsq_SSE1);
            idr4_SSE2 = _mm_mul_ps(irsq_SSE2, irsq_SSE2);
            idr4_SSE3 = _mm_mul_ps(irsq_SSE3, irsq_SSE3);
            idr6_SSE0 = _mm_mul_ps(idr4_SSE0, irsq_SSE0);
            idr6_SSE1 = _mm_mul_ps(idr4_SSE1, irsq_SSE1);
            idr6_SSE2 = _mm_mul_ps(idr4_SSE2, irsq_SSE2);
            idr6_SSE3 = _mm_mul_ps(idr4_SSE3, irsq_SSE3);

            raj_SSE = _mm_load_ps(gb_radius+j);
            vaj_SSE = _mm_load_ps(vsolv+j);

            rvdw_SSE0 = _mm_add_ps(rai_SSE0, raj_SSE);
            rvdw_SSE1 = _mm_add_ps(rai_SSE1, raj_SSE);
            rvdw_SSE2 = _mm_add_ps(rai_SSE2, raj_SSE);
            rvdw_SSE3 = _mm_add_ps(rai_SSE3, raj_SSE);

            ratio_SSE0 = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0, rvdw_SSE0)));
            ratio_SSE1 = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1, rvdw_SSE1)));
            ratio_SSE2 = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2, rvdw_SSE2)));
            ratio_SSE3 = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3, rvdw_SSE3)));

            ratio_SSE0 = _mm_min_ps(ratio_SSE0, still_p5inv_SSE);
            ratio_SSE1 = _mm_min_ps(ratio_SSE1, still_p5inv_SSE);
            ratio_SSE2 = _mm_min_ps(ratio_SSE2, still_p5inv_SSE);
            ratio_SSE3 = _mm_min_ps(ratio_SSE3, still_p5inv_SSE);
            theta_SSE0 = _mm_mul_ps(ratio_SSE0, still_pip5_SSE);
            theta_SSE1 = _mm_mul_ps(ratio_SSE1, still_pip5_SSE);
            theta_SSE2 = _mm_mul_ps(ratio_SSE2, still_pip5_SSE);
            theta_SSE3 = _mm_mul_ps(ratio_SSE3, still_pip5_SSE);
            gmx_mm_sincos_ps(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
            gmx_mm_sincos_ps(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
            gmx_mm_sincos_ps(theta_SSE2, &sinq_SSE2, &cosq_SSE2);
            gmx_mm_sincos_ps(theta_SSE3, &sinq_SSE3, &cosq_SSE3);
            term_SSE0 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE0));
            term_SSE1 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE1));
            term_SSE2 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE2));
            term_SSE3 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE3));
            ccf_SSE0  = _mm_mul_ps(term_SSE0, term_SSE0);
            ccf_SSE1  = _mm_mul_ps(term_SSE1, term_SSE1);
            ccf_SSE2  = _mm_mul_ps(term_SSE2, term_SSE2);
            ccf_SSE3  = _mm_mul_ps(term_SSE3, term_SSE3);
            dccf_SSE0 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE0),
                                   _mm_mul_ps(sinq_SSE0, theta_SSE0));
            dccf_SSE1 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE1),
                                   _mm_mul_ps(sinq_SSE1, theta_SSE1));
            dccf_SSE2 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE2),
                                   _mm_mul_ps(sinq_SSE2, theta_SSE2));
            dccf_SSE3 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE3),
                                   _mm_mul_ps(sinq_SSE3, theta_SSE3));

            prod_SSE  = _mm_mul_ps(still_p4_SSE, vaj_SSE);
            icf4_SSE0 = _mm_mul_ps(ccf_SSE0, idr4_SSE0);
            icf4_SSE1 = _mm_mul_ps(ccf_SSE1, idr4_SSE1);
            icf4_SSE2 = _mm_mul_ps(ccf_SSE2, idr4_SSE2);
            icf4_SSE3 = _mm_mul_ps(ccf_SSE3, idr4_SSE3);
            icf6_SSE0 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
            icf6_SSE1 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);
            icf6_SSE2 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE2), dccf_SSE2), idr6_SSE2);
            icf6_SSE3 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE3), dccf_SSE3), idr6_SSE3);

            _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
                                            gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0, icf4_SSE0),
                                                           _mm_mul_ps(prod_ai_SSE1, icf4_SSE1),
                                                           _mm_mul_ps(prod_ai_SSE2, icf4_SSE2),
                                                           _mm_mul_ps(prod_ai_SSE3, icf4_SSE3))));

            gpi_SSE0 = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE, icf4_SSE0));
            gpi_SSE1 = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE, icf4_SSE1));
            gpi_SSE2 = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE, icf4_SSE2));
            gpi_SSE3 = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE, icf4_SSE3));

            /* Save ai->aj and aj->ai chain rule terms */
            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE0));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE1));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE2));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE3));
            dadx += 4;

            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE0, icf6_SSE0));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE1, icf6_SSE1));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE2, icf6_SSE2));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE3, icf6_SSE3));
            dadx += 4;
        }
        /* Transpose and horizontally sum the four i accumulators, then add into work[i..i+3] */
        _MM_TRANSPOSE4_PS(gpi_SSE0, gpi_SSE1, gpi_SSE2, gpi_SSE3);
        gpi_SSE0 = _mm_add_ps(gpi_SSE0, gpi_SSE1);
        gpi_SSE2 = _mm_add_ps(gpi_SSE2, gpi_SSE3);
        gpi_SSE0 = _mm_add_ps(gpi_SSE0, gpi_SSE2);
        _mm_store_ps(work+i, _mm_add_ps(gpi_SSE0, _mm_load_ps(work+i)));
    }
    /* In case we have written anything beyond natoms, move it back.
     * Never mind that we leave stuff above natoms; that will not
     * be accessed later in the routine.
     * In principle this should be a move rather than sum, but this
     * way we don't have to worry about even/odd offsets...
     */
    for (i = natoms; i < ni1+1+natoms/2; i++)
    {
        work[i-natoms] += work[i];
    }

    /* Parallel summations */
    if (PAR(cr))
    {
        gmx_sum(natoms, work, cr);
    }
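
    /* work[i] now holds the pairwise part of the Still polarization energy
     * for atom i; together with the self term born->gpol[i] it determines the
     * effective Born radius as bRad_i = 0.5*ONE_4PI_EPS0/|gpi|. The
     * gmx_invsqrt(gpi*gpi) form below is simply a fast reciprocal of the
     * absolute value.
     */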
    factor = 0.5 * ONE_4PI_EPS0;
    /* Calculate the radii - should we do all atoms, or just our local ones? */
    for (i = 0; i < natoms; i++)
    {
        if (born->use[i] != 0)
        {
            gpi             = born->gpol[i]+work[i];
            gpi2            = gpi * gpi;
            born->bRad[i]   = factor*gmx_invsqrt(gpi2);
            fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
        }
    }

    return 0;
}
int
genborn_allvsall_calc_hct_obc_radii_sse2_single(t_forcerec     *fr,
                                                t_mdatoms      *mdatoms,
                                                gmx_genborn_t  *born,
                                                int             gb_algorithm,
                                                gmx_localtop_t *top,
                                                real           *x,
                                                t_commrec      *cr,
                                                void           *paadata)
{
    gmx_allvsallgb2_data_t *aadata;
    int                     natoms;
    int                     ni0, ni1;
    int                     nj0, nj1, nj2, nj3;
    int                     i, j, k;
    real                   *x_align;
    real                   *y_align;
    real                   *z_align;
    int                    *jindex;
    real                   *dadx;
    real                   *work;
    real                   *gb_radius;
    real                   *obc_param;
    int                    *pmask0, *pmask1, *pmask2, *pmask3;
    int                    *emask0, *emask1, *emask2, *emask3;
    real                    rai, rai_inv, rai_inv2, sum_ai, sum_ai2, sum_ai3, tsum, tchain;
    __m128 ix_SSE0, iy_SSE0, iz_SSE0;
    __m128 ix_SSE1, iy_SSE1, iz_SSE1;
    __m128 ix_SSE2, iy_SSE2, iz_SSE2;
    __m128 ix_SSE3, iy_SSE3, iz_SSE3;
    __m128 gpi_SSE0, rai_SSE0, prod_ai_SSE0;
    __m128 gpi_SSE1, rai_SSE1, prod_ai_SSE1;
    __m128 gpi_SSE2, rai_SSE2, prod_ai_SSE2;
    __m128 gpi_SSE3, rai_SSE3, prod_ai_SSE3;
    __m128 imask_SSE0, jmask_SSE0;
    __m128 imask_SSE1, jmask_SSE1;
    __m128 imask_SSE2, jmask_SSE2;
    __m128 imask_SSE3, jmask_SSE3;
    __m128 jx_SSE, jy_SSE, jz_SSE;
    __m128 dx_SSE0, dy_SSE0, dz_SSE0;
    __m128 dx_SSE1, dy_SSE1, dz_SSE1;
    __m128 dx_SSE2, dy_SSE2, dz_SSE2;
    __m128 dx_SSE3, dy_SSE3, dz_SSE3;
    __m128 rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0;
    __m128 rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1;
    __m128 rsq_SSE2, rinv_SSE2, irsq_SSE2, idr4_SSE2, idr6_SSE2;
    __m128 rsq_SSE3, rinv_SSE3, irsq_SSE3, idr4_SSE3, idr6_SSE3;
    __m128 raj_SSE, raj_inv_SSE, sk_aj_SSE, sk2_aj_SSE;
    __m128 ccf_SSE0, dccf_SSE0, prod_SSE0;
    __m128 ccf_SSE1, dccf_SSE1, prod_SSE1;
    __m128 ccf_SSE2, dccf_SSE2, prod_SSE2;
    __m128 ccf_SSE3, dccf_SSE3, prod_SSE3;
    __m128 icf4_SSE0, icf6_SSE0;
    __m128 icf4_SSE1, icf6_SSE1;
    __m128 icf4_SSE2, icf6_SSE2;
    __m128 icf4_SSE3, icf6_SSE3;
    __m128 oneeighth_SSE, onefourth_SSE, half_SSE, one_SSE, two_SSE, four_SSE;
    __m128 still_p4_SSE, still_p5inv_SSE, still_pip5_SSE;
    __m128 rai_inv_SSE0;
    __m128 rai_inv_SSE1;
    __m128 rai_inv_SSE2;
    __m128 rai_inv_SSE3;
    __m128 sk_ai_SSE0, sk2_ai_SSE0, sum_ai_SSE0;
    __m128 sk_ai_SSE1, sk2_ai_SSE1, sum_ai_SSE1;
    __m128 sk_ai_SSE2, sk2_ai_SSE2, sum_ai_SSE2;
    __m128 sk_ai_SSE3, sk2_ai_SSE3, sum_ai_SSE3;
    __m128 lij_inv_SSE0, sk2_rinv_SSE0;
    __m128 lij_inv_SSE1, sk2_rinv_SSE1;
    __m128 lij_inv_SSE2, sk2_rinv_SSE2;
    __m128 lij_inv_SSE3, sk2_rinv_SSE3;
    __m128 dr_SSE0, dr_SSE1, dr_SSE2, dr_SSE3;
    __m128 doffset_SSE;
    __m128 t1_SSE0, t2_SSE0, t3_SSE0, t4_SSE0;
    __m128 t1_SSE1, t2_SSE1, t3_SSE1, t4_SSE1;
    __m128 t1_SSE2, t2_SSE2, t3_SSE2, t4_SSE2;
    __m128 t1_SSE3, t2_SSE3, t3_SSE3, t4_SSE3;
    __m128 obc_mask1_SSE0, obc_mask2_SSE0, obc_mask3_SSE0;
    __m128 obc_mask1_SSE1, obc_mask2_SSE1, obc_mask3_SSE1;
    __m128 obc_mask1_SSE2, obc_mask2_SSE2, obc_mask3_SSE2;
    __m128 obc_mask1_SSE3, obc_mask2_SSE3, obc_mask3_SSE3;
    __m128 uij_SSE0, uij2_SSE0, uij3_SSE0;
    __m128 uij_SSE1, uij2_SSE1, uij3_SSE1;
    __m128 uij_SSE2, uij2_SSE2, uij3_SSE2;
    __m128 uij_SSE3, uij2_SSE3, uij3_SSE3;
    __m128 lij_SSE0, lij2_SSE0, lij3_SSE0;
    __m128 lij_SSE1, lij2_SSE1, lij3_SSE1;
    __m128 lij_SSE2, lij2_SSE2, lij3_SSE2;
    __m128 lij_SSE3, lij2_SSE3, lij3_SSE3;
    __m128 dlij_SSE0, diff2_SSE0, logterm_SSE0;
    __m128 dlij_SSE1, diff2_SSE1, logterm_SSE1;
    __m128 dlij_SSE2, diff2_SSE2, logterm_SSE2;
    __m128 dlij_SSE3, diff2_SSE3, logterm_SSE3;
    natoms = mdatoms->nr;
    ni0    = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
    ni1    = mdatoms->start+mdatoms->homenr;

    aadata = *((gmx_allvsallgb2_data_t **)paadata);

    if (aadata == NULL)
    {
        genborn_allvsall_setup(&aadata, top, born, mdatoms, born->gb_doffset,
                               egbOBC, TRUE, TRUE, TRUE);
        *((gmx_allvsallgb2_data_t **)paadata) = aadata;
    }
    x_align = aadata->x_align;
    y_align = aadata->y_align;
    z_align = aadata->z_align;

    gb_radius = aadata->gb_radius;
    work      = aadata->work;
    jindex    = aadata->jindex_gb;
    dadx      = fr->dadx;
    obc_param = aadata->workparam;

    oneeighth_SSE = _mm_set1_ps(0.125);
    onefourth_SSE = _mm_set1_ps(0.25);
    half_SSE      = _mm_set1_ps(0.5);
    one_SSE       = _mm_set1_ps(1.0);
    two_SSE       = _mm_set1_ps(2.0);
    four_SSE      = _mm_set1_ps(4.0);
    doffset_SSE   = _mm_set1_ps(born->gb_doffset);
    for (i = 0; i < natoms; i++)
    {
        x_align[i] = x[3*i];
        y_align[i] = x[3*i+1];
        z_align[i] = x[3*i+2];
    }

    /* Copy again to allow for the wraparound image atoms */
    for (i = 0; i < natoms/2+1; i++)
    {
        x_align[natoms+i] = x_align[i];
        y_align[natoms+i] = y_align[i];
        z_align[natoms+i] = z_align[i];
    }

    for (i = 0; i < natoms+natoms/2+1; i++)
    {
        work[i] = 0;
    }
    for (i = ni0; i < ni1; i += UNROLLI)
    {
        /* We assume shifts are NOT used for all-vs-all interactions */

        /* Load i atom data */
        ix_SSE0 = _mm_load1_ps(x_align+i);
        iy_SSE0 = _mm_load1_ps(y_align+i);
        iz_SSE0 = _mm_load1_ps(z_align+i);
        ix_SSE1 = _mm_load1_ps(x_align+i+1);
        iy_SSE1 = _mm_load1_ps(y_align+i+1);
        iz_SSE1 = _mm_load1_ps(z_align+i+1);
        ix_SSE2 = _mm_load1_ps(x_align+i+2);
        iy_SSE2 = _mm_load1_ps(y_align+i+2);
        iz_SSE2 = _mm_load1_ps(z_align+i+2);
        ix_SSE3 = _mm_load1_ps(x_align+i+3);
        iy_SSE3 = _mm_load1_ps(y_align+i+3);
        iz_SSE3 = _mm_load1_ps(z_align+i+3);

        rai_SSE0     = _mm_load1_ps(gb_radius+i);
        rai_SSE1     = _mm_load1_ps(gb_radius+i+1);
        rai_SSE2     = _mm_load1_ps(gb_radius+i+2);
        rai_SSE3     = _mm_load1_ps(gb_radius+i+3);
        rai_inv_SSE0 = gmx_mm_inv_ps(rai_SSE0);
        rai_inv_SSE1 = gmx_mm_inv_ps(rai_SSE1);
        rai_inv_SSE2 = gmx_mm_inv_ps(rai_SSE2);
        rai_inv_SSE3 = gmx_mm_inv_ps(rai_SSE3);

        sk_ai_SSE0  = _mm_load1_ps(obc_param+i);
        sk_ai_SSE1  = _mm_load1_ps(obc_param+i+1);
        sk_ai_SSE2  = _mm_load1_ps(obc_param+i+2);
        sk_ai_SSE3  = _mm_load1_ps(obc_param+i+3);
        sk2_ai_SSE0 = _mm_mul_ps(sk_ai_SSE0, sk_ai_SSE0);
        sk2_ai_SSE1 = _mm_mul_ps(sk_ai_SSE1, sk_ai_SSE1);
        sk2_ai_SSE2 = _mm_mul_ps(sk_ai_SSE2, sk_ai_SSE2);
        sk2_ai_SSE3 = _mm_mul_ps(sk_ai_SSE3, sk_ai_SSE3);

        sum_ai_SSE0 = _mm_setzero_ps();
        sum_ai_SSE1 = _mm_setzero_ps();
        sum_ai_SSE2 = _mm_setzero_ps();
        sum_ai_SSE3 = _mm_setzero_ps();

        /* Load limits for loop over neighbors */
        nj0 = jindex[4*i];
        nj1 = jindex[4*i+1];
        nj2 = jindex[4*i+2];
        nj3 = jindex[4*i+3];

        pmask0 = aadata->prologue_mask_gb[i];
        pmask1 = aadata->prologue_mask_gb[i+1];
        pmask2 = aadata->prologue_mask_gb[i+2];
        pmask3 = aadata->prologue_mask_gb[i+3];
        emask0 = aadata->epilogue_mask[i];
        emask1 = aadata->epilogue_mask[i+1];
        emask2 = aadata->epilogue_mask[i+2];
        emask3 = aadata->epilogue_mask[i+3];

        imask_SSE0 = _mm_load1_ps((real *)(aadata->imask+i));
        imask_SSE1 = _mm_load1_ps((real *)(aadata->imask+i+1));
        imask_SSE2 = _mm_load1_ps((real *)(aadata->imask+i+2));
        imask_SSE3 = _mm_load1_ps((real *)(aadata->imask+i+3));
        /* Prologue part, including exclusion mask */
        for (j = nj0; j < nj1; j += UNROLLJ)
        {
            jmask_SSE0 = _mm_load_ps((real *)pmask0);
            jmask_SSE1 = _mm_load_ps((real *)pmask1);
            jmask_SSE2 = _mm_load_ps((real *)pmask2);
            jmask_SSE3 = _mm_load_ps((real *)pmask3);
            pmask0    += UNROLLJ;
            pmask1    += UNROLLJ;
            pmask2    += UNROLLJ;
            pmask3    += UNROLLJ;

            /* load j atom coordinates */
            jx_SSE = _mm_load_ps(x_align+j);
            jy_SSE = _mm_load_ps(y_align+j);
            jz_SSE = _mm_load_ps(z_align+j);

            /* Calculate distance */
            dx_SSE0 = _mm_sub_ps(ix_SSE0, jx_SSE);
            dy_SSE0 = _mm_sub_ps(iy_SSE0, jy_SSE);
            dz_SSE0 = _mm_sub_ps(iz_SSE0, jz_SSE);
            dx_SSE1 = _mm_sub_ps(ix_SSE1, jx_SSE);
            dy_SSE1 = _mm_sub_ps(iy_SSE1, jy_SSE);
            dz_SSE1 = _mm_sub_ps(iz_SSE1, jz_SSE);
            dx_SSE2 = _mm_sub_ps(ix_SSE2, jx_SSE);
            dy_SSE2 = _mm_sub_ps(iy_SSE2, jy_SSE);
            dz_SSE2 = _mm_sub_ps(iz_SSE2, jz_SSE);
            dx_SSE3 = _mm_sub_ps(ix_SSE3, jx_SSE);
            dy_SSE3 = _mm_sub_ps(iy_SSE3, jy_SSE);
            dz_SSE3 = _mm_sub_ps(iz_SSE3, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
            rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
            rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);

            /* Combine masks */
            jmask_SSE0 = _mm_and_ps(jmask_SSE0, imask_SSE0);
            jmask_SSE1 = _mm_and_ps(jmask_SSE1, imask_SSE1);
            jmask_SSE2 = _mm_and_ps(jmask_SSE2, imask_SSE2);
            jmask_SSE3 = _mm_and_ps(jmask_SSE3, imask_SSE3);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0);
            rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1);
            rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2);
            rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3);

            /* Apply mask */
            rinv_SSE0 = _mm_and_ps(rinv_SSE0, jmask_SSE0);
            rinv_SSE1 = _mm_and_ps(rinv_SSE1, jmask_SSE1);
            rinv_SSE2 = _mm_and_ps(rinv_SSE2, jmask_SSE2);
            rinv_SSE3 = _mm_and_ps(rinv_SSE3, jmask_SSE3);

            dr_SSE0 = _mm_mul_ps(rsq_SSE0, rinv_SSE0);
            dr_SSE1 = _mm_mul_ps(rsq_SSE1, rinv_SSE1);
            dr_SSE2 = _mm_mul_ps(rsq_SSE2, rinv_SSE2);
            dr_SSE3 = _mm_mul_ps(rsq_SSE3, rinv_SSE3);

            sk_aj_SSE   = _mm_load_ps(obc_param+j);
            raj_SSE     = _mm_load_ps(gb_radius+j);
            raj_inv_SSE = gmx_mm_inv_ps(raj_SSE);

            /* Evaluate influence of atom aj -> ai */
            t1_SSE0 = _mm_add_ps(dr_SSE0, sk_aj_SSE);
            t1_SSE1 = _mm_add_ps(dr_SSE1, sk_aj_SSE);
            t1_SSE2 = _mm_add_ps(dr_SSE2, sk_aj_SSE);
            t1_SSE3 = _mm_add_ps(dr_SSE3, sk_aj_SSE);
            t2_SSE0 = _mm_sub_ps(dr_SSE0, sk_aj_SSE);
            t2_SSE1 = _mm_sub_ps(dr_SSE1, sk_aj_SSE);
            t2_SSE2 = _mm_sub_ps(dr_SSE2, sk_aj_SSE);
            t2_SSE3 = _mm_sub_ps(dr_SSE3, sk_aj_SSE);
            t3_SSE0 = _mm_sub_ps(sk_aj_SSE, dr_SSE0);
            t3_SSE1 = _mm_sub_ps(sk_aj_SSE, dr_SSE1);
            t3_SSE2 = _mm_sub_ps(sk_aj_SSE, dr_SSE2);
            t3_SSE3 = _mm_sub_ps(sk_aj_SSE, dr_SSE3);
            obc_mask1_SSE0 = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
            obc_mask1_SSE1 = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
            obc_mask1_SSE2 = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
            obc_mask1_SSE3 = _mm_cmplt_ps(rai_SSE3, t1_SSE3);
            obc_mask2_SSE0 = _mm_cmplt_ps(rai_SSE0, t2_SSE0);
            obc_mask2_SSE1 = _mm_cmplt_ps(rai_SSE1, t2_SSE1);
            obc_mask2_SSE2 = _mm_cmplt_ps(rai_SSE2, t2_SSE2);
            obc_mask2_SSE3 = _mm_cmplt_ps(rai_SSE3, t2_SSE3);
            obc_mask3_SSE0 = _mm_cmplt_ps(rai_SSE0, t3_SSE0);
            obc_mask3_SSE1 = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
            obc_mask3_SSE2 = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
            obc_mask3_SSE3 = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
            obc_mask1_SSE0 = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0);
            obc_mask1_SSE1 = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1);
            obc_mask1_SSE2 = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2);
            obc_mask1_SSE3 = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3);

            uij_SSE0  = gmx_mm_inv_ps(t1_SSE0);
            uij_SSE1  = gmx_mm_inv_ps(t1_SSE1);
            uij_SSE2  = gmx_mm_inv_ps(t1_SSE2);
            uij_SSE3  = gmx_mm_inv_ps(t1_SSE3);
            lij_SSE0  = _mm_or_ps( _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
                                   _mm_andnot_ps(obc_mask2_SSE0, rai_inv_SSE0));
            lij_SSE1  = _mm_or_ps( _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
                                   _mm_andnot_ps(obc_mask2_SSE1, rai_inv_SSE1));
            lij_SSE2  = _mm_or_ps( _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
                                   _mm_andnot_ps(obc_mask2_SSE2, rai_inv_SSE2));
            lij_SSE3  = _mm_or_ps( _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
                                   _mm_andnot_ps(obc_mask2_SSE3, rai_inv_SSE3));
            dlij_SSE0 = _mm_and_ps(one_SSE, obc_mask2_SSE0);
            dlij_SSE1 = _mm_and_ps(one_SSE, obc_mask2_SSE1);
            dlij_SSE2 = _mm_and_ps(one_SSE, obc_mask2_SSE2);
            dlij_SSE3 = _mm_and_ps(one_SSE, obc_mask2_SSE3);

            uij2_SSE0 = _mm_mul_ps(uij_SSE0, uij_SSE0);
            uij2_SSE1 = _mm_mul_ps(uij_SSE1, uij_SSE1);
            uij2_SSE2 = _mm_mul_ps(uij_SSE2, uij_SSE2);
            uij2_SSE3 = _mm_mul_ps(uij_SSE3, uij_SSE3);
            uij3_SSE0 = _mm_mul_ps(uij2_SSE0, uij_SSE0);
            uij3_SSE1 = _mm_mul_ps(uij2_SSE1, uij_SSE1);
            uij3_SSE2 = _mm_mul_ps(uij2_SSE2, uij_SSE2);
            uij3_SSE3 = _mm_mul_ps(uij2_SSE3, uij_SSE3);
            lij2_SSE0 = _mm_mul_ps(lij_SSE0, lij_SSE0);
            lij2_SSE1 = _mm_mul_ps(lij_SSE1, lij_SSE1);
            lij2_SSE2 = _mm_mul_ps(lij_SSE2, lij_SSE2);
            lij2_SSE3 = _mm_mul_ps(lij_SSE3, lij_SSE3);
            lij3_SSE0 = _mm_mul_ps(lij2_SSE0, lij_SSE0);
            lij3_SSE1 = _mm_mul_ps(lij2_SSE1, lij_SSE1);
            lij3_SSE2 = _mm_mul_ps(lij2_SSE2, lij_SSE2);
            lij3_SSE3 = _mm_mul_ps(lij2_SSE3, lij_SSE3);

            diff2_SSE0    = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
            diff2_SSE1    = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
            diff2_SSE2    = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
            diff2_SSE3    = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
            lij_inv_SSE0  = gmx_mm_invsqrt_ps(lij2_SSE0);
            lij_inv_SSE1  = gmx_mm_invsqrt_ps(lij2_SSE1);
            lij_inv_SSE2  = gmx_mm_invsqrt_ps(lij2_SSE2);
            lij_inv_SSE3  = gmx_mm_invsqrt_ps(lij2_SSE3);
            sk2_aj_SSE    = _mm_mul_ps(sk_aj_SSE, sk_aj_SSE);
            sk2_rinv_SSE0 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE0);
            sk2_rinv_SSE1 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE1);
            sk2_rinv_SSE2 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE2);
            sk2_rinv_SSE3 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE3);
            prod_SSE0     = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
            prod_SSE1     = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
            prod_SSE2     = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
            prod_SSE3     = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);

            logterm_SSE0 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
            logterm_SSE1 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
            logterm_SSE2 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
            logterm_SSE3 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));

            t1_SSE0 = _mm_sub_ps(lij_SSE0, uij_SSE0);
            t1_SSE1 = _mm_sub_ps(lij_SSE1, uij_SSE1);
            t1_SSE2 = _mm_sub_ps(lij_SSE2, uij_SSE2);
            t1_SSE3 = _mm_sub_ps(lij_SSE3, uij_SSE3);
            t2_SSE0 = _mm_mul_ps(diff2_SSE0,
                                 _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
                                            prod_SSE0));
            t2_SSE1 = _mm_mul_ps(diff2_SSE1,
                                 _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
                                            prod_SSE1));
            t2_SSE2 = _mm_mul_ps(diff2_SSE2,
                                 _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
                                            prod_SSE2));
            t2_SSE3 = _mm_mul_ps(diff2_SSE3,
                                 _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
                                            prod_SSE3));
            t3_SSE0 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
            t3_SSE1 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
            t3_SSE2 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
            t3_SSE3 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
            t1_SSE0 = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
            t1_SSE1 = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
            t1_SSE2 = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
            t1_SSE3 = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
            t4_SSE0 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE0, lij_SSE0));
            t4_SSE1 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE1, lij_SSE1));
            t4_SSE2 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE2, lij_SSE2));
            t4_SSE3 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE3, lij_SSE3));
            t4_SSE0 = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
            t4_SSE1 = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
            t4_SSE2 = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
            t4_SSE3 = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
            t1_SSE0 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
            t1_SSE1 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
            t1_SSE2 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
            t1_SSE3 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));

            sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
            sum_ai_SSE1 = _mm_add_ps(sum_ai_SSE1, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
            sum_ai_SSE2 = _mm_add_ps(sum_ai_SSE2, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
            sum_ai_SSE3 = _mm_add_ps(sum_ai_SSE3, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
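
            /* The accumulated t1 is the HCT/OBC pairwise descreening integral
             * for aj -> ai. With L and U the lower/upper integration bounds
             * (lij = 1/L, uij = 1/U here), it evaluates to
             *   I = 0.5*[ 1/L - 1/U + r/4*(1/U^2 - 1/L^2)
             *           + 1/(2r)*ln(L/U) + sk^2/(4r)*(1/L^2 - 1/U^2) ]
             * plus the 2*(1/rai - lij) correction (t4) when the descreening
             * sphere overlaps atom i, all gated by the obc masks above.
             */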
            t1_SSE0 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
                                 _mm_mul_ps(prod_SSE0, lij3_SSE0));
            t1_SSE1 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
                                 _mm_mul_ps(prod_SSE1, lij3_SSE1));
            t1_SSE2 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
                                 _mm_mul_ps(prod_SSE2, lij3_SSE2));
            t1_SSE3 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
                                 _mm_mul_ps(prod_SSE3, lij3_SSE3));
            t1_SSE0 = _mm_sub_ps(t1_SSE0,
                                 _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
                                                       _mm_mul_ps(lij3_SSE0, dr_SSE0))));
            t1_SSE1 = _mm_sub_ps(t1_SSE1,
                                 _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
                                                       _mm_mul_ps(lij3_SSE1, dr_SSE1))));
            t1_SSE2 = _mm_sub_ps(t1_SSE2,
                                 _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
                                                       _mm_mul_ps(lij3_SSE2, dr_SSE2))));
            t1_SSE3 = _mm_sub_ps(t1_SSE3,
                                 _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
                                                       _mm_mul_ps(lij3_SSE3, dr_SSE3))));

            t2_SSE0 = _mm_mul_ps(onefourth_SSE,
                                 _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
                                            _mm_mul_ps(uij3_SSE0, dr_SSE0)));
            t2_SSE1 = _mm_mul_ps(onefourth_SSE,
                                 _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
                                            _mm_mul_ps(uij3_SSE1, dr_SSE1)));
            t2_SSE2 = _mm_mul_ps(onefourth_SSE,
                                 _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
                                            _mm_mul_ps(uij3_SSE2, dr_SSE2)));
            t2_SSE3 = _mm_mul_ps(onefourth_SSE,
                                 _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
                                            _mm_mul_ps(uij3_SSE3, dr_SSE3)));
            t2_SSE0 = _mm_sub_ps(t2_SSE0,
                                 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
                                            _mm_mul_ps(prod_SSE0, uij3_SSE0)));
            t2_SSE1 = _mm_sub_ps(t2_SSE1,
                                 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
                                            _mm_mul_ps(prod_SSE1, uij3_SSE1)));
            t2_SSE2 = _mm_sub_ps(t2_SSE2,
                                 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
                                            _mm_mul_ps(prod_SSE2, uij3_SSE2)));
            t2_SSE3 = _mm_sub_ps(t2_SSE3,
                                 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
                                            _mm_mul_ps(prod_SSE3, uij3_SSE3)));
            t3_SSE0 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
                                 _mm_mul_ps(rinv_SSE0, rinv_SSE0));
            t3_SSE1 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
                                 _mm_mul_ps(rinv_SSE1, rinv_SSE1));
            t3_SSE2 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
                                 _mm_mul_ps(rinv_SSE2, rinv_SSE2));
            t3_SSE3 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
                                 _mm_mul_ps(rinv_SSE3, rinv_SSE3));
            t3_SSE0 = _mm_sub_ps(t3_SSE0,
                                 _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
                                            _mm_add_ps(one_SSE,
                                                       _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
            t3_SSE1 = _mm_sub_ps(t3_SSE1,
                                 _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
                                            _mm_add_ps(one_SSE,
                                                       _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
            t3_SSE2 = _mm_sub_ps(t3_SSE2,
                                 _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
                                            _mm_add_ps(one_SSE,
                                                       _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
            t3_SSE3 = _mm_sub_ps(t3_SSE3,
                                 _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
                                            _mm_add_ps(one_SSE,
                                                       _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
            t1_SSE0 = _mm_mul_ps(rinv_SSE0,
                                 _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
                                            _mm_add_ps(t2_SSE0, t3_SSE0)));
            t1_SSE1 = _mm_mul_ps(rinv_SSE1,
                                 _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
                                            _mm_add_ps(t2_SSE1, t3_SSE1)));
            t1_SSE2 = _mm_mul_ps(rinv_SSE2,
                                 _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
                                            _mm_add_ps(t2_SSE2, t3_SSE2)));
            t1_SSE3 = _mm_mul_ps(rinv_SSE3,
                                 _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
                                            _mm_add_ps(t2_SSE3, t3_SSE3)));
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
            dadx += 4;
            /* Evaluate influence of atom ai -> aj */
            t1_SSE0 = _mm_add_ps(dr_SSE0, sk_ai_SSE0);
            t1_SSE1 = _mm_add_ps(dr_SSE1, sk_ai_SSE1);
            t1_SSE2 = _mm_add_ps(dr_SSE2, sk_ai_SSE2);
            t1_SSE3 = _mm_add_ps(dr_SSE3, sk_ai_SSE3);
            t2_SSE0 = _mm_sub_ps(dr_SSE0, sk_ai_SSE0);
            t2_SSE1 = _mm_sub_ps(dr_SSE1, sk_ai_SSE1);
            t2_SSE2 = _mm_sub_ps(dr_SSE2, sk_ai_SSE2);
            t2_SSE3 = _mm_sub_ps(dr_SSE3, sk_ai_SSE3);
            t3_SSE0 = _mm_sub_ps(sk_ai_SSE0, dr_SSE0);
            t3_SSE1 = _mm_sub_ps(sk_ai_SSE1, dr_SSE1);
            t3_SSE2 = _mm_sub_ps(sk_ai_SSE2, dr_SSE2);
            t3_SSE3 = _mm_sub_ps(sk_ai_SSE3, dr_SSE3);

            obc_mask1_SSE0 = _mm_cmplt_ps(raj_SSE, t1_SSE0);
            obc_mask1_SSE1 = _mm_cmplt_ps(raj_SSE, t1_SSE1);
            obc_mask1_SSE2 = _mm_cmplt_ps(raj_SSE, t1_SSE2);
            obc_mask1_SSE3 = _mm_cmplt_ps(raj_SSE, t1_SSE3);
            obc_mask2_SSE0 = _mm_cmplt_ps(raj_SSE, t2_SSE0);
            obc_mask2_SSE1 = _mm_cmplt_ps(raj_SSE, t2_SSE1);
            obc_mask2_SSE2 = _mm_cmplt_ps(raj_SSE, t2_SSE2);
            obc_mask2_SSE3 = _mm_cmplt_ps(raj_SSE, t2_SSE3);
            obc_mask3_SSE0 = _mm_cmplt_ps(raj_SSE, t3_SSE0);
            obc_mask3_SSE1 = _mm_cmplt_ps(raj_SSE, t3_SSE1);
            obc_mask3_SSE2 = _mm_cmplt_ps(raj_SSE, t3_SSE2);
            obc_mask3_SSE3 = _mm_cmplt_ps(raj_SSE, t3_SSE3);
            obc_mask1_SSE0 = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0);
            obc_mask1_SSE1 = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1);
            obc_mask1_SSE2 = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2);
            obc_mask1_SSE3 = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3);

            uij_SSE0 = gmx_mm_inv_ps(t1_SSE0);
            uij_SSE1 = gmx_mm_inv_ps(t1_SSE1);
            uij_SSE2 = gmx_mm_inv_ps(t1_SSE2);
            uij_SSE3 = gmx_mm_inv_ps(t1_SSE3);
            lij_SSE0 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
                                  _mm_andnot_ps(obc_mask2_SSE0, raj_inv_SSE));
            lij_SSE1 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
                                  _mm_andnot_ps(obc_mask2_SSE1, raj_inv_SSE));
            lij_SSE2 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
                                  _mm_andnot_ps(obc_mask2_SSE2, raj_inv_SSE));
            lij_SSE3 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
                                  _mm_andnot_ps(obc_mask2_SSE3, raj_inv_SSE));
1946 dlij_SSE0 = _mm_and_ps(one_SSE, obc_mask2_SSE0);
1947 dlij_SSE1 = _mm_and_ps(one_SSE, obc_mask2_SSE1);
1948 dlij_SSE2 = _mm_and_ps(one_SSE, obc_mask2_SSE2);
1949 dlij_SSE3 = _mm_and_ps(one_SSE, obc_mask2_SSE3);
1951 uij2_SSE0 = _mm_mul_ps(uij_SSE0, uij_SSE0);
1952 uij2_SSE1 = _mm_mul_ps(uij_SSE1, uij_SSE1);
1953 uij2_SSE2 = _mm_mul_ps(uij_SSE2, uij_SSE2);
1954 uij2_SSE3 = _mm_mul_ps(uij_SSE3, uij_SSE3);
1955 uij3_SSE0 = _mm_mul_ps(uij2_SSE0, uij_SSE0);
1956 uij3_SSE1 = _mm_mul_ps(uij2_SSE1, uij_SSE1);
1957 uij3_SSE2 = _mm_mul_ps(uij2_SSE2, uij_SSE2);
1958 uij3_SSE3 = _mm_mul_ps(uij2_SSE3, uij_SSE3);
1959 lij2_SSE0 = _mm_mul_ps(lij_SSE0, lij_SSE0);
1960 lij2_SSE1 = _mm_mul_ps(lij_SSE1, lij_SSE1);
1961 lij2_SSE2 = _mm_mul_ps(lij_SSE2, lij_SSE2);
1962 lij2_SSE3 = _mm_mul_ps(lij_SSE3, lij_SSE3);
1963 lij3_SSE0 = _mm_mul_ps(lij2_SSE0, lij_SSE0);
1964 lij3_SSE1 = _mm_mul_ps(lij2_SSE1, lij_SSE1);
1965 lij3_SSE2 = _mm_mul_ps(lij2_SSE2, lij_SSE2);
1966 lij3_SSE3 = _mm_mul_ps(lij2_SSE3, lij_SSE3);
1968 diff2_SSE0 = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
1969 diff2_SSE1 = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
1970 diff2_SSE2 = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
1971 diff2_SSE3 = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
1972 lij_inv_SSE0 = gmx_mm_invsqrt_ps(lij2_SSE0);
1973 lij_inv_SSE1 = gmx_mm_invsqrt_ps(lij2_SSE1);
1974 lij_inv_SSE2 = gmx_mm_invsqrt_ps(lij2_SSE2);
1975 lij_inv_SSE3 = gmx_mm_invsqrt_ps(lij2_SSE3);
1976 sk2_rinv_SSE0 = _mm_mul_ps(sk2_ai_SSE0, rinv_SSE0);
1977 sk2_rinv_SSE1 = _mm_mul_ps(sk2_ai_SSE1, rinv_SSE1);
1978 sk2_rinv_SSE2 = _mm_mul_ps(sk2_ai_SSE2, rinv_SSE2);
1979 sk2_rinv_SSE3 = _mm_mul_ps(sk2_ai_SSE3, rinv_SSE3);
1980 prod_SSE0 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
1981 prod_SSE1 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
1982 prod_SSE2 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
1983 prod_SSE3 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
1985 logterm_SSE0 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
1986 logterm_SSE1 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
1987 logterm_SSE2 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
1988 logterm_SSE3 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
1989 t1_SSE0 = _mm_sub_ps(lij_SSE0, uij_SSE0);
1990 t1_SSE1 = _mm_sub_ps(lij_SSE1, uij_SSE1);
1991 t1_SSE2 = _mm_sub_ps(lij_SSE2, uij_SSE2);
1992 t1_SSE3 = _mm_sub_ps(lij_SSE3, uij_SSE3);
            t2_SSE0         = _mm_mul_ps(diff2_SSE0,
                                         _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
                                                    prod_SSE0));
            t2_SSE1         = _mm_mul_ps(diff2_SSE1,
                                         _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
                                                    prod_SSE1));
            t2_SSE2         = _mm_mul_ps(diff2_SSE2,
                                         _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
                                                    prod_SSE2));
            t2_SSE3         = _mm_mul_ps(diff2_SSE3,
                                         _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
                                                    prod_SSE3));
2005 t3_SSE0 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
2006 t3_SSE1 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
2007 t3_SSE2 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
2008 t3_SSE3 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
2009 t1_SSE0 = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
2010 t1_SSE1 = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
2011 t1_SSE2 = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
2012 t1_SSE3 = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
2013 t4_SSE0 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE0));
2014 t4_SSE1 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE1));
2015 t4_SSE2 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE2));
2016 t4_SSE3 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE3));
2017 t4_SSE0 = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
2018 t4_SSE1 = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
2019 t4_SSE2 = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
2020 t4_SSE3 = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
2021 t1_SSE0 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
2022 t1_SSE1 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
2023 t1_SSE2 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
2024 t1_SSE3 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
2026 _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
2027 gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0, obc_mask1_SSE0),
2028 _mm_and_ps(t1_SSE1, obc_mask1_SSE1),
2029 _mm_and_ps(t1_SSE2, obc_mask1_SSE2),
2030 _mm_and_ps(t1_SSE3, obc_mask1_SSE3))));
2032 t1_SSE0 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
2033 _mm_mul_ps(prod_SSE0, lij3_SSE0));
2034 t1_SSE1 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
2035 _mm_mul_ps(prod_SSE1, lij3_SSE1));
2036 t1_SSE2 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
2037 _mm_mul_ps(prod_SSE2, lij3_SSE2));
2038 t1_SSE3 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
2039 _mm_mul_ps(prod_SSE3, lij3_SSE3));
2040 t1_SSE0 = _mm_sub_ps(t1_SSE0,
2041 _mm_mul_ps(onefourth_SSE,
2042 _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
2043 _mm_mul_ps(lij3_SSE0, dr_SSE0))));
2044 t1_SSE1 = _mm_sub_ps(t1_SSE1,
2045 _mm_mul_ps(onefourth_SSE,
2046 _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
2047 _mm_mul_ps(lij3_SSE1, dr_SSE1))));
2048 t1_SSE2 = _mm_sub_ps(t1_SSE2,
2049 _mm_mul_ps(onefourth_SSE,
2050 _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
2051 _mm_mul_ps(lij3_SSE2, dr_SSE2))));
2052 t1_SSE3 = _mm_sub_ps(t1_SSE3,
2053 _mm_mul_ps(onefourth_SSE,
2054 _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
2055 _mm_mul_ps(lij3_SSE3, dr_SSE3))));
2056 t2_SSE0 = _mm_mul_ps(onefourth_SSE,
2057 _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
2058 _mm_mul_ps(uij3_SSE0, dr_SSE0)));
2059 t2_SSE1 = _mm_mul_ps(onefourth_SSE,
2060 _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
2061 _mm_mul_ps(uij3_SSE1, dr_SSE1)));
2062 t2_SSE2 = _mm_mul_ps(onefourth_SSE,
2063 _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
2064 _mm_mul_ps(uij3_SSE2, dr_SSE2)));
2065 t2_SSE3 = _mm_mul_ps(onefourth_SSE,
2066 _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
2067 _mm_mul_ps(uij3_SSE3, dr_SSE3)));
2068 t2_SSE0 = _mm_sub_ps(t2_SSE0,
2069 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
2070 _mm_mul_ps(prod_SSE0, uij3_SSE0)));
2071 t2_SSE1 = _mm_sub_ps(t2_SSE1,
2072 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
2073 _mm_mul_ps(prod_SSE1, uij3_SSE1)));
2074 t2_SSE2 = _mm_sub_ps(t2_SSE2,
2075 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
2076 _mm_mul_ps(prod_SSE2, uij3_SSE2)));
2077 t2_SSE3 = _mm_sub_ps(t2_SSE3,
2078 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
2079 _mm_mul_ps(prod_SSE3, uij3_SSE3)));
2081 t3_SSE0 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
2082 _mm_mul_ps(rinv_SSE0, rinv_SSE0));
2083 t3_SSE1 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
2084 _mm_mul_ps(rinv_SSE1, rinv_SSE1));
2085 t3_SSE2 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
2086 _mm_mul_ps(rinv_SSE2, rinv_SSE2));
2087 t3_SSE3 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
2088 _mm_mul_ps(rinv_SSE3, rinv_SSE3));
            t3_SSE0         = _mm_sub_ps(t3_SSE0,
                                         _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
                                                    _mm_add_ps(one_SSE,
                                                               _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
            t3_SSE1         = _mm_sub_ps(t3_SSE1,
                                         _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
                                                    _mm_add_ps(one_SSE,
                                                               _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
            t3_SSE2         = _mm_sub_ps(t3_SSE2,
                                         _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
                                                    _mm_add_ps(one_SSE,
                                                               _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
            t3_SSE3         = _mm_sub_ps(t3_SSE3,
                                         _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
                                                    _mm_add_ps(one_SSE,
                                                               _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
2108 t1_SSE0 = _mm_mul_ps(rinv_SSE0,
2109 _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
2110 _mm_add_ps(t2_SSE0, t3_SSE0)));
2111 t1_SSE1 = _mm_mul_ps(rinv_SSE1,
2112 _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
2113 _mm_add_ps(t2_SSE1, t3_SSE1)));
2114 t1_SSE2 = _mm_mul_ps(rinv_SSE2,
2115 _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
2116 _mm_add_ps(t2_SSE2, t3_SSE2)));
2117 t1_SSE3 = _mm_mul_ps(rinv_SSE3,
2118 _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
2119 _mm_add_ps(t2_SSE3, t3_SSE3)));
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
            dadx += 4;
        }
2132 /* Main part, no exclusions */
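        /* Only the all-vs-all i mask is applied in this loop; all
         * exclusions were already consumed by the prologue loop above.
         */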
        for (j = nj1; j < nj2; j += UNROLLJ)
        {
2135 /* load j atom coordinates */
2136 jx_SSE = _mm_load_ps(x_align+j);
2137 jy_SSE = _mm_load_ps(y_align+j);
2138 jz_SSE = _mm_load_ps(z_align+j);
2140 /* Calculate distance */
2141 dx_SSE0 = _mm_sub_ps(ix_SSE0, jx_SSE);
2142 dy_SSE0 = _mm_sub_ps(iy_SSE0, jy_SSE);
2143 dz_SSE0 = _mm_sub_ps(iz_SSE0, jz_SSE);
2144 dx_SSE1 = _mm_sub_ps(ix_SSE1, jx_SSE);
2145 dy_SSE1 = _mm_sub_ps(iy_SSE1, jy_SSE);
2146 dz_SSE1 = _mm_sub_ps(iz_SSE1, jz_SSE);
2147 dx_SSE2 = _mm_sub_ps(ix_SSE2, jx_SSE);
2148 dy_SSE2 = _mm_sub_ps(iy_SSE2, jy_SSE);
2149 dz_SSE2 = _mm_sub_ps(iz_SSE2, jz_SSE);
2150 dx_SSE3 = _mm_sub_ps(ix_SSE3, jx_SSE);
2151 dy_SSE3 = _mm_sub_ps(iy_SSE3, jy_SSE);
2152 dz_SSE3 = _mm_sub_ps(iz_SSE3, jz_SSE);
2154 /* rsq = dx*dx+dy*dy+dz*dz */
2155 rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
2156 rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
2157 rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
2158 rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
2160 /* Calculate 1/r and 1/r2 */
2161 rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0);
2162 rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1);
2163 rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2);
2164 rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3);
2167 rinv_SSE0 = _mm_and_ps(rinv_SSE0, imask_SSE0);
2168 rinv_SSE1 = _mm_and_ps(rinv_SSE1, imask_SSE1);
2169 rinv_SSE2 = _mm_and_ps(rinv_SSE2, imask_SSE2);
2170 rinv_SSE3 = _mm_and_ps(rinv_SSE3, imask_SSE3);
2172 dr_SSE0 = _mm_mul_ps(rsq_SSE0, rinv_SSE0);
2173 dr_SSE1 = _mm_mul_ps(rsq_SSE1, rinv_SSE1);
2174 dr_SSE2 = _mm_mul_ps(rsq_SSE2, rinv_SSE2);
2175 dr_SSE3 = _mm_mul_ps(rsq_SSE3, rinv_SSE3);
2177 sk_aj_SSE = _mm_load_ps(obc_param+j);
2178 raj_SSE = _mm_load_ps(gb_radius+j);
2180 raj_inv_SSE = gmx_mm_inv_ps(raj_SSE);
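            /* The block below accumulates the analytical descreening
             * integral for this pair, written in terms of the inverse
             * integration limits uij = 1/(r+sk) and (when atom i lies
             * outside the descreening sphere) lij = 1/(r-sk):
             *
             *   I = 0.5*( (lij-uij)
             *           + (uij^2-lij^2)*(r/4 - sk^2/(4*r))
             *           + ln(uij/lij)/(2*r)
             *           + 2*(1/rai-lij)  [only if rai < sk-r] )
             *
             * with the whole term masked to zero unless rai < r+sk.
             */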
2182 /* Evaluate influence of atom aj -> ai */
2183 t1_SSE0 = _mm_add_ps(dr_SSE0, sk_aj_SSE);
2184 t1_SSE1 = _mm_add_ps(dr_SSE1, sk_aj_SSE);
2185 t1_SSE2 = _mm_add_ps(dr_SSE2, sk_aj_SSE);
2186 t1_SSE3 = _mm_add_ps(dr_SSE3, sk_aj_SSE);
2187 t2_SSE0 = _mm_sub_ps(dr_SSE0, sk_aj_SSE);
2188 t2_SSE1 = _mm_sub_ps(dr_SSE1, sk_aj_SSE);
2189 t2_SSE2 = _mm_sub_ps(dr_SSE2, sk_aj_SSE);
2190 t2_SSE3 = _mm_sub_ps(dr_SSE3, sk_aj_SSE);
2191 t3_SSE0 = _mm_sub_ps(sk_aj_SSE, dr_SSE0);
2192 t3_SSE1 = _mm_sub_ps(sk_aj_SSE, dr_SSE1);
2193 t3_SSE2 = _mm_sub_ps(sk_aj_SSE, dr_SSE2);
2194 t3_SSE3 = _mm_sub_ps(sk_aj_SSE, dr_SSE3);
2196 obc_mask1_SSE0 = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
2197 obc_mask1_SSE1 = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
2198 obc_mask1_SSE2 = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
2199 obc_mask1_SSE3 = _mm_cmplt_ps(rai_SSE3, t1_SSE3);
2200 obc_mask2_SSE0 = _mm_cmplt_ps(rai_SSE0, t2_SSE0);
2201 obc_mask2_SSE1 = _mm_cmplt_ps(rai_SSE1, t2_SSE1);
2202 obc_mask2_SSE2 = _mm_cmplt_ps(rai_SSE2, t2_SSE2);
2203 obc_mask2_SSE3 = _mm_cmplt_ps(rai_SSE3, t2_SSE3);
2204 obc_mask3_SSE0 = _mm_cmplt_ps(rai_SSE0, t3_SSE0);
2205 obc_mask3_SSE1 = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
2206 obc_mask3_SSE2 = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
2207 obc_mask3_SSE3 = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
2208 obc_mask1_SSE0 = _mm_and_ps(obc_mask1_SSE0, imask_SSE0);
2209 obc_mask1_SSE1 = _mm_and_ps(obc_mask1_SSE1, imask_SSE1);
2210 obc_mask1_SSE2 = _mm_and_ps(obc_mask1_SSE2, imask_SSE2);
2211 obc_mask1_SSE3 = _mm_and_ps(obc_mask1_SSE3, imask_SSE3);
2213 uij_SSE0 = gmx_mm_inv_ps(t1_SSE0);
2214 uij_SSE1 = gmx_mm_inv_ps(t1_SSE1);
2215 uij_SSE2 = gmx_mm_inv_ps(t1_SSE2);
2216 uij_SSE3 = gmx_mm_inv_ps(t1_SSE3);
2217 lij_SSE0 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
2218 _mm_andnot_ps(obc_mask2_SSE0, rai_inv_SSE0));
2219 lij_SSE1 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
2220 _mm_andnot_ps(obc_mask2_SSE1, rai_inv_SSE1));
2221 lij_SSE2 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
2222 _mm_andnot_ps(obc_mask2_SSE2, rai_inv_SSE2));
2223 lij_SSE3 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
2224 _mm_andnot_ps(obc_mask2_SSE3, rai_inv_SSE3));
2225 dlij_SSE0 = _mm_and_ps(one_SSE, obc_mask2_SSE0);
2226 dlij_SSE1 = _mm_and_ps(one_SSE, obc_mask2_SSE1);
2227 dlij_SSE2 = _mm_and_ps(one_SSE, obc_mask2_SSE2);
2228 dlij_SSE3 = _mm_and_ps(one_SSE, obc_mask2_SSE3);
2230 uij2_SSE0 = _mm_mul_ps(uij_SSE0, uij_SSE0);
2231 uij2_SSE1 = _mm_mul_ps(uij_SSE1, uij_SSE1);
2232 uij2_SSE2 = _mm_mul_ps(uij_SSE2, uij_SSE2);
2233 uij2_SSE3 = _mm_mul_ps(uij_SSE3, uij_SSE3);
2234 uij3_SSE0 = _mm_mul_ps(uij2_SSE0, uij_SSE0);
2235 uij3_SSE1 = _mm_mul_ps(uij2_SSE1, uij_SSE1);
2236 uij3_SSE2 = _mm_mul_ps(uij2_SSE2, uij_SSE2);
2237 uij3_SSE3 = _mm_mul_ps(uij2_SSE3, uij_SSE3);
2238 lij2_SSE0 = _mm_mul_ps(lij_SSE0, lij_SSE0);
2239 lij2_SSE1 = _mm_mul_ps(lij_SSE1, lij_SSE1);
2240 lij2_SSE2 = _mm_mul_ps(lij_SSE2, lij_SSE2);
2241 lij2_SSE3 = _mm_mul_ps(lij_SSE3, lij_SSE3);
2242 lij3_SSE0 = _mm_mul_ps(lij2_SSE0, lij_SSE0);
2243 lij3_SSE1 = _mm_mul_ps(lij2_SSE1, lij_SSE1);
2244 lij3_SSE2 = _mm_mul_ps(lij2_SSE2, lij_SSE2);
2245 lij3_SSE3 = _mm_mul_ps(lij2_SSE3, lij_SSE3);
2247 diff2_SSE0 = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
2248 diff2_SSE1 = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
2249 diff2_SSE2 = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
2250 diff2_SSE3 = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
2251 lij_inv_SSE0 = gmx_mm_invsqrt_ps(lij2_SSE0);
2252 lij_inv_SSE1 = gmx_mm_invsqrt_ps(lij2_SSE1);
2253 lij_inv_SSE2 = gmx_mm_invsqrt_ps(lij2_SSE2);
2254 lij_inv_SSE3 = gmx_mm_invsqrt_ps(lij2_SSE3);
2255 sk2_aj_SSE = _mm_mul_ps(sk_aj_SSE, sk_aj_SSE);
2256 sk2_rinv_SSE0 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE0);
2257 sk2_rinv_SSE1 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE1);
2258 sk2_rinv_SSE2 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE2);
2259 sk2_rinv_SSE3 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE3);
2260 prod_SSE0 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
2261 prod_SSE1 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
2262 prod_SSE2 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
2263 prod_SSE3 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
2265 logterm_SSE0 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
2266 logterm_SSE1 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
2267 logterm_SSE2 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
2268 logterm_SSE3 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
2270 t1_SSE0 = _mm_sub_ps(lij_SSE0, uij_SSE0);
2271 t1_SSE1 = _mm_sub_ps(lij_SSE1, uij_SSE1);
2272 t1_SSE2 = _mm_sub_ps(lij_SSE2, uij_SSE2);
2273 t1_SSE3 = _mm_sub_ps(lij_SSE3, uij_SSE3);
            t2_SSE0         = _mm_mul_ps(diff2_SSE0,
                                         _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
                                                    prod_SSE0));
            t2_SSE1         = _mm_mul_ps(diff2_SSE1,
                                         _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
                                                    prod_SSE1));
            t2_SSE2         = _mm_mul_ps(diff2_SSE2,
                                         _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
                                                    prod_SSE2));
            t2_SSE3         = _mm_mul_ps(diff2_SSE3,
                                         _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
                                                    prod_SSE3));
2287 t3_SSE0 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
2288 t3_SSE1 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
2289 t3_SSE2 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
2290 t3_SSE3 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
2291 t1_SSE0 = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
2292 t1_SSE1 = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
2293 t1_SSE2 = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
2294 t1_SSE3 = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
2295 t4_SSE0 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE0, lij_SSE0));
2296 t4_SSE1 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE1, lij_SSE1));
2297 t4_SSE2 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE2, lij_SSE2));
2298 t4_SSE3 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE3, lij_SSE3));
2299 t4_SSE0 = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
2300 t4_SSE1 = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
2301 t4_SSE2 = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
2302 t4_SSE3 = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
2303 t1_SSE0 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
2304 t1_SSE1 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
2305 t1_SSE2 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
2306 t1_SSE3 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
2308 sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
2309 sum_ai_SSE1 = _mm_add_ps(sum_ai_SSE1, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
2310 sum_ai_SSE2 = _mm_add_ps(sum_ai_SSE2, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
2311 sum_ai_SSE3 = _mm_add_ps(sum_ai_SSE3, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
2313 t1_SSE0 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
2314 _mm_mul_ps(prod_SSE0, lij3_SSE0));
2315 t1_SSE1 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
2316 _mm_mul_ps(prod_SSE1, lij3_SSE1));
2317 t1_SSE2 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
2318 _mm_mul_ps(prod_SSE2, lij3_SSE2));
2319 t1_SSE3 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
2320 _mm_mul_ps(prod_SSE3, lij3_SSE3));
2321 t1_SSE0 = _mm_sub_ps(t1_SSE0,
2322 _mm_mul_ps(onefourth_SSE,
2323 _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
2324 _mm_mul_ps(lij3_SSE0, dr_SSE0))));
2325 t1_SSE1 = _mm_sub_ps(t1_SSE1,
2326 _mm_mul_ps(onefourth_SSE,
2327 _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
2328 _mm_mul_ps(lij3_SSE1, dr_SSE1))));
2329 t1_SSE2 = _mm_sub_ps(t1_SSE2,
2330 _mm_mul_ps(onefourth_SSE,
2331 _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
2332 _mm_mul_ps(lij3_SSE2, dr_SSE2))));
2333 t1_SSE3 = _mm_sub_ps(t1_SSE3,
2334 _mm_mul_ps(onefourth_SSE,
2335 _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
2336 _mm_mul_ps(lij3_SSE3, dr_SSE3))));
2338 t2_SSE0 = _mm_mul_ps(onefourth_SSE,
2339 _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
2340 _mm_mul_ps(uij3_SSE0, dr_SSE0)));
2341 t2_SSE1 = _mm_mul_ps(onefourth_SSE,
2342 _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
2343 _mm_mul_ps(uij3_SSE1, dr_SSE1)));
2344 t2_SSE2 = _mm_mul_ps(onefourth_SSE,
2345 _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
2346 _mm_mul_ps(uij3_SSE2, dr_SSE2)));
2347 t2_SSE3 = _mm_mul_ps(onefourth_SSE,
2348 _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
2349 _mm_mul_ps(uij3_SSE3, dr_SSE3)));
2350 t2_SSE0 = _mm_sub_ps(t2_SSE0,
2351 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
2352 _mm_mul_ps(prod_SSE0, uij3_SSE0)));
2353 t2_SSE1 = _mm_sub_ps(t2_SSE1,
2354 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
2355 _mm_mul_ps(prod_SSE1, uij3_SSE1)));
2356 t2_SSE2 = _mm_sub_ps(t2_SSE2,
2357 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
2358 _mm_mul_ps(prod_SSE2, uij3_SSE2)));
2359 t2_SSE3 = _mm_sub_ps(t2_SSE3,
2360 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
2361 _mm_mul_ps(prod_SSE3, uij3_SSE3)));
2362 t3_SSE0 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
2363 _mm_mul_ps(rinv_SSE0, rinv_SSE0));
2364 t3_SSE1 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
2365 _mm_mul_ps(rinv_SSE1, rinv_SSE1));
2366 t3_SSE2 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
2367 _mm_mul_ps(rinv_SSE2, rinv_SSE2));
2368 t3_SSE3 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
2369 _mm_mul_ps(rinv_SSE3, rinv_SSE3));
            t3_SSE0         = _mm_sub_ps(t3_SSE0,
                                         _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
                                                    _mm_add_ps(one_SSE,
                                                               _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
            t3_SSE1         = _mm_sub_ps(t3_SSE1,
                                         _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
                                                    _mm_add_ps(one_SSE,
                                                               _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
            t3_SSE2         = _mm_sub_ps(t3_SSE2,
                                         _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
                                                    _mm_add_ps(one_SSE,
                                                               _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
            t3_SSE3         = _mm_sub_ps(t3_SSE3,
                                         _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
                                                    _mm_add_ps(one_SSE,
                                                               _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
2387 t1_SSE0 = _mm_mul_ps(rinv_SSE0,
2388 _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
2389 _mm_add_ps(t2_SSE0, t3_SSE0)));
2390 t1_SSE1 = _mm_mul_ps(rinv_SSE1,
2391 _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
2392 _mm_add_ps(t2_SSE1, t3_SSE1)));
2393 t1_SSE2 = _mm_mul_ps(rinv_SSE2,
2394 _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
2395 _mm_add_ps(t2_SSE2, t3_SSE2)));
2396 t1_SSE3 = _mm_mul_ps(rinv_SSE3,
2397 _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
2398 _mm_add_ps(t2_SSE3, t3_SSE3)));
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
            dadx += 4;
2409 /* Evaluate influence of atom ai -> aj */
2410 t1_SSE0 = _mm_add_ps(dr_SSE0, sk_ai_SSE0);
2411 t1_SSE1 = _mm_add_ps(dr_SSE1, sk_ai_SSE1);
2412 t1_SSE2 = _mm_add_ps(dr_SSE2, sk_ai_SSE2);
2413 t1_SSE3 = _mm_add_ps(dr_SSE3, sk_ai_SSE3);
2414 t2_SSE0 = _mm_sub_ps(dr_SSE0, sk_ai_SSE0);
2415 t2_SSE1 = _mm_sub_ps(dr_SSE1, sk_ai_SSE1);
2416 t2_SSE2 = _mm_sub_ps(dr_SSE2, sk_ai_SSE2);
2417 t2_SSE3 = _mm_sub_ps(dr_SSE3, sk_ai_SSE3);
2418 t3_SSE0 = _mm_sub_ps(sk_ai_SSE0, dr_SSE0);
2419 t3_SSE1 = _mm_sub_ps(sk_ai_SSE1, dr_SSE1);
2420 t3_SSE2 = _mm_sub_ps(sk_ai_SSE2, dr_SSE2);
2421 t3_SSE3 = _mm_sub_ps(sk_ai_SSE3, dr_SSE3);
2423 obc_mask1_SSE0 = _mm_cmplt_ps(raj_SSE, t1_SSE0);
2424 obc_mask1_SSE1 = _mm_cmplt_ps(raj_SSE, t1_SSE1);
2425 obc_mask1_SSE2 = _mm_cmplt_ps(raj_SSE, t1_SSE2);
2426 obc_mask1_SSE3 = _mm_cmplt_ps(raj_SSE, t1_SSE3);
2427 obc_mask2_SSE0 = _mm_cmplt_ps(raj_SSE, t2_SSE0);
2428 obc_mask2_SSE1 = _mm_cmplt_ps(raj_SSE, t2_SSE1);
2429 obc_mask2_SSE2 = _mm_cmplt_ps(raj_SSE, t2_SSE2);
2430 obc_mask2_SSE3 = _mm_cmplt_ps(raj_SSE, t2_SSE3);
2431 obc_mask3_SSE0 = _mm_cmplt_ps(raj_SSE, t3_SSE0);
2432 obc_mask3_SSE1 = _mm_cmplt_ps(raj_SSE, t3_SSE1);
2433 obc_mask3_SSE2 = _mm_cmplt_ps(raj_SSE, t3_SSE2);
2434 obc_mask3_SSE3 = _mm_cmplt_ps(raj_SSE, t3_SSE3);
2435 obc_mask1_SSE0 = _mm_and_ps(obc_mask1_SSE0, imask_SSE0);
2436 obc_mask1_SSE1 = _mm_and_ps(obc_mask1_SSE1, imask_SSE1);
2437 obc_mask1_SSE2 = _mm_and_ps(obc_mask1_SSE2, imask_SSE2);
2438 obc_mask1_SSE3 = _mm_and_ps(obc_mask1_SSE3, imask_SSE3);
2440 uij_SSE0 = gmx_mm_inv_ps(t1_SSE0);
2441 uij_SSE1 = gmx_mm_inv_ps(t1_SSE1);
2442 uij_SSE2 = gmx_mm_inv_ps(t1_SSE2);
2443 uij_SSE3 = gmx_mm_inv_ps(t1_SSE3);
2444 lij_SSE0 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
2445 _mm_andnot_ps(obc_mask2_SSE0, raj_inv_SSE));
2446 lij_SSE1 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
2447 _mm_andnot_ps(obc_mask2_SSE1, raj_inv_SSE));
2448 lij_SSE2 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
2449 _mm_andnot_ps(obc_mask2_SSE2, raj_inv_SSE));
2450 lij_SSE3 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
2451 _mm_andnot_ps(obc_mask2_SSE3, raj_inv_SSE));
2452 dlij_SSE0 = _mm_and_ps(one_SSE, obc_mask2_SSE0);
2453 dlij_SSE1 = _mm_and_ps(one_SSE, obc_mask2_SSE1);
2454 dlij_SSE2 = _mm_and_ps(one_SSE, obc_mask2_SSE2);
2455 dlij_SSE3 = _mm_and_ps(one_SSE, obc_mask2_SSE3);
2457 uij2_SSE0 = _mm_mul_ps(uij_SSE0, uij_SSE0);
2458 uij2_SSE1 = _mm_mul_ps(uij_SSE1, uij_SSE1);
2459 uij2_SSE2 = _mm_mul_ps(uij_SSE2, uij_SSE2);
2460 uij2_SSE3 = _mm_mul_ps(uij_SSE3, uij_SSE3);
2461 uij3_SSE0 = _mm_mul_ps(uij2_SSE0, uij_SSE0);
2462 uij3_SSE1 = _mm_mul_ps(uij2_SSE1, uij_SSE1);
2463 uij3_SSE2 = _mm_mul_ps(uij2_SSE2, uij_SSE2);
2464 uij3_SSE3 = _mm_mul_ps(uij2_SSE3, uij_SSE3);
2465 lij2_SSE0 = _mm_mul_ps(lij_SSE0, lij_SSE0);
2466 lij2_SSE1 = _mm_mul_ps(lij_SSE1, lij_SSE1);
2467 lij2_SSE2 = _mm_mul_ps(lij_SSE2, lij_SSE2);
2468 lij2_SSE3 = _mm_mul_ps(lij_SSE3, lij_SSE3);
2469 lij3_SSE0 = _mm_mul_ps(lij2_SSE0, lij_SSE0);
2470 lij3_SSE1 = _mm_mul_ps(lij2_SSE1, lij_SSE1);
2471 lij3_SSE2 = _mm_mul_ps(lij2_SSE2, lij_SSE2);
2472 lij3_SSE3 = _mm_mul_ps(lij2_SSE3, lij_SSE3);
2474 diff2_SSE0 = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
2475 diff2_SSE1 = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
2476 diff2_SSE2 = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
2477 diff2_SSE3 = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
2478 lij_inv_SSE0 = gmx_mm_invsqrt_ps(lij2_SSE0);
2479 lij_inv_SSE1 = gmx_mm_invsqrt_ps(lij2_SSE1);
2480 lij_inv_SSE2 = gmx_mm_invsqrt_ps(lij2_SSE2);
2481 lij_inv_SSE3 = gmx_mm_invsqrt_ps(lij2_SSE3);
2482 sk2_rinv_SSE0 = _mm_mul_ps(sk2_ai_SSE0, rinv_SSE0);
2483 sk2_rinv_SSE1 = _mm_mul_ps(sk2_ai_SSE1, rinv_SSE1);
2484 sk2_rinv_SSE2 = _mm_mul_ps(sk2_ai_SSE2, rinv_SSE2);
2485 sk2_rinv_SSE3 = _mm_mul_ps(sk2_ai_SSE3, rinv_SSE3);
2486 prod_SSE0 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
2487 prod_SSE1 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
2488 prod_SSE2 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
2489 prod_SSE3 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
2491 logterm_SSE0 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
2492 logterm_SSE1 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
2493 logterm_SSE2 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
2494 logterm_SSE3 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
2495 t1_SSE0 = _mm_sub_ps(lij_SSE0, uij_SSE0);
2496 t1_SSE1 = _mm_sub_ps(lij_SSE1, uij_SSE1);
2497 t1_SSE2 = _mm_sub_ps(lij_SSE2, uij_SSE2);
2498 t1_SSE3 = _mm_sub_ps(lij_SSE3, uij_SSE3);
            t2_SSE0         = _mm_mul_ps(diff2_SSE0,
                                         _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
                                                    prod_SSE0));
            t2_SSE1         = _mm_mul_ps(diff2_SSE1,
                                         _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
                                                    prod_SSE1));
            t2_SSE2         = _mm_mul_ps(diff2_SSE2,
                                         _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
                                                    prod_SSE2));
            t2_SSE3         = _mm_mul_ps(diff2_SSE3,
                                         _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
                                                    prod_SSE3));
2511 t3_SSE0 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
2512 t3_SSE1 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
2513 t3_SSE2 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
2514 t3_SSE3 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
2515 t1_SSE0 = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
2516 t1_SSE1 = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
2517 t1_SSE2 = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
2518 t1_SSE3 = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
2519 t4_SSE0 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE0));
2520 t4_SSE1 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE1));
2521 t4_SSE2 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE2));
2522 t4_SSE3 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE3));
2523 t4_SSE0 = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
2524 t4_SSE1 = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
2525 t4_SSE2 = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
2526 t4_SSE3 = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
2527 t1_SSE0 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
2528 t1_SSE1 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
2529 t1_SSE2 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
2530 t1_SSE3 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
2532 _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
2533 gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0, obc_mask1_SSE0),
2534 _mm_and_ps(t1_SSE1, obc_mask1_SSE1),
2535 _mm_and_ps(t1_SSE2, obc_mask1_SSE2),
2536 _mm_and_ps(t1_SSE3, obc_mask1_SSE3))));
2538 t1_SSE0 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
2539 _mm_mul_ps(prod_SSE0, lij3_SSE0));
2540 t1_SSE1 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
2541 _mm_mul_ps(prod_SSE1, lij3_SSE1));
2542 t1_SSE2 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
2543 _mm_mul_ps(prod_SSE2, lij3_SSE2));
2544 t1_SSE3 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
2545 _mm_mul_ps(prod_SSE3, lij3_SSE3));
2546 t1_SSE0 = _mm_sub_ps(t1_SSE0,
2547 _mm_mul_ps(onefourth_SSE,
2548 _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
2549 _mm_mul_ps(lij3_SSE0, dr_SSE0))));
2550 t1_SSE1 = _mm_sub_ps(t1_SSE1,
2551 _mm_mul_ps(onefourth_SSE,
2552 _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
2553 _mm_mul_ps(lij3_SSE1, dr_SSE1))));
2554 t1_SSE2 = _mm_sub_ps(t1_SSE2,
2555 _mm_mul_ps(onefourth_SSE,
2556 _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
2557 _mm_mul_ps(lij3_SSE2, dr_SSE2))));
2558 t1_SSE3 = _mm_sub_ps(t1_SSE3,
2559 _mm_mul_ps(onefourth_SSE,
2560 _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
2561 _mm_mul_ps(lij3_SSE3, dr_SSE3))));
2562 t2_SSE0 = _mm_mul_ps(onefourth_SSE,
2563 _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
2564 _mm_mul_ps(uij3_SSE0, dr_SSE0)));
2565 t2_SSE1 = _mm_mul_ps(onefourth_SSE,
2566 _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
2567 _mm_mul_ps(uij3_SSE1, dr_SSE1)));
2568 t2_SSE2 = _mm_mul_ps(onefourth_SSE,
2569 _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
2570 _mm_mul_ps(uij3_SSE2, dr_SSE2)));
2571 t2_SSE3 = _mm_mul_ps(onefourth_SSE,
2572 _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
2573 _mm_mul_ps(uij3_SSE3, dr_SSE3)));
2574 t2_SSE0 = _mm_sub_ps(t2_SSE0,
2575 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
2576 _mm_mul_ps(prod_SSE0, uij3_SSE0)));
2577 t2_SSE1 = _mm_sub_ps(t2_SSE1,
2578 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
2579 _mm_mul_ps(prod_SSE1, uij3_SSE1)));
2580 t2_SSE2 = _mm_sub_ps(t2_SSE2,
2581 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
2582 _mm_mul_ps(prod_SSE2, uij3_SSE2)));
2583 t2_SSE3 = _mm_sub_ps(t2_SSE3,
2584 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
2585 _mm_mul_ps(prod_SSE3, uij3_SSE3)));
2587 t3_SSE0 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
2588 _mm_mul_ps(rinv_SSE0, rinv_SSE0));
2589 t3_SSE1 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
2590 _mm_mul_ps(rinv_SSE1, rinv_SSE1));
2591 t3_SSE2 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
2592 _mm_mul_ps(rinv_SSE2, rinv_SSE2));
2593 t3_SSE3 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
2594 _mm_mul_ps(rinv_SSE3, rinv_SSE3));
            t3_SSE0         = _mm_sub_ps(t3_SSE0,
                                         _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
                                                    _mm_add_ps(one_SSE,
                                                               _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
            t3_SSE1         = _mm_sub_ps(t3_SSE1,
                                         _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
                                                    _mm_add_ps(one_SSE,
                                                               _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
            t3_SSE2         = _mm_sub_ps(t3_SSE2,
                                         _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
                                                    _mm_add_ps(one_SSE,
                                                               _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
            t3_SSE3         = _mm_sub_ps(t3_SSE3,
                                         _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
                                                    _mm_add_ps(one_SSE,
                                                               _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
2613 t1_SSE0 = _mm_mul_ps(rinv_SSE0,
2614 _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
2615 _mm_add_ps(t2_SSE0, t3_SSE0)));
2616 t1_SSE1 = _mm_mul_ps(rinv_SSE1,
2617 _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
2618 _mm_add_ps(t2_SSE1, t3_SSE1)));
2619 t1_SSE2 = _mm_mul_ps(rinv_SSE2,
2620 _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
2621 _mm_add_ps(t2_SSE2, t3_SSE2)));
2622 t1_SSE3 = _mm_mul_ps(rinv_SSE3,
2623 _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
2624 _mm_add_ps(t2_SSE3, t3_SSE3)));
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
            dadx += 4;
        }
2636 /* Epilogue part, including exclusion mask */
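        /* The precomputed exclusion masks (emask0..emask3, one stream per
         * i atom in the unrolled block) are combined with the i mask
         * below, so excluded pairs contribute exactly zero to the sums.
         */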
        for (j = nj2; j < nj3; j += UNROLLJ)
        {
2639 jmask_SSE0 = _mm_load_ps((real *)emask0);
2640 jmask_SSE1 = _mm_load_ps((real *)emask1);
2641 jmask_SSE2 = _mm_load_ps((real *)emask2);
            jmask_SSE3 = _mm_load_ps((real *)emask3);
            emask0    += UNROLLJ;
            emask1    += UNROLLJ;
            emask2    += UNROLLJ;
            emask3    += UNROLLJ;
2648 /* load j atom coordinates */
2649 jx_SSE = _mm_load_ps(x_align+j);
2650 jy_SSE = _mm_load_ps(y_align+j);
2651 jz_SSE = _mm_load_ps(z_align+j);
2653 /* Calculate distance */
2654 dx_SSE0 = _mm_sub_ps(ix_SSE0, jx_SSE);
2655 dy_SSE0 = _mm_sub_ps(iy_SSE0, jy_SSE);
2656 dz_SSE0 = _mm_sub_ps(iz_SSE0, jz_SSE);
2657 dx_SSE1 = _mm_sub_ps(ix_SSE1, jx_SSE);
2658 dy_SSE1 = _mm_sub_ps(iy_SSE1, jy_SSE);
2659 dz_SSE1 = _mm_sub_ps(iz_SSE1, jz_SSE);
2660 dx_SSE2 = _mm_sub_ps(ix_SSE2, jx_SSE);
2661 dy_SSE2 = _mm_sub_ps(iy_SSE2, jy_SSE);
2662 dz_SSE2 = _mm_sub_ps(iz_SSE2, jz_SSE);
2663 dx_SSE3 = _mm_sub_ps(ix_SSE3, jx_SSE);
2664 dy_SSE3 = _mm_sub_ps(iy_SSE3, jy_SSE);
2665 dz_SSE3 = _mm_sub_ps(iz_SSE3, jz_SSE);
2667 /* rsq = dx*dx+dy*dy+dz*dz */
2668 rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
2669 rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
2670 rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
2671 rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
2674 jmask_SSE0 = _mm_and_ps(jmask_SSE0, imask_SSE0);
2675 jmask_SSE1 = _mm_and_ps(jmask_SSE1, imask_SSE1);
2676 jmask_SSE2 = _mm_and_ps(jmask_SSE2, imask_SSE2);
2677 jmask_SSE3 = _mm_and_ps(jmask_SSE3, imask_SSE3);
2679 /* Calculate 1/r and 1/r2 */
2680 rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0);
2681 rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1);
2682 rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2);
2683 rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3);
2686 rinv_SSE0 = _mm_and_ps(rinv_SSE0, jmask_SSE0);
2687 rinv_SSE1 = _mm_and_ps(rinv_SSE1, jmask_SSE1);
2688 rinv_SSE2 = _mm_and_ps(rinv_SSE2, jmask_SSE2);
2689 rinv_SSE3 = _mm_and_ps(rinv_SSE3, jmask_SSE3);
2691 dr_SSE0 = _mm_mul_ps(rsq_SSE0, rinv_SSE0);
2692 dr_SSE1 = _mm_mul_ps(rsq_SSE1, rinv_SSE1);
2693 dr_SSE2 = _mm_mul_ps(rsq_SSE2, rinv_SSE2);
2694 dr_SSE3 = _mm_mul_ps(rsq_SSE3, rinv_SSE3);
2696 sk_aj_SSE = _mm_load_ps(obc_param+j);
2697 raj_SSE = _mm_load_ps(gb_radius+j);
2699 raj_inv_SSE = gmx_mm_inv_ps(raj_SSE);
2701 /* Evaluate influence of atom aj -> ai */
2702 t1_SSE0 = _mm_add_ps(dr_SSE0, sk_aj_SSE);
2703 t1_SSE1 = _mm_add_ps(dr_SSE1, sk_aj_SSE);
2704 t1_SSE2 = _mm_add_ps(dr_SSE2, sk_aj_SSE);
2705 t1_SSE3 = _mm_add_ps(dr_SSE3, sk_aj_SSE);
2706 t2_SSE0 = _mm_sub_ps(dr_SSE0, sk_aj_SSE);
2707 t2_SSE1 = _mm_sub_ps(dr_SSE1, sk_aj_SSE);
2708 t2_SSE2 = _mm_sub_ps(dr_SSE2, sk_aj_SSE);
2709 t2_SSE3 = _mm_sub_ps(dr_SSE3, sk_aj_SSE);
2710 t3_SSE0 = _mm_sub_ps(sk_aj_SSE, dr_SSE0);
2711 t3_SSE1 = _mm_sub_ps(sk_aj_SSE, dr_SSE1);
2712 t3_SSE2 = _mm_sub_ps(sk_aj_SSE, dr_SSE2);
2713 t3_SSE3 = _mm_sub_ps(sk_aj_SSE, dr_SSE3);
2715 obc_mask1_SSE0 = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
2716 obc_mask1_SSE1 = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
2717 obc_mask1_SSE2 = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
2718 obc_mask1_SSE3 = _mm_cmplt_ps(rai_SSE3, t1_SSE3);
2719 obc_mask2_SSE0 = _mm_cmplt_ps(rai_SSE0, t2_SSE0);
2720 obc_mask2_SSE1 = _mm_cmplt_ps(rai_SSE1, t2_SSE1);
2721 obc_mask2_SSE2 = _mm_cmplt_ps(rai_SSE2, t2_SSE2);
2722 obc_mask2_SSE3 = _mm_cmplt_ps(rai_SSE3, t2_SSE3);
2723 obc_mask3_SSE0 = _mm_cmplt_ps(rai_SSE0, t3_SSE0);
2724 obc_mask3_SSE1 = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
2725 obc_mask3_SSE2 = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
2726 obc_mask3_SSE3 = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
2727 obc_mask1_SSE0 = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0);
2728 obc_mask1_SSE1 = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1);
2729 obc_mask1_SSE2 = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2);
2730 obc_mask1_SSE3 = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3);
2732 uij_SSE0 = gmx_mm_inv_ps(t1_SSE0);
2733 uij_SSE1 = gmx_mm_inv_ps(t1_SSE1);
2734 uij_SSE2 = gmx_mm_inv_ps(t1_SSE2);
2735 uij_SSE3 = gmx_mm_inv_ps(t1_SSE3);
2736 lij_SSE0 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
2737 _mm_andnot_ps(obc_mask2_SSE0, rai_inv_SSE0));
2738 lij_SSE1 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
2739 _mm_andnot_ps(obc_mask2_SSE1, rai_inv_SSE1));
2740 lij_SSE2 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
2741 _mm_andnot_ps(obc_mask2_SSE2, rai_inv_SSE2));
2742 lij_SSE3 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
2743 _mm_andnot_ps(obc_mask2_SSE3, rai_inv_SSE3));
2744 dlij_SSE0 = _mm_and_ps(one_SSE, obc_mask2_SSE0);
2745 dlij_SSE1 = _mm_and_ps(one_SSE, obc_mask2_SSE1);
2746 dlij_SSE2 = _mm_and_ps(one_SSE, obc_mask2_SSE2);
2747 dlij_SSE3 = _mm_and_ps(one_SSE, obc_mask2_SSE3);
2749 uij2_SSE0 = _mm_mul_ps(uij_SSE0, uij_SSE0);
2750 uij2_SSE1 = _mm_mul_ps(uij_SSE1, uij_SSE1);
2751 uij2_SSE2 = _mm_mul_ps(uij_SSE2, uij_SSE2);
2752 uij2_SSE3 = _mm_mul_ps(uij_SSE3, uij_SSE3);
2753 uij3_SSE0 = _mm_mul_ps(uij2_SSE0, uij_SSE0);
2754 uij3_SSE1 = _mm_mul_ps(uij2_SSE1, uij_SSE1);
2755 uij3_SSE2 = _mm_mul_ps(uij2_SSE2, uij_SSE2);
2756 uij3_SSE3 = _mm_mul_ps(uij2_SSE3, uij_SSE3);
2757 lij2_SSE0 = _mm_mul_ps(lij_SSE0, lij_SSE0);
2758 lij2_SSE1 = _mm_mul_ps(lij_SSE1, lij_SSE1);
2759 lij2_SSE2 = _mm_mul_ps(lij_SSE2, lij_SSE2);
2760 lij2_SSE3 = _mm_mul_ps(lij_SSE3, lij_SSE3);
2761 lij3_SSE0 = _mm_mul_ps(lij2_SSE0, lij_SSE0);
2762 lij3_SSE1 = _mm_mul_ps(lij2_SSE1, lij_SSE1);
2763 lij3_SSE2 = _mm_mul_ps(lij2_SSE2, lij_SSE2);
2764 lij3_SSE3 = _mm_mul_ps(lij2_SSE3, lij_SSE3);
2766 diff2_SSE0 = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
2767 diff2_SSE1 = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
2768 diff2_SSE2 = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
2769 diff2_SSE3 = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
2770 lij_inv_SSE0 = gmx_mm_invsqrt_ps(lij2_SSE0);
2771 lij_inv_SSE1 = gmx_mm_invsqrt_ps(lij2_SSE1);
2772 lij_inv_SSE2 = gmx_mm_invsqrt_ps(lij2_SSE2);
2773 lij_inv_SSE3 = gmx_mm_invsqrt_ps(lij2_SSE3);
2774 sk2_aj_SSE = _mm_mul_ps(sk_aj_SSE, sk_aj_SSE);
2775 sk2_rinv_SSE0 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE0);
2776 sk2_rinv_SSE1 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE1);
2777 sk2_rinv_SSE2 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE2);
2778 sk2_rinv_SSE3 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE3);
2779 prod_SSE0 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
2780 prod_SSE1 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
2781 prod_SSE2 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
2782 prod_SSE3 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
2784 logterm_SSE0 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
2785 logterm_SSE1 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
2786 logterm_SSE2 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
2787 logterm_SSE3 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
2789 t1_SSE0 = _mm_sub_ps(lij_SSE0, uij_SSE0);
2790 t1_SSE1 = _mm_sub_ps(lij_SSE1, uij_SSE1);
2791 t1_SSE2 = _mm_sub_ps(lij_SSE2, uij_SSE2);
2792 t1_SSE3 = _mm_sub_ps(lij_SSE3, uij_SSE3);
            t2_SSE0         = _mm_mul_ps(diff2_SSE0,
                                         _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
                                                    prod_SSE0));
            t2_SSE1         = _mm_mul_ps(diff2_SSE1,
                                         _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
                                                    prod_SSE1));
            t2_SSE2         = _mm_mul_ps(diff2_SSE2,
                                         _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
                                                    prod_SSE2));
            t2_SSE3         = _mm_mul_ps(diff2_SSE3,
                                         _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
                                                    prod_SSE3));
2806 t3_SSE0 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
2807 t3_SSE1 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
2808 t3_SSE2 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
2809 t3_SSE3 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
2810 t1_SSE0 = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
2811 t1_SSE1 = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
2812 t1_SSE2 = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
2813 t1_SSE3 = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
2814 t4_SSE0 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE0, lij_SSE0));
2815 t4_SSE1 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE1, lij_SSE1));
2816 t4_SSE2 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE2, lij_SSE2));
2817 t4_SSE3 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE3, lij_SSE3));
2818 t4_SSE0 = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
2819 t4_SSE1 = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
2820 t4_SSE2 = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
2821 t4_SSE3 = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
2822 t1_SSE0 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
2823 t1_SSE1 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
2824 t1_SSE2 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
2825 t1_SSE3 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
2827 sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
2828 sum_ai_SSE1 = _mm_add_ps(sum_ai_SSE1, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
2829 sum_ai_SSE2 = _mm_add_ps(sum_ai_SSE2, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
2830 sum_ai_SSE3 = _mm_add_ps(sum_ai_SSE3, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
2832 t1_SSE0 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
2833 _mm_mul_ps(prod_SSE0, lij3_SSE0));
2834 t1_SSE1 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
2835 _mm_mul_ps(prod_SSE1, lij3_SSE1));
2836 t1_SSE2 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
2837 _mm_mul_ps(prod_SSE2, lij3_SSE2));
2838 t1_SSE3 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
2839 _mm_mul_ps(prod_SSE3, lij3_SSE3));
2840 t1_SSE0 = _mm_sub_ps(t1_SSE0,
2841 _mm_mul_ps(onefourth_SSE,
2842 _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
2843 _mm_mul_ps(lij3_SSE0, dr_SSE0))));
2844 t1_SSE1 = _mm_sub_ps(t1_SSE1,
2845 _mm_mul_ps(onefourth_SSE,
2846 _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
2847 _mm_mul_ps(lij3_SSE1, dr_SSE1))));
2848 t1_SSE2 = _mm_sub_ps(t1_SSE2,
2849 _mm_mul_ps(onefourth_SSE,
2850 _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
2851 _mm_mul_ps(lij3_SSE2, dr_SSE2))));
2852 t1_SSE3 = _mm_sub_ps(t1_SSE3,
2853 _mm_mul_ps(onefourth_SSE,
2854 _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
2855 _mm_mul_ps(lij3_SSE3, dr_SSE3))));
2857 t2_SSE0 = _mm_mul_ps(onefourth_SSE,
2858 _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
2859 _mm_mul_ps(uij3_SSE0, dr_SSE0)));
2860 t2_SSE1 = _mm_mul_ps(onefourth_SSE,
2861 _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
2862 _mm_mul_ps(uij3_SSE1, dr_SSE1)));
2863 t2_SSE2 = _mm_mul_ps(onefourth_SSE,
2864 _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
2865 _mm_mul_ps(uij3_SSE2, dr_SSE2)));
2866 t2_SSE3 = _mm_mul_ps(onefourth_SSE,
2867 _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
2868 _mm_mul_ps(uij3_SSE3, dr_SSE3)));
2869 t2_SSE0 = _mm_sub_ps(t2_SSE0,
2870 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
2871 _mm_mul_ps(prod_SSE0, uij3_SSE0)));
2872 t2_SSE1 = _mm_sub_ps(t2_SSE1,
2873 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
2874 _mm_mul_ps(prod_SSE1, uij3_SSE1)));
2875 t2_SSE2 = _mm_sub_ps(t2_SSE2,
2876 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
2877 _mm_mul_ps(prod_SSE2, uij3_SSE2)));
2878 t2_SSE3 = _mm_sub_ps(t2_SSE3,
2879 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
2880 _mm_mul_ps(prod_SSE3, uij3_SSE3)));
2881 t3_SSE0 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
2882 _mm_mul_ps(rinv_SSE0, rinv_SSE0));
2883 t3_SSE1 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
2884 _mm_mul_ps(rinv_SSE1, rinv_SSE1));
2885 t3_SSE2 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
2886 _mm_mul_ps(rinv_SSE2, rinv_SSE2));
2887 t3_SSE3 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
2888 _mm_mul_ps(rinv_SSE3, rinv_SSE3));
            t3_SSE0         = _mm_sub_ps(t3_SSE0,
                                         _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
                                                    _mm_add_ps(one_SSE,
                                                               _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
            t3_SSE1         = _mm_sub_ps(t3_SSE1,
                                         _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
                                                    _mm_add_ps(one_SSE,
                                                               _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
            t3_SSE2         = _mm_sub_ps(t3_SSE2,
                                         _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
                                                    _mm_add_ps(one_SSE,
                                                               _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
            t3_SSE3         = _mm_sub_ps(t3_SSE3,
                                         _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
                                                    _mm_add_ps(one_SSE,
                                                               _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
2906 t1_SSE0 = _mm_mul_ps(rinv_SSE0,
2907 _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
2908 _mm_add_ps(t2_SSE0, t3_SSE0)));
2909 t1_SSE1 = _mm_mul_ps(rinv_SSE1,
2910 _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
2911 _mm_add_ps(t2_SSE1, t3_SSE1)));
2912 t1_SSE2 = _mm_mul_ps(rinv_SSE2,
2913 _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
2914 _mm_add_ps(t2_SSE2, t3_SSE2)));
2915 t1_SSE3 = _mm_mul_ps(rinv_SSE3,
2916 _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
2917 _mm_add_ps(t2_SSE3, t3_SSE3)));
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
            dadx += 4;
2928 /* Evaluate influence of atom ai -> aj */
2929 t1_SSE0 = _mm_add_ps(dr_SSE0, sk_ai_SSE0);
2930 t1_SSE1 = _mm_add_ps(dr_SSE1, sk_ai_SSE1);
2931 t1_SSE2 = _mm_add_ps(dr_SSE2, sk_ai_SSE2);
2932 t1_SSE3 = _mm_add_ps(dr_SSE3, sk_ai_SSE3);
2933 t2_SSE0 = _mm_sub_ps(dr_SSE0, sk_ai_SSE0);
2934 t2_SSE1 = _mm_sub_ps(dr_SSE1, sk_ai_SSE1);
2935 t2_SSE2 = _mm_sub_ps(dr_SSE2, sk_ai_SSE2);
2936 t2_SSE3 = _mm_sub_ps(dr_SSE3, sk_ai_SSE3);
2937 t3_SSE0 = _mm_sub_ps(sk_ai_SSE0, dr_SSE0);
2938 t3_SSE1 = _mm_sub_ps(sk_ai_SSE1, dr_SSE1);
2939 t3_SSE2 = _mm_sub_ps(sk_ai_SSE2, dr_SSE2);
2940 t3_SSE3 = _mm_sub_ps(sk_ai_SSE3, dr_SSE3);
2942 obc_mask1_SSE0 = _mm_cmplt_ps(raj_SSE, t1_SSE0);
2943 obc_mask1_SSE1 = _mm_cmplt_ps(raj_SSE, t1_SSE1);
2944 obc_mask1_SSE2 = _mm_cmplt_ps(raj_SSE, t1_SSE2);
2945 obc_mask1_SSE3 = _mm_cmplt_ps(raj_SSE, t1_SSE3);
2946 obc_mask2_SSE0 = _mm_cmplt_ps(raj_SSE, t2_SSE0);
2947 obc_mask2_SSE1 = _mm_cmplt_ps(raj_SSE, t2_SSE1);
2948 obc_mask2_SSE2 = _mm_cmplt_ps(raj_SSE, t2_SSE2);
2949 obc_mask2_SSE3 = _mm_cmplt_ps(raj_SSE, t2_SSE3);
2950 obc_mask3_SSE0 = _mm_cmplt_ps(raj_SSE, t3_SSE0);
2951 obc_mask3_SSE1 = _mm_cmplt_ps(raj_SSE, t3_SSE1);
2952 obc_mask3_SSE2 = _mm_cmplt_ps(raj_SSE, t3_SSE2);
2953 obc_mask3_SSE3 = _mm_cmplt_ps(raj_SSE, t3_SSE3);
2954 obc_mask1_SSE0 = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0);
2955 obc_mask1_SSE1 = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1);
2956 obc_mask1_SSE2 = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2);
2957 obc_mask1_SSE3 = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3);
2959 uij_SSE0 = gmx_mm_inv_ps(t1_SSE0);
2960 uij_SSE1 = gmx_mm_inv_ps(t1_SSE1);
2961 uij_SSE2 = gmx_mm_inv_ps(t1_SSE2);
2962 uij_SSE3 = gmx_mm_inv_ps(t1_SSE3);
2963 lij_SSE0 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
2964 _mm_andnot_ps(obc_mask2_SSE0, raj_inv_SSE));
2965 lij_SSE1 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
2966 _mm_andnot_ps(obc_mask2_SSE1, raj_inv_SSE));
2967 lij_SSE2 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
2968 _mm_andnot_ps(obc_mask2_SSE2, raj_inv_SSE));
2969 lij_SSE3 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
2970 _mm_andnot_ps(obc_mask2_SSE3, raj_inv_SSE));
2971 dlij_SSE0 = _mm_and_ps(one_SSE, obc_mask2_SSE0);
2972 dlij_SSE1 = _mm_and_ps(one_SSE, obc_mask2_SSE1);
2973 dlij_SSE2 = _mm_and_ps(one_SSE, obc_mask2_SSE2);
2974 dlij_SSE3 = _mm_and_ps(one_SSE, obc_mask2_SSE3);
2976 uij2_SSE0 = _mm_mul_ps(uij_SSE0, uij_SSE0);
2977 uij2_SSE1 = _mm_mul_ps(uij_SSE1, uij_SSE1);
2978 uij2_SSE2 = _mm_mul_ps(uij_SSE2, uij_SSE2);
2979 uij2_SSE3 = _mm_mul_ps(uij_SSE3, uij_SSE3);
2980 uij3_SSE0 = _mm_mul_ps(uij2_SSE0, uij_SSE0);
2981 uij3_SSE1 = _mm_mul_ps(uij2_SSE1, uij_SSE1);
2982 uij3_SSE2 = _mm_mul_ps(uij2_SSE2, uij_SSE2);
2983 uij3_SSE3 = _mm_mul_ps(uij2_SSE3, uij_SSE3);
2984 lij2_SSE0 = _mm_mul_ps(lij_SSE0, lij_SSE0);
2985 lij2_SSE1 = _mm_mul_ps(lij_SSE1, lij_SSE1);
2986 lij2_SSE2 = _mm_mul_ps(lij_SSE2, lij_SSE2);
2987 lij2_SSE3 = _mm_mul_ps(lij_SSE3, lij_SSE3);
2988 lij3_SSE0 = _mm_mul_ps(lij2_SSE0, lij_SSE0);
2989 lij3_SSE1 = _mm_mul_ps(lij2_SSE1, lij_SSE1);
2990 lij3_SSE2 = _mm_mul_ps(lij2_SSE2, lij_SSE2);
2991 lij3_SSE3 = _mm_mul_ps(lij2_SSE3, lij_SSE3);
2993 diff2_SSE0 = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
2994 diff2_SSE1 = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
2995 diff2_SSE2 = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
2996 diff2_SSE3 = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
2997 lij_inv_SSE0 = gmx_mm_invsqrt_ps(lij2_SSE0);
2998 lij_inv_SSE1 = gmx_mm_invsqrt_ps(lij2_SSE1);
2999 lij_inv_SSE2 = gmx_mm_invsqrt_ps(lij2_SSE2);
3000 lij_inv_SSE3 = gmx_mm_invsqrt_ps(lij2_SSE3);
3001 sk2_rinv_SSE0 = _mm_mul_ps(sk2_ai_SSE0, rinv_SSE0);
3002 sk2_rinv_SSE1 = _mm_mul_ps(sk2_ai_SSE1, rinv_SSE1);
3003 sk2_rinv_SSE2 = _mm_mul_ps(sk2_ai_SSE2, rinv_SSE2);
3004 sk2_rinv_SSE3 = _mm_mul_ps(sk2_ai_SSE3, rinv_SSE3);
3005 prod_SSE0 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
3006 prod_SSE1 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
3007 prod_SSE2 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
3008 prod_SSE3 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
3010 logterm_SSE0 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
3011 logterm_SSE1 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
3012 logterm_SSE2 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
3013 logterm_SSE3 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
3014 t1_SSE0 = _mm_sub_ps(lij_SSE0, uij_SSE0);
3015 t1_SSE1 = _mm_sub_ps(lij_SSE1, uij_SSE1);
3016 t1_SSE2 = _mm_sub_ps(lij_SSE2, uij_SSE2);
3017 t1_SSE3 = _mm_sub_ps(lij_SSE3, uij_SSE3);
            t2_SSE0         = _mm_mul_ps(diff2_SSE0,
                                         _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
                                                    prod_SSE0));
            t2_SSE1         = _mm_mul_ps(diff2_SSE1,
                                         _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
                                                    prod_SSE1));
            t2_SSE2         = _mm_mul_ps(diff2_SSE2,
                                         _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
                                                    prod_SSE2));
            t2_SSE3         = _mm_mul_ps(diff2_SSE3,
                                         _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
                                                    prod_SSE3));
3030 t3_SSE0 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
3031 t3_SSE1 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
3032 t3_SSE2 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
3033 t3_SSE3 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
3034 t1_SSE0 = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
3035 t1_SSE1 = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
3036 t1_SSE2 = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
3037 t1_SSE3 = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
3038 t4_SSE0 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE0));
3039 t4_SSE1 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE1));
3040 t4_SSE2 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE2));
3041 t4_SSE3 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE3));
3042 t4_SSE0 = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
3043 t4_SSE1 = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
3044 t4_SSE2 = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
3045 t4_SSE3 = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
3046 t1_SSE0 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
3047 t1_SSE1 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
3048 t1_SSE2 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
3049 t1_SSE3 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
3051 _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
3052 gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0, obc_mask1_SSE0),
3053 _mm_and_ps(t1_SSE1, obc_mask1_SSE1),
3054 _mm_and_ps(t1_SSE2, obc_mask1_SSE2),
3055 _mm_and_ps(t1_SSE3, obc_mask1_SSE3))));
3057 t1_SSE0 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
3058 _mm_mul_ps(prod_SSE0, lij3_SSE0));
3059 t1_SSE1 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
3060 _mm_mul_ps(prod_SSE1, lij3_SSE1));
3061 t1_SSE2 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
3062 _mm_mul_ps(prod_SSE2, lij3_SSE2));
3063 t1_SSE3 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
3064 _mm_mul_ps(prod_SSE3, lij3_SSE3));
3065 t1_SSE0 = _mm_sub_ps(t1_SSE0,
3066 _mm_mul_ps(onefourth_SSE,
3067 _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
3068 _mm_mul_ps(lij3_SSE0, dr_SSE0))));
3069 t1_SSE1 = _mm_sub_ps(t1_SSE1,
3070 _mm_mul_ps(onefourth_SSE,
3071 _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
3072 _mm_mul_ps(lij3_SSE1, dr_SSE1))));
3073 t1_SSE2 = _mm_sub_ps(t1_SSE2,
3074 _mm_mul_ps(onefourth_SSE,
3075 _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
3076 _mm_mul_ps(lij3_SSE2, dr_SSE2))));
3077 t1_SSE3 = _mm_sub_ps(t1_SSE3,
3078 _mm_mul_ps(onefourth_SSE,
3079 _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
3080 _mm_mul_ps(lij3_SSE3, dr_SSE3))));
3081 t2_SSE0 = _mm_mul_ps(onefourth_SSE,
3082 _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
3083 _mm_mul_ps(uij3_SSE0, dr_SSE0)));
3084 t2_SSE1 = _mm_mul_ps(onefourth_SSE,
3085 _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
3086 _mm_mul_ps(uij3_SSE1, dr_SSE1)));
3087 t2_SSE2 = _mm_mul_ps(onefourth_SSE,
3088 _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
3089 _mm_mul_ps(uij3_SSE2, dr_SSE2)));
3090 t2_SSE3 = _mm_mul_ps(onefourth_SSE,
3091 _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
3092 _mm_mul_ps(uij3_SSE3, dr_SSE3)));
3093 t2_SSE0 = _mm_sub_ps(t2_SSE0,
3094 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
3095 _mm_mul_ps(prod_SSE0, uij3_SSE0)));
3096 t2_SSE1 = _mm_sub_ps(t2_SSE1,
3097 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
3098 _mm_mul_ps(prod_SSE1, uij3_SSE1)));
3099 t2_SSE2 = _mm_sub_ps(t2_SSE2,
3100 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
3101 _mm_mul_ps(prod_SSE2, uij3_SSE2)));
3102 t2_SSE3 = _mm_sub_ps(t2_SSE3,
3103 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
3104 _mm_mul_ps(prod_SSE3, uij3_SSE3)));
3106 t3_SSE0 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
3107 _mm_mul_ps(rinv_SSE0, rinv_SSE0));
3108 t3_SSE1 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
3109 _mm_mul_ps(rinv_SSE1, rinv_SSE1));
3110 t3_SSE2 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
3111 _mm_mul_ps(rinv_SSE2, rinv_SSE2));
3112 t3_SSE3 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
3113 _mm_mul_ps(rinv_SSE3, rinv_SSE3));
            t3_SSE0         = _mm_sub_ps(t3_SSE0,
                                         _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
                                                    _mm_add_ps(one_SSE,
                                                               _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
            t3_SSE1         = _mm_sub_ps(t3_SSE1,
                                         _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
                                                    _mm_add_ps(one_SSE,
                                                               _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
            t3_SSE2         = _mm_sub_ps(t3_SSE2,
                                         _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
                                                    _mm_add_ps(one_SSE,
                                                               _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
            t3_SSE3         = _mm_sub_ps(t3_SSE3,
                                         _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
                                                    _mm_add_ps(one_SSE,
                                                               _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
3133 t1_SSE0 = _mm_mul_ps(rinv_SSE0,
3134 _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
3135 _mm_add_ps(t2_SSE0, t3_SSE0)));
3136 t1_SSE1 = _mm_mul_ps(rinv_SSE1,
3137 _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
3138 _mm_add_ps(t2_SSE1, t3_SSE1)));
3139 t1_SSE2 = _mm_mul_ps(rinv_SSE2,
3140 _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
3141 _mm_add_ps(t2_SSE2, t3_SSE2)));
3142 t1_SSE3 = _mm_mul_ps(rinv_SSE3,
3143 _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
3144 _mm_add_ps(t2_SSE3, t3_SSE3)));
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
            dadx += 4;
        }
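        /* Reduce the four per-i-atom accumulators: after the 4x4
         * transpose each register holds one partial sum from every
         * accumulator, so three adds yield the totals for this i block.
         */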
3155 _MM_TRANSPOSE4_PS(sum_ai_SSE0, sum_ai_SSE1, sum_ai_SSE2, sum_ai_SSE3);
3156 sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0, sum_ai_SSE1);
3157 sum_ai_SSE2 = _mm_add_ps(sum_ai_SSE2, sum_ai_SSE3);
3158 sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0, sum_ai_SSE2);
        _mm_store_ps(work+i, _mm_add_ps(sum_ai_SSE0, _mm_load_ps(work+i)));
    }
    for (i = 0; i < natoms/2+1; i++)
    {
        work[i] += work[natoms+i];
    }
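    /* Fold back the contributions that the wrapped j loop accumulated
     * into the duplicated upper half of the work array (j >= natoms).
     */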
    /* Parallel summations */
    if (PAR(cr))
    {
        gmx_sum(natoms, work, cr);
    }
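    /* HCT: the effective Born radius follows directly from the
     * accumulated descreening sum S = work[i]:
     *
     *   1/bRad = 1/(rai-doffset) - S
     *
     * clamped from below by the input gb radius (min_rad).
     */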
    if (gb_algorithm == egbHCT)
    {
        for (i = 0; i < natoms; i++)
        {
            if (born->use[i] != 0)
            {
                rai             = top->atomtypes.gb_radius[mdatoms->typeA[i]]-born->gb_doffset;
                sum_ai          = 1.0/rai - work[i];
                min_rad         = rai + born->gb_doffset;
                rad             = 1.0/sum_ai;

                born->bRad[i]   = rad > min_rad ? rad : min_rad;
                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
            }
        }
    }
    else if (gb_algorithm == egbOBC)
    {
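        /* OBC: with psi = (rai-doffset)*work[i], the Born radius is the
         * tanh-rescaled combination (Onufriev, Bashford & Case, 2004):
         *
         *   1/bRad = 1/(rai-doffset) - tanh(alpha*psi - beta*psi^2 + gamma*psi^3)/rai
         *
         * and drobc stores the corresponding chain-rule factor
         * (1-tanh^2)*tchain/rai needed later for the forces.
         */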
        /* Calculate the radii */
        for (i = 0; i < natoms; i++)
        {
            if (born->use[i] != 0)
            {
                rai      = top->atomtypes.gb_radius[mdatoms->typeA[i]];
                rai_inv2 = 1.0/rai;
                rai      = rai-born->gb_doffset;
                rai_inv  = 1.0/rai;
                sum_ai   = rai * work[i];
                sum_ai2  = sum_ai  * sum_ai;
                sum_ai3  = sum_ai2 * sum_ai;

                tsum            = tanh(born->obc_alpha*sum_ai-born->obc_beta*sum_ai2+born->obc_gamma*sum_ai3);
                born->bRad[i]   = rai_inv - tsum*rai_inv2;
                born->bRad[i]   = 1.0 / born->bRad[i];

                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);

                tchain          = rai * (born->obc_alpha-2*born->obc_beta*sum_ai+3*born->obc_gamma*sum_ai2);
                born->drobc[i]  = (1.0-tsum*tsum)*tchain*rai_inv2;
            }
        }
    }
3234 genborn_allvsall_calc_chainrule_sse2_single(t_forcerec * fr,
3235 t_mdatoms * mdatoms,
3236 gmx_genborn_t * born,
3242 gmx_allvsallgb2_data_t *aadata;
3245 int nj0, nj1, nj2, nj3;
3258 real rbai, rbaj, fgb, fgb_ai, rbi;
3269 __m128 jmask_SSE0, jmask_SSE1, jmask_SSE2, jmask_SSE3;
3270 __m128 ix_SSE0, iy_SSE0, iz_SSE0;
3271 __m128 ix_SSE1, iy_SSE1, iz_SSE1;
3272 __m128 ix_SSE2, iy_SSE2, iz_SSE2;
3273 __m128 ix_SSE3, iy_SSE3, iz_SSE3;
3274 __m128 fix_SSE0, fiy_SSE0, fiz_SSE0;
3275 __m128 fix_SSE1, fiy_SSE1, fiz_SSE1;
3276 __m128 fix_SSE2, fiy_SSE2, fiz_SSE2;
3277 __m128 fix_SSE3, fiy_SSE3, fiz_SSE3;
3278 __m128 rbai_SSE0, rbai_SSE1, rbai_SSE2, rbai_SSE3;
3279 __m128 imask_SSE0, imask_SSE1, imask_SSE2, imask_SSE3;
3280 __m128 jx_SSE, jy_SSE, jz_SSE, rbaj_SSE;
3281 __m128 dx_SSE0, dy_SSE0, dz_SSE0;
3282 __m128 dx_SSE1, dy_SSE1, dz_SSE1;
3283 __m128 dx_SSE2, dy_SSE2, dz_SSE2;
3284 __m128 dx_SSE3, dy_SSE3, dz_SSE3;
3285 __m128 fgb_SSE0, fgb_ai_SSE0;
3286 __m128 fgb_SSE1, fgb_ai_SSE1;
3287 __m128 fgb_SSE2, fgb_ai_SSE2;
3288 __m128 fgb_SSE3, fgb_ai_SSE3;
3289 __m128 tx_SSE0, ty_SSE0, tz_SSE0;
3290 __m128 tx_SSE1, ty_SSE1, tz_SSE1;
3291 __m128 tx_SSE2, ty_SSE2, tz_SSE2;
3292 __m128 tx_SSE3, ty_SSE3, tz_SSE3;

    natoms   = mdatoms->nr;
    ni0      = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
    ni1      = mdatoms->start+mdatoms->homenr;

    aadata = (gmx_allvsallgb2_data_t *)paadata;

    x_align  = aadata->x_align;
    y_align  = aadata->y_align;
    z_align  = aadata->z_align;
    fx_align = aadata->fx_align;
    fy_align = aadata->fy_align;
    fz_align = aadata->fz_align;

    jindex   = aadata->jindex_gb;
    dadx     = fr->dadx;
    rb       = aadata->work;
    /* Loop to get the proper form for the Born radius term */
    if (gb_algorithm == egbSTILL)
    {
        for (i = 0; i < natoms; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = (2 * rbi * rbi * fr->dvda[i])/ONE_4PI_EPS0;
        }
    }
    else if (gb_algorithm == egbHCT)
    {
        for (i = 0; i < natoms; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * fr->dvda[i];
        }
    }
    else if (gb_algorithm == egbOBC)
    {
        for (idx = 0; idx < natoms; idx++)
        {
            rbi     = born->bRad[idx];
            rb[idx] = rbi * rbi * born->drobc[idx] * fr->dvda[idx];
        }
    }
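
    /* In all three branches rb[i] is essentially dV/d(sum_i): dvda[i]
     * rescaled by dR_i/d(sum_i) for the radius model in use (the 1/R form
     * of HCT/OBC contributes a factor R_i^2, OBC additionally drobc[i],
     * and Still carries its own 2/ONE_4PI_EPS0 normalization).  The force
     * loop below then only needs rb[i]*dadx_ij + rb[j]*dadx_ji per pair.
     */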

    for (i = 0; i < 2*natoms; i++)
    {
        fx_align[i] = 0;
        fy_align[i] = 0;
        fz_align[i] = 0;
    }

    for (i = 0; i < natoms; i++)
    {
        rb[i+natoms] = rb[i];
    }

    for (i = ni0; i < ni1; i += UNROLLI)
    {
        /* We assume shifts are NOT used for all-vs-all interactions */

        /* Load i atom data */
        ix_SSE0   = _mm_load1_ps(x_align+i);
        iy_SSE0   = _mm_load1_ps(y_align+i);
        iz_SSE0   = _mm_load1_ps(z_align+i);
        ix_SSE1   = _mm_load1_ps(x_align+i+1);
        iy_SSE1   = _mm_load1_ps(y_align+i+1);
        iz_SSE1   = _mm_load1_ps(z_align+i+1);
        ix_SSE2   = _mm_load1_ps(x_align+i+2);
        iy_SSE2   = _mm_load1_ps(y_align+i+2);
        iz_SSE2   = _mm_load1_ps(z_align+i+2);
        ix_SSE3   = _mm_load1_ps(x_align+i+3);
        iy_SSE3   = _mm_load1_ps(y_align+i+3);
        iz_SSE3   = _mm_load1_ps(z_align+i+3);

        fix_SSE0  = _mm_setzero_ps();
        fiy_SSE0  = _mm_setzero_ps();
        fiz_SSE0  = _mm_setzero_ps();
        fix_SSE1  = _mm_setzero_ps();
        fiy_SSE1  = _mm_setzero_ps();
        fiz_SSE1  = _mm_setzero_ps();
        fix_SSE2  = _mm_setzero_ps();
        fiy_SSE2  = _mm_setzero_ps();
        fiz_SSE2  = _mm_setzero_ps();
        fix_SSE3  = _mm_setzero_ps();
        fiy_SSE3  = _mm_setzero_ps();
        fiz_SSE3  = _mm_setzero_ps();

        rbai_SSE0 = _mm_load1_ps(rb+i);
        rbai_SSE1 = _mm_load1_ps(rb+i+1);
        rbai_SSE2 = _mm_load1_ps(rb+i+2);
        rbai_SSE3 = _mm_load1_ps(rb+i+3);

        /* Load limits for loop over neighbors */
        nj0       = jindex[4*i];
        nj3       = jindex[4*i+3];
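
        /* jindex_gb holds four offsets per i atom (hence the 4*i stride),
         * bracketing the exclusion-masked prologue and epilogue zones of
         * the j loop.  Only nj0 and nj3 matter here, because the radii
         * kernel stored exact zeros in dadx for all masked-out pairs.
         */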
        /* No masks necessary, since the stored chain rule derivatives will be zero in those cases! */
        for (j = nj0; j < nj3; j += UNROLLJ)
        {
            /* load j atom coordinates */
            jx_SSE   = _mm_load_ps(x_align+j);
            jy_SSE   = _mm_load_ps(y_align+j);
            jz_SSE   = _mm_load_ps(z_align+j);

            /* Calculate distance */
            dx_SSE0  = _mm_sub_ps(ix_SSE0, jx_SSE);
            dy_SSE0  = _mm_sub_ps(iy_SSE0, jy_SSE);
            dz_SSE0  = _mm_sub_ps(iz_SSE0, jz_SSE);
            dx_SSE1  = _mm_sub_ps(ix_SSE1, jx_SSE);
            dy_SSE1  = _mm_sub_ps(iy_SSE1, jy_SSE);
            dz_SSE1  = _mm_sub_ps(iz_SSE1, jz_SSE);
            dx_SSE2  = _mm_sub_ps(ix_SSE2, jx_SSE);
            dy_SSE2  = _mm_sub_ps(iy_SSE2, jy_SSE);
            dz_SSE2  = _mm_sub_ps(iz_SSE2, jz_SSE);
            dx_SSE3  = _mm_sub_ps(ix_SSE3, jx_SSE);
            dy_SSE3  = _mm_sub_ps(iy_SSE3, jy_SSE);
            dz_SSE3  = _mm_sub_ps(iz_SSE3, jz_SSE);

            rbaj_SSE = _mm_load_ps(rb+j);
            fgb_SSE0    = _mm_mul_ps(rbai_SSE0, _mm_load_ps(dadx));
            dadx       += 4;
            fgb_SSE1    = _mm_mul_ps(rbai_SSE1, _mm_load_ps(dadx));
            dadx       += 4;
            fgb_SSE2    = _mm_mul_ps(rbai_SSE2, _mm_load_ps(dadx));
            dadx       += 4;
            fgb_SSE3    = _mm_mul_ps(rbai_SSE3, _mm_load_ps(dadx));
            dadx       += 4;

            fgb_ai_SSE0 = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx));
            dadx       += 4;
            fgb_ai_SSE1 = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx));
            dadx       += 4;
            fgb_ai_SSE2 = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx));
            dadx       += 4;
            fgb_ai_SSE3 = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx));
            dadx       += 4;
            /* Total force between ai and aj is the sum of ai->aj and aj->ai */
            fgb_SSE0 = _mm_add_ps(fgb_SSE0, fgb_ai_SSE0);
            fgb_SSE1 = _mm_add_ps(fgb_SSE1, fgb_ai_SSE1);
            fgb_SSE2 = _mm_add_ps(fgb_SSE2, fgb_ai_SSE2);
            fgb_SSE3 = _mm_add_ps(fgb_SSE3, fgb_ai_SSE3);

            /* Calculate temporary vectorial force */
            tx_SSE0  = _mm_mul_ps(fgb_SSE0, dx_SSE0);
            ty_SSE0  = _mm_mul_ps(fgb_SSE0, dy_SSE0);
            tz_SSE0  = _mm_mul_ps(fgb_SSE0, dz_SSE0);
            tx_SSE1  = _mm_mul_ps(fgb_SSE1, dx_SSE1);
            ty_SSE1  = _mm_mul_ps(fgb_SSE1, dy_SSE1);
            tz_SSE1  = _mm_mul_ps(fgb_SSE1, dz_SSE1);
            tx_SSE2  = _mm_mul_ps(fgb_SSE2, dx_SSE2);
            ty_SSE2  = _mm_mul_ps(fgb_SSE2, dy_SSE2);
            tz_SSE2  = _mm_mul_ps(fgb_SSE2, dz_SSE2);
            tx_SSE3  = _mm_mul_ps(fgb_SSE3, dx_SSE3);
            ty_SSE3  = _mm_mul_ps(fgb_SSE3, dy_SSE3);
            tz_SSE3  = _mm_mul_ps(fgb_SSE3, dz_SSE3);

            /* Increment i atom force */
            fix_SSE0 = _mm_add_ps(fix_SSE0, tx_SSE0);
            fiy_SSE0 = _mm_add_ps(fiy_SSE0, ty_SSE0);
            fiz_SSE0 = _mm_add_ps(fiz_SSE0, tz_SSE0);
            fix_SSE1 = _mm_add_ps(fix_SSE1, tx_SSE1);
            fiy_SSE1 = _mm_add_ps(fiy_SSE1, ty_SSE1);
            fiz_SSE1 = _mm_add_ps(fiz_SSE1, tz_SSE1);
            fix_SSE2 = _mm_add_ps(fix_SSE2, tx_SSE2);
            fiy_SSE2 = _mm_add_ps(fiy_SSE2, ty_SSE2);
            fiz_SSE2 = _mm_add_ps(fiz_SSE2, tz_SSE2);
            fix_SSE3 = _mm_add_ps(fix_SSE3, tx_SSE3);
            fiy_SSE3 = _mm_add_ps(fiy_SSE3, ty_SSE3);
            fiz_SSE3 = _mm_add_ps(fiz_SSE3, tz_SSE3);
            /* Decrement j atom force */
            _mm_store_ps(fx_align+j,
                         _mm_sub_ps( _mm_load_ps(fx_align+j), gmx_mm_sum4_ps(tx_SSE0, tx_SSE1, tx_SSE2, tx_SSE3) ));
            _mm_store_ps(fy_align+j,
                         _mm_sub_ps( _mm_load_ps(fy_align+j), gmx_mm_sum4_ps(ty_SSE0, ty_SSE1, ty_SSE2, ty_SSE3) ));
            _mm_store_ps(fz_align+j,
                         _mm_sub_ps( _mm_load_ps(fz_align+j), gmx_mm_sum4_ps(tz_SSE0, tz_SSE1, tz_SSE2, tz_SSE3) ));
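
            /* gmx_mm_sum4_ps adds its four arguments element-wise, so lane
             * k of each sum is the total force on j atom j+k from the four
             * i atoms of this chunk; subtracting it from the j forces is
             * Newton's third law.  Scalar picture (illustration only):
             */
#if 0
            for (k = 0; k < 4; k++)
            {
                fx_align[j+k] -= tx_0[k] + tx_1[k] + tx_2[k] + tx_3[k];
            }
#endif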
        }

        /* Add i forces to mem; there is no shifted force list, since
         * all-vs-all interactions use no shifts */
        _MM_TRANSPOSE4_PS(fix_SSE0, fix_SSE1, fix_SSE2, fix_SSE3);
        fix_SSE0 = _mm_add_ps(fix_SSE0, fix_SSE1);
        fix_SSE2 = _mm_add_ps(fix_SSE2, fix_SSE3);
        fix_SSE0 = _mm_add_ps(fix_SSE0, fix_SSE2);
        _mm_store_ps(fx_align+i, _mm_add_ps(fix_SSE0, _mm_load_ps(fx_align+i)));

        _MM_TRANSPOSE4_PS(fiy_SSE0, fiy_SSE1, fiy_SSE2, fiy_SSE3);
        fiy_SSE0 = _mm_add_ps(fiy_SSE0, fiy_SSE1);
        fiy_SSE2 = _mm_add_ps(fiy_SSE2, fiy_SSE3);
        fiy_SSE0 = _mm_add_ps(fiy_SSE0, fiy_SSE2);
        _mm_store_ps(fy_align+i, _mm_add_ps(fiy_SSE0, _mm_load_ps(fy_align+i)));

        _MM_TRANSPOSE4_PS(fiz_SSE0, fiz_SSE1, fiz_SSE2, fiz_SSE3);
        fiz_SSE0 = _mm_add_ps(fiz_SSE0, fiz_SSE1);
        fiz_SSE2 = _mm_add_ps(fiz_SSE2, fiz_SSE3);
        fiz_SSE0 = _mm_add_ps(fiz_SSE0, fiz_SSE2);
        _mm_store_ps(fz_align+i, _mm_add_ps(fiz_SSE0, _mm_load_ps(fz_align+i)));
    }

    for (i = 0; i < natoms; i++)
    {
        f[3*i]   += fx_align[i] + fx_align[natoms+i];
        f[3*i+1] += fy_align[i] + fy_align[natoms+i];
        f[3*i+2] += fz_align[i] + fz_align[natoms+i];
    }

    return 0;
}
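
/* The all-vs-all kernels work on periodically duplicated coordinates, so
 * atom i and atom i+natoms are images of the same particle and all aligned
 * arrays are 2*natoms long.  The final loop above folds the two force
 * images together before adding them to the real force array f[].
 */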

#else

/* dummy variable when not using SSE */
int genborn_allvsall_sse2_single_dummy;

#endif