/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2009, The GROMACS Development Team.
 * Copyright (c) 2012,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gromacs/legacyheaders/genborn.h"
#include "gromacs/legacyheaders/network.h"
#include "gromacs/legacyheaders/types/simple.h"
#include "gromacs/math/units.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/genborn_allvsall.h"
#include "gromacs/utility/smalloc.h"

#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)

#include <gmx_sse2_single.h>
    int **                   prologue_mask_gb;
gmx_allvsallgb2_data_t;
static gmx_inline int
calc_maxoffset(int i, int natoms)
{
    int maxoffset;

    if ((natoms % 2) == 1)
    {
        /* Odd number of atoms, easy */
        maxoffset = natoms/2;
    }
    else if ((natoms % 4) == 0)
    {
        /* Multiple of four is hard */
        maxoffset = ((i < natoms/2) == ((i % 2) == 0)) ? natoms/2 : natoms/2-1;
    }
    else
    {
        /* natoms is even, but natoms/2 is odd */
        maxoffset = ((i % 2) == 0) ? natoms/2 : natoms/2-1;
    }

    return maxoffset;
}
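
/* Worked example (added; not in the original source): the all-vs-all kernel
 * computes each pair exactly once by letting atom i interact with the
 * maxoffset atoms that follow it, wrapping around past natoms. With
 * natoms = 9 (odd), every atom gets maxoffset = 4, which covers all 36
 * pairs once. With even natoms the pair (i, i+natoms/2) would appear in
 * both half-shells, so maxoffset alternates between natoms/2 and
 * natoms/2-1 to count that pair exactly once.
 */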
setup_gb_exclusions_and_indices(gmx_allvsallgb2_data_t * aadata,
    int ni0, ni1, nj0, nj1, nj;
    int imin, imax, iexcl;
    int firstinteraction;
    /* This routine can appear to be a bit complex, but it is mostly book-keeping.
     * To enable the fast all-vs-all kernel we need to be able to stream through all coordinates
     * whether they should interact or not.
     *
     * To avoid looping over the exclusions, we create a simple mask that is 1 if the interaction
     * should be present, otherwise 0. Since exclusions typically only occur when i & j are close,
     * we create a jindex array with three elements per i atom: the starting point, the point to
     * which we need to check exclusions, and the end point.
     * This way we only have to allocate a short exclusion mask per i atom.
     */
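
    /* Sketch (added; layout inferred from the code below): jindex_gb in fact
     * holds four entries per i atom:
     *   jindex_gb[4*i]   - first j of the prologue (rounded down to UNROLLJ)
     *   jindex_gb[4*i+1] - end of prologue = start of the unmasked main loop
     *   jindex_gb[4*i+2] - end of main loop = start of the epilogue
     *   jindex_gb[4*i+3] - end of the epilogue
     * Prologue tiles carry per-pair exclusion masks, the main part runs with
     * no masking at all, and the epilogue only masks out j > i+maxoffset.
     */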
    ni0 = (start/UNROLLI)*UNROLLI;
    ni1 = ((end+UNROLLI-1)/UNROLLI)*UNROLLI;

    /* Set the interaction mask to only enable the i atoms we want to include */
    snew(pi, natoms+UNROLLI+2*SIMD_WIDTH);
    aadata->imask = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
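
    /* Note (added): adding 16 and masking with ~15 rounds the freshly
     * allocated pointer up to the next 16-byte boundary, as required for
     * aligned SSE loads/stores; the extra 2*SIMD_WIDTH elements requested
     * from snew() guarantee the shifted pointer still has enough room.
     */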
    for (i = 0; i < natoms+UNROLLI; i++)
        aadata->imask[i] = (i >= start && i < end) ? 0xFFFFFFFF : 0;

    /* Allocate memory for our modified jindex array */
    snew(aadata->jindex_gb, 4*(natoms+UNROLLI));
    for (i = 0; i < 4*(natoms+UNROLLI); i++)
        aadata->jindex_gb[i] = 0;

    /* Create the exclusion masks for the prologue part */
    snew(aadata->prologue_mask_gb, natoms+UNROLLI); /* list of pointers */

    /* First zero everything to avoid uninitialized data */
    for (i = 0; i < natoms+UNROLLI; i++)
        aadata->prologue_mask_gb[i] = NULL;

    /* Calculate the largest exclusion range we need for each UNROLLI-tuplet of i atoms. */
    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
        max_excl_offset = -1;

        /* First find maxoffset for the next 4 atoms (or fewer if we are close to end) */
        imax = ((ibase+UNROLLI) < end) ? (ibase+UNROLLI) : end;

        /* Which atom is the first we (might) interact with? */
        imin = natoms; /* Guaranteed to be overwritten below by 'firstinteraction' */
        for (i = ibase; i < imax; i++)
            /* Before exclusions, which atom is the first we (might) interact with? */
            firstinteraction = i+1;
            max_offset       = calc_maxoffset(i, natoms);

            for (j = 0; j < ilist[F_GB12].nr; j += 3)
                a1 = ilist[F_GB12].iatoms[j+1];
                a2 = ilist[F_GB12].iatoms[j+2];
                if (k == firstinteraction)

            for (j = 0; j < ilist[F_GB13].nr; j += 3)
                a1 = ilist[F_GB13].iatoms[j+1];
                a2 = ilist[F_GB13].iatoms[j+2];
                if (k == firstinteraction)

            for (j = 0; j < ilist[F_GB14].nr; j += 3)
                a1 = ilist[F_GB14].iatoms[j+1];
                a2 = ilist[F_GB14].iatoms[j+2];
                if (k == firstinteraction)

            imin = (firstinteraction < imin) ? firstinteraction : imin;

        /* round down to j unrolling factor */
        imin = (imin/UNROLLJ)*UNROLLJ;
        for (i = ibase; i < imax; i++)
            max_offset = calc_maxoffset(i, natoms);

            for (j = 0; j < ilist[F_GB12].nr; j += 3)
                a1 = ilist[F_GB12].iatoms[j+1];
                a2 = ilist[F_GB12].iatoms[j+2];
                if (k > i+max_offset)
                if (k+natoms <= max_offset)
                max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;

            for (j = 0; j < ilist[F_GB13].nr; j += 3)
                a1 = ilist[F_GB13].iatoms[j+1];
                a2 = ilist[F_GB13].iatoms[j+2];
                if (k > i+max_offset)
                if (k+natoms <= max_offset)
                max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;

            for (j = 0; j < ilist[F_GB14].nr; j += 3)
                a1 = ilist[F_GB14].iatoms[j+1];
                a2 = ilist[F_GB14].iatoms[j+2];
                if (k > i+max_offset)
                if (k+natoms <= max_offset)
                max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;

        /* The offset specifies the last atom to be excluded, so add one unit to get an upper loop limit */

        /* round up to j unrolling factor */
        max_excl_offset = (max_excl_offset/UNROLLJ+1)*UNROLLJ;
        /* Set the length of all prologue masks to this value (even for i > end) */
        for (i = ibase; i < ibase+UNROLLI; i++)
            aadata->jindex_gb[4*i]   = imin;
            aadata->jindex_gb[4*i+1] = imin+max_excl_offset;

    /* Now the hard part: loop over it all again to calculate the actual contents of the prologue masks */
    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
        for (i = ibase; i < ibase+UNROLLI; i++)
            nj   = aadata->jindex_gb[4*i+1] - aadata->jindex_gb[4*i];
            imin = aadata->jindex_gb[4*i];

            /* Allocate aligned memory */
            snew(pi, nj+2*SIMD_WIDTH);
            aadata->prologue_mask_gb[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));

            max_offset = calc_maxoffset(i, natoms);

            /* Include interactions i+1 <= j <= i+maxoffset */
            for (k = 0; k < nj; k++)
                j = imin + k;
                if ( (j > i) && (j <= i+max_offset) )
                    aadata->prologue_mask_gb[i][k] = 0xFFFFFFFF;
                else
                    aadata->prologue_mask_gb[i][k] = 0;
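
            /* Note (added): the masks are plain 0xFFFFFFFF/0 integer bit
             * patterns so they can be loaded with _mm_load_ps and applied
             * with _mm_and_ps: and-ing against an all-ones lane keeps a
             * value, and-ing against a zero lane clears it.
             */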
            /* Clear out the explicit exclusions */
            for (j = 0; j < ilist[F_GB12].nr; j += 3)
                a1 = ilist[F_GB12].iatoms[j+1];
                a2 = ilist[F_GB12].iatoms[j+2];
                if (k > i+max_offset)
                if (k+natoms <= max_offset)
                aadata->prologue_mask_gb[i][k] = 0;

            for (j = 0; j < ilist[F_GB13].nr; j += 3)
                a1 = ilist[F_GB13].iatoms[j+1];
                a2 = ilist[F_GB13].iatoms[j+2];
                if (k > i+max_offset)
                if (k+natoms <= max_offset)
                aadata->prologue_mask_gb[i][k] = 0;

            for (j = 0; j < ilist[F_GB14].nr; j += 3)
                a1 = ilist[F_GB14].iatoms[j+1];
                a2 = ilist[F_GB14].iatoms[j+2];
                if (k > i+max_offset)
                if (k+natoms <= max_offset)
                aadata->prologue_mask_gb[i][k] = 0;

    /* Construct the epilogue mask - this just contains the check for maxoffset */
    snew(aadata->epilogue_mask, natoms+UNROLLI);

    /* First zero everything to avoid uninitialized data */
    for (i = 0; i < natoms+UNROLLI; i++)
        aadata->jindex_gb[4*i+2] = aadata->jindex_gb[4*i+1];
        aadata->jindex_gb[4*i+3] = aadata->jindex_gb[4*i+1];
        aadata->epilogue_mask[i] = NULL;

    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
        /* Find the lowest index for which we need to use the epilogue */
        imin       = ibase;
        max_offset = calc_maxoffset(imin, natoms);
        imin       = imin + 1 + max_offset;

        /* Find largest index for which we need to use the epilogue */
        imax = ibase + UNROLLI-1;
        imax = (imax < end) ? imax : end;

        max_offset = calc_maxoffset(imax, natoms);
        imax       = imax + 1 + max_offset + UNROLLJ - 1;

        for (i = ibase; i < ibase+UNROLLI; i++)
            /* Start of epilogue - round down to j tile limit */
            aadata->jindex_gb[4*i+2] = (imin/UNROLLJ)*UNROLLJ;
            /* Make sure we don't overlap - for small systems everything is done in the prologue */
            aadata->jindex_gb[4*i+2] = (aadata->jindex_gb[4*i+1] > aadata->jindex_gb[4*i+2]) ? aadata->jindex_gb[4*i+1] : aadata->jindex_gb[4*i+2];
            /* Round upwards to j tile limit */
            aadata->jindex_gb[4*i+3] = (imax/UNROLLJ)*UNROLLJ;
            /* Make sure we don't have a negative range for the epilogue */
            aadata->jindex_gb[4*i+3] = (aadata->jindex_gb[4*i+2] > aadata->jindex_gb[4*i+3]) ? aadata->jindex_gb[4*i+2] : aadata->jindex_gb[4*i+3];
    /* And fill it with data... */
    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
        for (i = ibase; i < ibase+UNROLLI; i++)
            nj = aadata->jindex_gb[4*i+3] - aadata->jindex_gb[4*i+2];

            /* Allocate aligned memory */
            snew(pi, nj+2*SIMD_WIDTH);
            aadata->epilogue_mask[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));

            max_offset = calc_maxoffset(i, natoms);

            for (k = 0; k < nj; k++)
                j                          = aadata->jindex_gb[4*i+2] + k;
                aadata->epilogue_mask[i][k] = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
genborn_allvsall_setup(gmx_allvsallgb2_data_t ** p_aadata,
                       gmx_localtop_t          * top,
                       gmx_genborn_t           * born,
    gmx_allvsallgb2_data_t *aadata;

    natoms = mdatoms->nr;

    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->x_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->y_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->z_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->fx_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->fy_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->fz_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->gb_radius = (real *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->workparam = (real *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->work = (real *) (((size_t) p + 16) & (~((size_t) 15)));

    for (i = 0; i < mdatoms->nr; i++)
        aadata->gb_radius[i] = top->atomtypes.gb_radius[mdatoms->typeA[i]] - radius_offset;
        if (gb_algorithm == egbSTILL)
            aadata->workparam[i] = born->vsolv[i];
        else if (gb_algorithm == egbOBC)
            aadata->workparam[i] = born->param[i];
        aadata->work[i] = 0.0;

    for (i = 0; i < mdatoms->nr; i++)
        aadata->gb_radius[natoms+i] = aadata->gb_radius[i];
        aadata->workparam[natoms+i] = aadata->workparam[i];
        aadata->work[natoms+i]      = aadata->work[i];
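
    /* Note (added): every per-atom array is replicated into [natoms, 2*natoms)
     * because the half-shell j loop deliberately runs past natoms; the wrapped
     * copy lets the SIMD inner loops stream linearly instead of taking a
     * modulo on every j index.
     */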
    for (i = 0; i < 2*natoms+SIMD_WIDTH; i++)
        aadata->x_align[i]  = 0.0;
        aadata->y_align[i]  = 0.0;
        aadata->z_align[i]  = 0.0;
        aadata->fx_align[i] = 0.0;
        aadata->fy_align[i] = 0.0;
        aadata->fz_align[i] = 0.0;

    setup_gb_exclusions_and_indices(aadata, top->idef.il, 0, mdatoms->homenr, mdatoms->nr,
                                    bInclude12, bInclude13, bInclude14);
genborn_allvsall_calc_still_radii_sse2_single(t_forcerec *    fr,
                                              gmx_genborn_t * born,
                                              gmx_localtop_t * top,
    gmx_allvsallgb2_data_t *aadata;
    int                     nj0, nj1, nj2, nj3;
    real                    irsq, idr4, idr6;
    real                    raj, rvdw, ratio;
    real                    vaj, ccf, dccf, theta, cosq;
    real                    term, prod, icf4, icf6, gpi2, factor, sinq;

    __m128                  ix_SSE0, iy_SSE0, iz_SSE0;
    __m128                  ix_SSE1, iy_SSE1, iz_SSE1;
    __m128                  ix_SSE2, iy_SSE2, iz_SSE2;
    __m128                  ix_SSE3, iy_SSE3, iz_SSE3;
    __m128                  gpi_SSE0, rai_SSE0, prod_ai_SSE0;
    __m128                  gpi_SSE1, rai_SSE1, prod_ai_SSE1;
    __m128                  gpi_SSE2, rai_SSE2, prod_ai_SSE2;
    __m128                  gpi_SSE3, rai_SSE3, prod_ai_SSE3;
    __m128                  imask_SSE0, jmask_SSE0;
    __m128                  imask_SSE1, jmask_SSE1;
    __m128                  imask_SSE2, jmask_SSE2;
    __m128                  imask_SSE3, jmask_SSE3;
    __m128                  jx_SSE, jy_SSE, jz_SSE;
    __m128                  dx_SSE0, dy_SSE0, dz_SSE0;
    __m128                  dx_SSE1, dy_SSE1, dz_SSE1;
    __m128                  dx_SSE2, dy_SSE2, dz_SSE2;
    __m128                  dx_SSE3, dy_SSE3, dz_SSE3;
    __m128                  rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0;
    __m128                  rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1;
    __m128                  rsq_SSE2, rinv_SSE2, irsq_SSE2, idr4_SSE2, idr6_SSE2;
    __m128                  rsq_SSE3, rinv_SSE3, irsq_SSE3, idr4_SSE3, idr6_SSE3;
    __m128                  raj_SSE, vaj_SSE, prod_SSE;
    __m128                  rvdw_SSE0, ratio_SSE0;
    __m128                  rvdw_SSE1, ratio_SSE1;
    __m128                  rvdw_SSE2, ratio_SSE2;
    __m128                  rvdw_SSE3, ratio_SSE3;
    __m128                  theta_SSE0, sinq_SSE0, cosq_SSE0, term_SSE0;
    __m128                  theta_SSE1, sinq_SSE1, cosq_SSE1, term_SSE1;
    __m128                  theta_SSE2, sinq_SSE2, cosq_SSE2, term_SSE2;
    __m128                  theta_SSE3, sinq_SSE3, cosq_SSE3, term_SSE3;
    __m128                  ccf_SSE0, dccf_SSE0;
    __m128                  ccf_SSE1, dccf_SSE1;
    __m128                  ccf_SSE2, dccf_SSE2;
    __m128                  ccf_SSE3, dccf_SSE3;
    __m128                  icf4_SSE0, icf6_SSE0;
    __m128                  icf4_SSE1, icf6_SSE1;
    __m128                  icf4_SSE2, icf6_SSE2;
    __m128                  icf4_SSE3, icf6_SSE3;
    __m128                  half_SSE, one_SSE, two_SSE, four_SSE;
    __m128                  still_p4_SSE, still_p5inv_SSE, still_pip5_SSE;
    natoms = mdatoms->nr;
    ni1    = mdatoms->homenr;

    aadata = *((gmx_allvsallgb2_data_t **)paadata);

        genborn_allvsall_setup(&aadata, top, born, mdatoms, 0.0,
                               egbSTILL, FALSE, FALSE, TRUE);
        *((gmx_allvsallgb2_data_t **)paadata) = aadata;

    x_align = aadata->x_align;
    y_align = aadata->y_align;
    z_align = aadata->z_align;

    gb_radius = aadata->gb_radius;
    vsolv     = aadata->workparam;

    jindex = aadata->jindex_gb;

    still_p4_SSE    = _mm_set1_ps(STILL_P4);
    still_p5inv_SSE = _mm_set1_ps(STILL_P5INV);
    still_pip5_SSE  = _mm_set1_ps(STILL_PIP5);
    half_SSE        = _mm_set1_ps(0.5);
    one_SSE         = _mm_set1_ps(1.0);
    two_SSE         = _mm_set1_ps(2.0);
    four_SSE        = _mm_set1_ps(4.0);

    /* This will be summed, so it has to extend to natoms + buffer */
    for (i = 0; i < natoms+1+natoms/2; i++)
        work[i] = 0;

    for (i = ni0; i < ni1+1+natoms/2; i++)
        k          = i % natoms;
        x_align[i] = x[3*k];
        y_align[i] = x[3*k+1];
        z_align[i] = x[3*k+2];
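
    /* Note (added): the copy above wraps the source index back into
     * [0, natoms), so the entries past natoms are the duplicated
     * coordinates that the half-shell j loop reads without wrapping.
     */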
    for (i = ni0; i < ni1; i += UNROLLI)
        /* We assume shifts are NOT used for all-vs-all interactions */

        /* Load i atom data */
        ix_SSE0 = _mm_load1_ps(x_align+i);
        iy_SSE0 = _mm_load1_ps(y_align+i);
        iz_SSE0 = _mm_load1_ps(z_align+i);
        ix_SSE1 = _mm_load1_ps(x_align+i+1);
        iy_SSE1 = _mm_load1_ps(y_align+i+1);
        iz_SSE1 = _mm_load1_ps(z_align+i+1);
        ix_SSE2 = _mm_load1_ps(x_align+i+2);
        iy_SSE2 = _mm_load1_ps(y_align+i+2);
        iz_SSE2 = _mm_load1_ps(z_align+i+2);
        ix_SSE3 = _mm_load1_ps(x_align+i+3);
        iy_SSE3 = _mm_load1_ps(y_align+i+3);
        iz_SSE3 = _mm_load1_ps(z_align+i+3);

        gpi_SSE0 = _mm_setzero_ps();
        gpi_SSE1 = _mm_setzero_ps();
        gpi_SSE2 = _mm_setzero_ps();
        gpi_SSE3 = _mm_setzero_ps();

        rai_SSE0 = _mm_load1_ps(gb_radius+i);
        rai_SSE1 = _mm_load1_ps(gb_radius+i+1);
        rai_SSE2 = _mm_load1_ps(gb_radius+i+2);
        rai_SSE3 = _mm_load1_ps(gb_radius+i+3);

        prod_ai_SSE0 = _mm_set1_ps(STILL_P4*vsolv[i]);
        prod_ai_SSE1 = _mm_set1_ps(STILL_P4*vsolv[i+1]);
        prod_ai_SSE2 = _mm_set1_ps(STILL_P4*vsolv[i+2]);
        prod_ai_SSE3 = _mm_set1_ps(STILL_P4*vsolv[i+3]);

        /* Load limits for loop over neighbors */
        nj0 = jindex[4*i];
        nj1 = jindex[4*i+1];
        nj2 = jindex[4*i+2];
        nj3 = jindex[4*i+3];

        pmask0 = aadata->prologue_mask_gb[i];
        pmask1 = aadata->prologue_mask_gb[i+1];
        pmask2 = aadata->prologue_mask_gb[i+2];
        pmask3 = aadata->prologue_mask_gb[i+3];
        emask0 = aadata->epilogue_mask[i];
        emask1 = aadata->epilogue_mask[i+1];
        emask2 = aadata->epilogue_mask[i+2];
        emask3 = aadata->epilogue_mask[i+3];

        imask_SSE0 = _mm_load1_ps((real *)(aadata->imask+i));
        imask_SSE1 = _mm_load1_ps((real *)(aadata->imask+i+1));
        imask_SSE2 = _mm_load1_ps((real *)(aadata->imask+i+2));
        imask_SSE3 = _mm_load1_ps((real *)(aadata->imask+i+3));
        /* Prologue part, including exclusion mask */
        for (j = nj0; j < nj1; j += UNROLLJ)
            jmask_SSE0 = _mm_load_ps((real *)pmask0);
            jmask_SSE1 = _mm_load_ps((real *)pmask1);
            jmask_SSE2 = _mm_load_ps((real *)pmask2);
            jmask_SSE3 = _mm_load_ps((real *)pmask3);

            /* load j atom coordinates */
            jx_SSE = _mm_load_ps(x_align+j);
            jy_SSE = _mm_load_ps(y_align+j);
            jz_SSE = _mm_load_ps(z_align+j);

            /* Calculate distance */
            dx_SSE0 = _mm_sub_ps(ix_SSE0, jx_SSE);
            dy_SSE0 = _mm_sub_ps(iy_SSE0, jy_SSE);
            dz_SSE0 = _mm_sub_ps(iz_SSE0, jz_SSE);
            dx_SSE1 = _mm_sub_ps(ix_SSE1, jx_SSE);
            dy_SSE1 = _mm_sub_ps(iy_SSE1, jy_SSE);
            dz_SSE1 = _mm_sub_ps(iz_SSE1, jz_SSE);
            dx_SSE2 = _mm_sub_ps(ix_SSE2, jx_SSE);
            dy_SSE2 = _mm_sub_ps(iy_SSE2, jy_SSE);
            dz_SSE2 = _mm_sub_ps(iz_SSE2, jz_SSE);
            dx_SSE3 = _mm_sub_ps(ix_SSE3, jx_SSE);
            dy_SSE3 = _mm_sub_ps(iy_SSE3, jy_SSE);
            dz_SSE3 = _mm_sub_ps(iz_SSE3, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
            rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
            rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);

            jmask_SSE0 = _mm_and_ps(jmask_SSE0, imask_SSE0);
            jmask_SSE1 = _mm_and_ps(jmask_SSE1, imask_SSE1);
            jmask_SSE2 = _mm_and_ps(jmask_SSE2, imask_SSE2);
            jmask_SSE3 = _mm_and_ps(jmask_SSE3, imask_SSE3);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0);
            rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1);
            rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2);
            rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3);

            rinv_SSE0 = _mm_and_ps(rinv_SSE0, jmask_SSE0);
            rinv_SSE1 = _mm_and_ps(rinv_SSE1, jmask_SSE1);
            rinv_SSE2 = _mm_and_ps(rinv_SSE2, jmask_SSE2);
            rinv_SSE3 = _mm_and_ps(rinv_SSE3, jmask_SSE3);
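
            /* Note (added): and-ing rinv with the mask zeroes excluded
             * pairs (and the i==j self lane), so every quantity derived
             * from rinv below vanishes for masked-out lanes.
             */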
            irsq_SSE0 = _mm_mul_ps(rinv_SSE0, rinv_SSE0);
            irsq_SSE1 = _mm_mul_ps(rinv_SSE1, rinv_SSE1);
            irsq_SSE2 = _mm_mul_ps(rinv_SSE2, rinv_SSE2);
            irsq_SSE3 = _mm_mul_ps(rinv_SSE3, rinv_SSE3);
            idr4_SSE0 = _mm_mul_ps(irsq_SSE0, irsq_SSE0);
            idr4_SSE1 = _mm_mul_ps(irsq_SSE1, irsq_SSE1);
            idr4_SSE2 = _mm_mul_ps(irsq_SSE2, irsq_SSE2);
            idr4_SSE3 = _mm_mul_ps(irsq_SSE3, irsq_SSE3);
            idr6_SSE0 = _mm_mul_ps(idr4_SSE0, irsq_SSE0);
            idr6_SSE1 = _mm_mul_ps(idr4_SSE1, irsq_SSE1);
            idr6_SSE2 = _mm_mul_ps(idr4_SSE2, irsq_SSE2);
            idr6_SSE3 = _mm_mul_ps(idr4_SSE3, irsq_SSE3);

            raj_SSE = _mm_load_ps(gb_radius+j);
            vaj_SSE = _mm_load_ps(vsolv+j);

            rvdw_SSE0 = _mm_add_ps(rai_SSE0, raj_SSE);
            rvdw_SSE1 = _mm_add_ps(rai_SSE1, raj_SSE);
            rvdw_SSE2 = _mm_add_ps(rai_SSE2, raj_SSE);
            rvdw_SSE3 = _mm_add_ps(rai_SSE3, raj_SSE);

            ratio_SSE0 = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0, rvdw_SSE0)));
            ratio_SSE1 = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1, rvdw_SSE1)));
            ratio_SSE2 = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2, rvdw_SSE2)));
            ratio_SSE3 = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3, rvdw_SSE3)));

            ratio_SSE0 = _mm_min_ps(ratio_SSE0, still_p5inv_SSE);
            ratio_SSE1 = _mm_min_ps(ratio_SSE1, still_p5inv_SSE);
            ratio_SSE2 = _mm_min_ps(ratio_SSE2, still_p5inv_SSE);
            ratio_SSE3 = _mm_min_ps(ratio_SSE3, still_p5inv_SSE);
            theta_SSE0 = _mm_mul_ps(ratio_SSE0, still_pip5_SSE);
            theta_SSE1 = _mm_mul_ps(ratio_SSE1, still_pip5_SSE);
            theta_SSE2 = _mm_mul_ps(ratio_SSE2, still_pip5_SSE);
            theta_SSE3 = _mm_mul_ps(ratio_SSE3, still_pip5_SSE);
            gmx_mm_sincos_ps(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
            gmx_mm_sincos_ps(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
            gmx_mm_sincos_ps(theta_SSE2, &sinq_SSE2, &cosq_SSE2);
            gmx_mm_sincos_ps(theta_SSE3, &sinq_SSE3, &cosq_SSE3);
            term_SSE0 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE0));
            term_SSE1 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE1));
            term_SSE2 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE2));
            term_SSE3 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE3));
            ccf_SSE0  = _mm_mul_ps(term_SSE0, term_SSE0);
            ccf_SSE1  = _mm_mul_ps(term_SSE1, term_SSE1);
            ccf_SSE2  = _mm_mul_ps(term_SSE2, term_SSE2);
            ccf_SSE3  = _mm_mul_ps(term_SSE3, term_SSE3);
            dccf_SSE0 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE0),
                                   _mm_mul_ps(sinq_SSE0, theta_SSE0));
            dccf_SSE1 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE1),
                                   _mm_mul_ps(sinq_SSE1, theta_SSE1));
            dccf_SSE2 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE2),
                                   _mm_mul_ps(sinq_SSE2, theta_SSE2));
            dccf_SSE3 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE3),
                                   _mm_mul_ps(sinq_SSE3, theta_SSE3));
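
            /* Note (added): this evaluates the Still close-contact function:
             * theta = pi*P5*min(r^2/(ri+rj)^2, 1/P5), i.e. clamped at pi,
             * term = (1 - cos(theta))/2, ccf = term^2, and
             * dccf = 2*term*sin(theta)*theta is its chain-rule companion
             * used in the icf6 derivative terms stored below.
             */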
            prod_SSE  = _mm_mul_ps(still_p4_SSE, vaj_SSE);
            icf4_SSE0 = _mm_mul_ps(ccf_SSE0, idr4_SSE0);
            icf4_SSE1 = _mm_mul_ps(ccf_SSE1, idr4_SSE1);
            icf4_SSE2 = _mm_mul_ps(ccf_SSE2, idr4_SSE2);
            icf4_SSE3 = _mm_mul_ps(ccf_SSE3, idr4_SSE3);
            icf6_SSE0 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
            icf6_SSE1 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);
            icf6_SSE2 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE2), dccf_SSE2), idr6_SSE2);
            icf6_SSE3 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE3), dccf_SSE3), idr6_SSE3);

            _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
                                            gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0, icf4_SSE0),
                                                           _mm_mul_ps(prod_ai_SSE1, icf4_SSE1),
                                                           _mm_mul_ps(prod_ai_SSE2, icf4_SSE2),
                                                           _mm_mul_ps(prod_ai_SSE3, icf4_SSE3))));
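
            /* Note (added): gmx_mm_sum4_ps adds the four vectors lane-wise,
             * so this single aligned store accumulates the aj-side Still
             * contributions from all four i atoms into work[j..j+3].
             */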
            gpi_SSE0 = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE, icf4_SSE0));
            gpi_SSE1 = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE, icf4_SSE1));
            gpi_SSE2 = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE, icf4_SSE2));
            gpi_SSE3 = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE, icf4_SSE3));

            /* Save ai->aj and aj->ai chain rule terms */
            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE0));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE1));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE2));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE3));
            dadx += 4;

            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE0, icf6_SSE0));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE1, icf6_SSE1));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE2, icf6_SSE2));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE3, icf6_SSE3));
            dadx += 4;
        /* Main part, no exclusions */
        for (j = nj1; j < nj2; j += UNROLLJ)
            /* load j atom coordinates */
            jx_SSE = _mm_load_ps(x_align+j);
            jy_SSE = _mm_load_ps(y_align+j);
            jz_SSE = _mm_load_ps(z_align+j);

            /* Calculate distance */
            dx_SSE0 = _mm_sub_ps(ix_SSE0, jx_SSE);
            dy_SSE0 = _mm_sub_ps(iy_SSE0, jy_SSE);
            dz_SSE0 = _mm_sub_ps(iz_SSE0, jz_SSE);
            dx_SSE1 = _mm_sub_ps(ix_SSE1, jx_SSE);
            dy_SSE1 = _mm_sub_ps(iy_SSE1, jy_SSE);
            dz_SSE1 = _mm_sub_ps(iz_SSE1, jz_SSE);
            dx_SSE2 = _mm_sub_ps(ix_SSE2, jx_SSE);
            dy_SSE2 = _mm_sub_ps(iy_SSE2, jy_SSE);
            dz_SSE2 = _mm_sub_ps(iz_SSE2, jz_SSE);
            dx_SSE3 = _mm_sub_ps(ix_SSE3, jx_SSE);
            dy_SSE3 = _mm_sub_ps(iy_SSE3, jy_SSE);
            dz_SSE3 = _mm_sub_ps(iz_SSE3, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
            rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
            rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0);
            rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1);
            rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2);
            rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3);

            rinv_SSE0 = _mm_and_ps(rinv_SSE0, imask_SSE0);
            rinv_SSE1 = _mm_and_ps(rinv_SSE1, imask_SSE1);
            rinv_SSE2 = _mm_and_ps(rinv_SSE2, imask_SSE2);
            rinv_SSE3 = _mm_and_ps(rinv_SSE3, imask_SSE3);

            irsq_SSE0 = _mm_mul_ps(rinv_SSE0, rinv_SSE0);
            irsq_SSE1 = _mm_mul_ps(rinv_SSE1, rinv_SSE1);
            irsq_SSE2 = _mm_mul_ps(rinv_SSE2, rinv_SSE2);
            irsq_SSE3 = _mm_mul_ps(rinv_SSE3, rinv_SSE3);
            idr4_SSE0 = _mm_mul_ps(irsq_SSE0, irsq_SSE0);
            idr4_SSE1 = _mm_mul_ps(irsq_SSE1, irsq_SSE1);
            idr4_SSE2 = _mm_mul_ps(irsq_SSE2, irsq_SSE2);
            idr4_SSE3 = _mm_mul_ps(irsq_SSE3, irsq_SSE3);
            idr6_SSE0 = _mm_mul_ps(idr4_SSE0, irsq_SSE0);
            idr6_SSE1 = _mm_mul_ps(idr4_SSE1, irsq_SSE1);
            idr6_SSE2 = _mm_mul_ps(idr4_SSE2, irsq_SSE2);
            idr6_SSE3 = _mm_mul_ps(idr4_SSE3, irsq_SSE3);

            raj_SSE = _mm_load_ps(gb_radius+j);

            rvdw_SSE0 = _mm_add_ps(rai_SSE0, raj_SSE);
            rvdw_SSE1 = _mm_add_ps(rai_SSE1, raj_SSE);
            rvdw_SSE2 = _mm_add_ps(rai_SSE2, raj_SSE);
            rvdw_SSE3 = _mm_add_ps(rai_SSE3, raj_SSE);
            vaj_SSE   = _mm_load_ps(vsolv+j);

            ratio_SSE0 = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0, rvdw_SSE0)));
            ratio_SSE1 = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1, rvdw_SSE1)));
            ratio_SSE2 = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2, rvdw_SSE2)));
            ratio_SSE3 = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3, rvdw_SSE3)));

            ratio_SSE0 = _mm_min_ps(ratio_SSE0, still_p5inv_SSE);
            ratio_SSE1 = _mm_min_ps(ratio_SSE1, still_p5inv_SSE);
            ratio_SSE2 = _mm_min_ps(ratio_SSE2, still_p5inv_SSE);
            ratio_SSE3 = _mm_min_ps(ratio_SSE3, still_p5inv_SSE);
            theta_SSE0 = _mm_mul_ps(ratio_SSE0, still_pip5_SSE);
            theta_SSE1 = _mm_mul_ps(ratio_SSE1, still_pip5_SSE);
            theta_SSE2 = _mm_mul_ps(ratio_SSE2, still_pip5_SSE);
            theta_SSE3 = _mm_mul_ps(ratio_SSE3, still_pip5_SSE);
            gmx_mm_sincos_ps(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
            gmx_mm_sincos_ps(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
            gmx_mm_sincos_ps(theta_SSE2, &sinq_SSE2, &cosq_SSE2);
            gmx_mm_sincos_ps(theta_SSE3, &sinq_SSE3, &cosq_SSE3);
            term_SSE0 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE0));
            term_SSE1 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE1));
            term_SSE2 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE2));
            term_SSE3 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE3));
            ccf_SSE0  = _mm_mul_ps(term_SSE0, term_SSE0);
            ccf_SSE1  = _mm_mul_ps(term_SSE1, term_SSE1);
            ccf_SSE2  = _mm_mul_ps(term_SSE2, term_SSE2);
            ccf_SSE3  = _mm_mul_ps(term_SSE3, term_SSE3);
            dccf_SSE0 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE0),
                                   _mm_mul_ps(sinq_SSE0, theta_SSE0));
            dccf_SSE1 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE1),
                                   _mm_mul_ps(sinq_SSE1, theta_SSE1));
            dccf_SSE2 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE2),
                                   _mm_mul_ps(sinq_SSE2, theta_SSE2));
            dccf_SSE3 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE3),
                                   _mm_mul_ps(sinq_SSE3, theta_SSE3));

            prod_SSE  = _mm_mul_ps(still_p4_SSE, vaj_SSE);
            icf4_SSE0 = _mm_mul_ps(ccf_SSE0, idr4_SSE0);
            icf4_SSE1 = _mm_mul_ps(ccf_SSE1, idr4_SSE1);
            icf4_SSE2 = _mm_mul_ps(ccf_SSE2, idr4_SSE2);
            icf4_SSE3 = _mm_mul_ps(ccf_SSE3, idr4_SSE3);
            icf6_SSE0 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
            icf6_SSE1 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);
            icf6_SSE2 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE2), dccf_SSE2), idr6_SSE2);
            icf6_SSE3 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE3), dccf_SSE3), idr6_SSE3);

            _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
                                            gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0, icf4_SSE0),
                                                           _mm_mul_ps(prod_ai_SSE1, icf4_SSE1),
                                                           _mm_mul_ps(prod_ai_SSE2, icf4_SSE2),
                                                           _mm_mul_ps(prod_ai_SSE3, icf4_SSE3))));

            gpi_SSE0 = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE, icf4_SSE0));
            gpi_SSE1 = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE, icf4_SSE1));
            gpi_SSE2 = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE, icf4_SSE2));
            gpi_SSE3 = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE, icf4_SSE3));

            /* Save ai->aj and aj->ai chain rule terms */
            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE0));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE1));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE2));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE3));
            dadx += 4;

            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE0, icf6_SSE0));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE1, icf6_SSE1));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE2, icf6_SSE2));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE3, icf6_SSE3));
            dadx += 4;
        /* Epilogue part, including exclusion mask */
        for (j = nj2; j < nj3; j += UNROLLJ)
            jmask_SSE0 = _mm_load_ps((real *)emask0);
            jmask_SSE1 = _mm_load_ps((real *)emask1);
            jmask_SSE2 = _mm_load_ps((real *)emask2);
            jmask_SSE3 = _mm_load_ps((real *)emask3);

            /* load j atom coordinates */
            jx_SSE = _mm_load_ps(x_align+j);
            jy_SSE = _mm_load_ps(y_align+j);
            jz_SSE = _mm_load_ps(z_align+j);

            /* Calculate distance */
            dx_SSE0 = _mm_sub_ps(ix_SSE0, jx_SSE);
            dy_SSE0 = _mm_sub_ps(iy_SSE0, jy_SSE);
            dz_SSE0 = _mm_sub_ps(iz_SSE0, jz_SSE);
            dx_SSE1 = _mm_sub_ps(ix_SSE1, jx_SSE);
            dy_SSE1 = _mm_sub_ps(iy_SSE1, jy_SSE);
            dz_SSE1 = _mm_sub_ps(iz_SSE1, jz_SSE);
            dx_SSE2 = _mm_sub_ps(ix_SSE2, jx_SSE);
            dy_SSE2 = _mm_sub_ps(iy_SSE2, jy_SSE);
            dz_SSE2 = _mm_sub_ps(iz_SSE2, jz_SSE);
            dx_SSE3 = _mm_sub_ps(ix_SSE3, jx_SSE);
            dy_SSE3 = _mm_sub_ps(iy_SSE3, jy_SSE);
            dz_SSE3 = _mm_sub_ps(iz_SSE3, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
            rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
            rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);

            jmask_SSE0 = _mm_and_ps(jmask_SSE0, imask_SSE0);
            jmask_SSE1 = _mm_and_ps(jmask_SSE1, imask_SSE1);
            jmask_SSE2 = _mm_and_ps(jmask_SSE2, imask_SSE2);
            jmask_SSE3 = _mm_and_ps(jmask_SSE3, imask_SSE3);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0);
            rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1);
            rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2);
            rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3);

            rinv_SSE0 = _mm_and_ps(rinv_SSE0, jmask_SSE0);
            rinv_SSE1 = _mm_and_ps(rinv_SSE1, jmask_SSE1);
            rinv_SSE2 = _mm_and_ps(rinv_SSE2, jmask_SSE2);
            rinv_SSE3 = _mm_and_ps(rinv_SSE3, jmask_SSE3);

            irsq_SSE0 = _mm_mul_ps(rinv_SSE0, rinv_SSE0);
            irsq_SSE1 = _mm_mul_ps(rinv_SSE1, rinv_SSE1);
            irsq_SSE2 = _mm_mul_ps(rinv_SSE2, rinv_SSE2);
            irsq_SSE3 = _mm_mul_ps(rinv_SSE3, rinv_SSE3);
            idr4_SSE0 = _mm_mul_ps(irsq_SSE0, irsq_SSE0);
            idr4_SSE1 = _mm_mul_ps(irsq_SSE1, irsq_SSE1);
            idr4_SSE2 = _mm_mul_ps(irsq_SSE2, irsq_SSE2);
            idr4_SSE3 = _mm_mul_ps(irsq_SSE3, irsq_SSE3);
            idr6_SSE0 = _mm_mul_ps(idr4_SSE0, irsq_SSE0);
            idr6_SSE1 = _mm_mul_ps(idr4_SSE1, irsq_SSE1);
            idr6_SSE2 = _mm_mul_ps(idr4_SSE2, irsq_SSE2);
            idr6_SSE3 = _mm_mul_ps(idr4_SSE3, irsq_SSE3);

            raj_SSE = _mm_load_ps(gb_radius+j);
            vaj_SSE = _mm_load_ps(vsolv+j);

            rvdw_SSE0 = _mm_add_ps(rai_SSE0, raj_SSE);
            rvdw_SSE1 = _mm_add_ps(rai_SSE1, raj_SSE);
            rvdw_SSE2 = _mm_add_ps(rai_SSE2, raj_SSE);
            rvdw_SSE3 = _mm_add_ps(rai_SSE3, raj_SSE);

            ratio_SSE0 = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0, rvdw_SSE0)));
            ratio_SSE1 = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1, rvdw_SSE1)));
            ratio_SSE2 = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2, rvdw_SSE2)));
            ratio_SSE3 = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3, rvdw_SSE3)));

            ratio_SSE0 = _mm_min_ps(ratio_SSE0, still_p5inv_SSE);
            ratio_SSE1 = _mm_min_ps(ratio_SSE1, still_p5inv_SSE);
            ratio_SSE2 = _mm_min_ps(ratio_SSE2, still_p5inv_SSE);
            ratio_SSE3 = _mm_min_ps(ratio_SSE3, still_p5inv_SSE);
            theta_SSE0 = _mm_mul_ps(ratio_SSE0, still_pip5_SSE);
            theta_SSE1 = _mm_mul_ps(ratio_SSE1, still_pip5_SSE);
            theta_SSE2 = _mm_mul_ps(ratio_SSE2, still_pip5_SSE);
            theta_SSE3 = _mm_mul_ps(ratio_SSE3, still_pip5_SSE);
            gmx_mm_sincos_ps(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
            gmx_mm_sincos_ps(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
            gmx_mm_sincos_ps(theta_SSE2, &sinq_SSE2, &cosq_SSE2);
            gmx_mm_sincos_ps(theta_SSE3, &sinq_SSE3, &cosq_SSE3);
            term_SSE0 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE0));
            term_SSE1 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE1));
            term_SSE2 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE2));
            term_SSE3 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE3));
            ccf_SSE0  = _mm_mul_ps(term_SSE0, term_SSE0);
            ccf_SSE1  = _mm_mul_ps(term_SSE1, term_SSE1);
            ccf_SSE2  = _mm_mul_ps(term_SSE2, term_SSE2);
            ccf_SSE3  = _mm_mul_ps(term_SSE3, term_SSE3);
            dccf_SSE0 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE0),
                                   _mm_mul_ps(sinq_SSE0, theta_SSE0));
            dccf_SSE1 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE1),
                                   _mm_mul_ps(sinq_SSE1, theta_SSE1));
            dccf_SSE2 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE2),
                                   _mm_mul_ps(sinq_SSE2, theta_SSE2));
            dccf_SSE3 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE3),
                                   _mm_mul_ps(sinq_SSE3, theta_SSE3));

            prod_SSE  = _mm_mul_ps(still_p4_SSE, vaj_SSE);
            icf4_SSE0 = _mm_mul_ps(ccf_SSE0, idr4_SSE0);
            icf4_SSE1 = _mm_mul_ps(ccf_SSE1, idr4_SSE1);
            icf4_SSE2 = _mm_mul_ps(ccf_SSE2, idr4_SSE2);
            icf4_SSE3 = _mm_mul_ps(ccf_SSE3, idr4_SSE3);
            icf6_SSE0 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
            icf6_SSE1 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);
            icf6_SSE2 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE2), dccf_SSE2), idr6_SSE2);
            icf6_SSE3 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE3), dccf_SSE3), idr6_SSE3);

            _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
                                            gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0, icf4_SSE0),
                                                           _mm_mul_ps(prod_ai_SSE1, icf4_SSE1),
                                                           _mm_mul_ps(prod_ai_SSE2, icf4_SSE2),
                                                           _mm_mul_ps(prod_ai_SSE3, icf4_SSE3))));

            gpi_SSE0 = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE, icf4_SSE0));
            gpi_SSE1 = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE, icf4_SSE1));
            gpi_SSE2 = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE, icf4_SSE2));
            gpi_SSE3 = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE, icf4_SSE3));

            /* Save ai->aj and aj->ai chain rule terms */
            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE0));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE1));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE2));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE3));
            dadx += 4;

            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE0, icf6_SSE0));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE1, icf6_SSE1));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE2, icf6_SSE2));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE3, icf6_SSE3));
            dadx += 4;

        _MM_TRANSPOSE4_PS(gpi_SSE0, gpi_SSE1, gpi_SSE2, gpi_SSE3);
        gpi_SSE0 = _mm_add_ps(gpi_SSE0, gpi_SSE1);
        gpi_SSE2 = _mm_add_ps(gpi_SSE2, gpi_SSE3);
        gpi_SSE0 = _mm_add_ps(gpi_SSE0, gpi_SSE2);
        _mm_store_ps(work+i, _mm_add_ps(gpi_SSE0, _mm_load_ps(work+i)));
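
        /* Note (added): the 4x4 transpose turns the four per-lane partial
         * sums into lanes of one vector, so three adds yield the complete
         * j-sum for each of the four i atoms, accumulated into work[i..i+3].
         */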
    /* In case we have written anything beyond natoms, move it back.
     * Never mind that we leave stuff above natoms; that will not
     * be accessed later in the routine.
     * In principle this should be a move rather than sum, but this
     * way we don't have to worry about even/odd offsets...
     */
    for (i = natoms; i < ni1+1+natoms/2; i++)
        work[i-natoms] += work[i];

    /* Parallel summations would go here if ever implemented with DD */

    factor = 0.5 * ONE_4PI_EPS0;
    /* Calculate the radii - should we do all atoms, or just our local ones? */
    for (i = 0; i < natoms; i++)
        if (born->use[i] != 0)
            gpi             = born->gpol[i]+work[i];
            gpi2            = gpi*gpi;
            born->bRad[i]   = factor*gmx_invsqrt(gpi2);
            fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
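
            /* Note (added): gpi is the accumulated Still polarization sum,
             * so bRad = factor/|gpi| with factor = ONE_4PI_EPS0/2; invsqrta
             * caches 1/sqrt(bRad) for the GB force kernels.
             */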
genborn_allvsall_calc_hct_obc_radii_sse2_single(t_forcerec *    fr,
                                                t_mdatoms *     mdatoms,
                                                gmx_genborn_t * born,
                                                gmx_localtop_t * top,
    gmx_allvsallgb2_data_t *aadata;
    int                     nj0, nj1, nj2, nj3;
    real                    rai, rai_inv, rai_inv2, sum_ai, sum_ai2, sum_ai3, tsum, tchain;

    __m128                  ix_SSE0, iy_SSE0, iz_SSE0;
    __m128                  ix_SSE1, iy_SSE1, iz_SSE1;
    __m128                  ix_SSE2, iy_SSE2, iz_SSE2;
    __m128                  ix_SSE3, iy_SSE3, iz_SSE3;
    __m128                  gpi_SSE0, rai_SSE0, prod_ai_SSE0;
    __m128                  gpi_SSE1, rai_SSE1, prod_ai_SSE1;
    __m128                  gpi_SSE2, rai_SSE2, prod_ai_SSE2;
    __m128                  gpi_SSE3, rai_SSE3, prod_ai_SSE3;
    __m128                  imask_SSE0, jmask_SSE0;
    __m128                  imask_SSE1, jmask_SSE1;
    __m128                  imask_SSE2, jmask_SSE2;
    __m128                  imask_SSE3, jmask_SSE3;
    __m128                  jx_SSE, jy_SSE, jz_SSE;
    __m128                  dx_SSE0, dy_SSE0, dz_SSE0;
    __m128                  dx_SSE1, dy_SSE1, dz_SSE1;
    __m128                  dx_SSE2, dy_SSE2, dz_SSE2;
    __m128                  dx_SSE3, dy_SSE3, dz_SSE3;
    __m128                  rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0;
    __m128                  rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1;
    __m128                  rsq_SSE2, rinv_SSE2, irsq_SSE2, idr4_SSE2, idr6_SSE2;
    __m128                  rsq_SSE3, rinv_SSE3, irsq_SSE3, idr4_SSE3, idr6_SSE3;
    __m128                  raj_SSE, raj_inv_SSE, sk_aj_SSE, sk2_aj_SSE;
    __m128                  ccf_SSE0, dccf_SSE0, prod_SSE0;
    __m128                  ccf_SSE1, dccf_SSE1, prod_SSE1;
    __m128                  ccf_SSE2, dccf_SSE2, prod_SSE2;
    __m128                  ccf_SSE3, dccf_SSE3, prod_SSE3;
    __m128                  icf4_SSE0, icf6_SSE0;
    __m128                  icf4_SSE1, icf6_SSE1;
    __m128                  icf4_SSE2, icf6_SSE2;
    __m128                  icf4_SSE3, icf6_SSE3;
    __m128                  oneeighth_SSE, onefourth_SSE, half_SSE, one_SSE, two_SSE, four_SSE;
    __m128                  still_p4_SSE, still_p5inv_SSE, still_pip5_SSE;
    __m128                  rai_inv_SSE0;
    __m128                  rai_inv_SSE1;
    __m128                  rai_inv_SSE2;
    __m128                  rai_inv_SSE3;
    __m128                  sk_ai_SSE0, sk2_ai_SSE0, sum_ai_SSE0;
    __m128                  sk_ai_SSE1, sk2_ai_SSE1, sum_ai_SSE1;
    __m128                  sk_ai_SSE2, sk2_ai_SSE2, sum_ai_SSE2;
    __m128                  sk_ai_SSE3, sk2_ai_SSE3, sum_ai_SSE3;
    __m128                  lij_inv_SSE0, sk2_rinv_SSE0;
    __m128                  lij_inv_SSE1, sk2_rinv_SSE1;
    __m128                  lij_inv_SSE2, sk2_rinv_SSE2;
    __m128                  lij_inv_SSE3, sk2_rinv_SSE3;
    __m128                  t1_SSE0, t2_SSE0, t3_SSE0, t4_SSE0;
    __m128                  t1_SSE1, t2_SSE1, t3_SSE1, t4_SSE1;
    __m128                  t1_SSE2, t2_SSE2, t3_SSE2, t4_SSE2;
    __m128                  t1_SSE3, t2_SSE3, t3_SSE3, t4_SSE3;
    __m128                  obc_mask1_SSE0, obc_mask2_SSE0, obc_mask3_SSE0;
    __m128                  obc_mask1_SSE1, obc_mask2_SSE1, obc_mask3_SSE1;
    __m128                  obc_mask1_SSE2, obc_mask2_SSE2, obc_mask3_SSE2;
    __m128                  obc_mask1_SSE3, obc_mask2_SSE3, obc_mask3_SSE3;
    __m128                  uij_SSE0, uij2_SSE0, uij3_SSE0;
    __m128                  uij_SSE1, uij2_SSE1, uij3_SSE1;
    __m128                  uij_SSE2, uij2_SSE2, uij3_SSE2;
    __m128                  uij_SSE3, uij2_SSE3, uij3_SSE3;
    __m128                  lij_SSE0, lij2_SSE0, lij3_SSE0;
    __m128                  lij_SSE1, lij2_SSE1, lij3_SSE1;
    __m128                  lij_SSE2, lij2_SSE2, lij3_SSE2;
    __m128                  lij_SSE3, lij2_SSE3, lij3_SSE3;
    __m128                  dlij_SSE0, diff2_SSE0, logterm_SSE0;
    __m128                  dlij_SSE1, diff2_SSE1, logterm_SSE1;
    __m128                  dlij_SSE2, diff2_SSE2, logterm_SSE2;
    __m128                  dlij_SSE3, diff2_SSE3, logterm_SSE3;
    natoms = mdatoms->nr;
    ni1    = mdatoms->homenr;

    aadata = *((gmx_allvsallgb2_data_t **)paadata);

        genborn_allvsall_setup(&aadata, top, born, mdatoms, born->gb_doffset,
                               egbOBC, TRUE, TRUE, TRUE);
        *((gmx_allvsallgb2_data_t **)paadata) = aadata;

    x_align = aadata->x_align;
    y_align = aadata->y_align;
    z_align = aadata->z_align;

    gb_radius = aadata->gb_radius;
    work      = aadata->work;
    jindex    = aadata->jindex_gb;

    obc_param = aadata->workparam;

    oneeighth_SSE = _mm_set1_ps(0.125);
    onefourth_SSE = _mm_set1_ps(0.25);
    half_SSE      = _mm_set1_ps(0.5);
    one_SSE       = _mm_set1_ps(1.0);
    two_SSE       = _mm_set1_ps(2.0);
    four_SSE      = _mm_set1_ps(4.0);
    doffset_SSE   = _mm_set1_ps(born->gb_doffset);

    for (i = 0; i < natoms; i++)
        x_align[i] = x[3*i];
        y_align[i] = x[3*i+1];
        z_align[i] = x[3*i+2];

    for (i = 0; i < natoms/2+1; i++)
        x_align[natoms+i] = x_align[i];
        y_align[natoms+i] = y_align[i];
        z_align[natoms+i] = z_align[i];

    for (i = 0; i < natoms+natoms/2+1; i++)
        work[i] = 0;
    for (i = ni0; i < ni1; i += UNROLLI)
        /* We assume shifts are NOT used for all-vs-all interactions */

        /* Load i atom data */
        ix_SSE0 = _mm_load1_ps(x_align+i);
        iy_SSE0 = _mm_load1_ps(y_align+i);
        iz_SSE0 = _mm_load1_ps(z_align+i);
        ix_SSE1 = _mm_load1_ps(x_align+i+1);
        iy_SSE1 = _mm_load1_ps(y_align+i+1);
        iz_SSE1 = _mm_load1_ps(z_align+i+1);
        ix_SSE2 = _mm_load1_ps(x_align+i+2);
        iy_SSE2 = _mm_load1_ps(y_align+i+2);
        iz_SSE2 = _mm_load1_ps(z_align+i+2);
        ix_SSE3 = _mm_load1_ps(x_align+i+3);
        iy_SSE3 = _mm_load1_ps(y_align+i+3);
        iz_SSE3 = _mm_load1_ps(z_align+i+3);

        rai_SSE0     = _mm_load1_ps(gb_radius+i);
        rai_SSE1     = _mm_load1_ps(gb_radius+i+1);
        rai_SSE2     = _mm_load1_ps(gb_radius+i+2);
        rai_SSE3     = _mm_load1_ps(gb_radius+i+3);
        rai_inv_SSE0 = gmx_mm_inv_ps(rai_SSE0);
        rai_inv_SSE1 = gmx_mm_inv_ps(rai_SSE1);
        rai_inv_SSE2 = gmx_mm_inv_ps(rai_SSE2);
        rai_inv_SSE3 = gmx_mm_inv_ps(rai_SSE3);

        sk_ai_SSE0  = _mm_load1_ps(obc_param+i);
        sk_ai_SSE1  = _mm_load1_ps(obc_param+i+1);
        sk_ai_SSE2  = _mm_load1_ps(obc_param+i+2);
        sk_ai_SSE3  = _mm_load1_ps(obc_param+i+3);
        sk2_ai_SSE0 = _mm_mul_ps(sk_ai_SSE0, sk_ai_SSE0);
        sk2_ai_SSE1 = _mm_mul_ps(sk_ai_SSE1, sk_ai_SSE1);
        sk2_ai_SSE2 = _mm_mul_ps(sk_ai_SSE2, sk_ai_SSE2);
        sk2_ai_SSE3 = _mm_mul_ps(sk_ai_SSE3, sk_ai_SSE3);

        sum_ai_SSE0 = _mm_setzero_ps();
        sum_ai_SSE1 = _mm_setzero_ps();
        sum_ai_SSE2 = _mm_setzero_ps();
        sum_ai_SSE3 = _mm_setzero_ps();

        /* Load limits for loop over neighbors */
        nj0 = jindex[4*i];
        nj1 = jindex[4*i+1];
        nj2 = jindex[4*i+2];
        nj3 = jindex[4*i+3];

        pmask0 = aadata->prologue_mask_gb[i];
        pmask1 = aadata->prologue_mask_gb[i+1];
        pmask2 = aadata->prologue_mask_gb[i+2];
        pmask3 = aadata->prologue_mask_gb[i+3];
        emask0 = aadata->epilogue_mask[i];
        emask1 = aadata->epilogue_mask[i+1];
        emask2 = aadata->epilogue_mask[i+2];
        emask3 = aadata->epilogue_mask[i+3];

        imask_SSE0 = _mm_load1_ps((real *)(aadata->imask+i));
        imask_SSE1 = _mm_load1_ps((real *)(aadata->imask+i+1));
        imask_SSE2 = _mm_load1_ps((real *)(aadata->imask+i+2));
        imask_SSE3 = _mm_load1_ps((real *)(aadata->imask+i+3));
        /* Prologue part, including exclusion mask */
        for (j = nj0; j < nj1; j += UNROLLJ)
            jmask_SSE0 = _mm_load_ps((real *)pmask0);
            jmask_SSE1 = _mm_load_ps((real *)pmask1);
            jmask_SSE2 = _mm_load_ps((real *)pmask2);
            jmask_SSE3 = _mm_load_ps((real *)pmask3);

            /* load j atom coordinates */
            jx_SSE = _mm_load_ps(x_align+j);
            jy_SSE = _mm_load_ps(y_align+j);
            jz_SSE = _mm_load_ps(z_align+j);

            /* Calculate distance */
            dx_SSE0 = _mm_sub_ps(ix_SSE0, jx_SSE);
            dy_SSE0 = _mm_sub_ps(iy_SSE0, jy_SSE);
            dz_SSE0 = _mm_sub_ps(iz_SSE0, jz_SSE);
            dx_SSE1 = _mm_sub_ps(ix_SSE1, jx_SSE);
            dy_SSE1 = _mm_sub_ps(iy_SSE1, jy_SSE);
            dz_SSE1 = _mm_sub_ps(iz_SSE1, jz_SSE);
            dx_SSE2 = _mm_sub_ps(ix_SSE2, jx_SSE);
            dy_SSE2 = _mm_sub_ps(iy_SSE2, jy_SSE);
            dz_SSE2 = _mm_sub_ps(iz_SSE2, jz_SSE);
            dx_SSE3 = _mm_sub_ps(ix_SSE3, jx_SSE);
            dy_SSE3 = _mm_sub_ps(iy_SSE3, jy_SSE);
            dz_SSE3 = _mm_sub_ps(iz_SSE3, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
            rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
            rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);

            jmask_SSE0 = _mm_and_ps(jmask_SSE0, imask_SSE0);
            jmask_SSE1 = _mm_and_ps(jmask_SSE1, imask_SSE1);
            jmask_SSE2 = _mm_and_ps(jmask_SSE2, imask_SSE2);
            jmask_SSE3 = _mm_and_ps(jmask_SSE3, imask_SSE3);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0);
            rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1);
            rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2);
            rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3);

            rinv_SSE0 = _mm_and_ps(rinv_SSE0, jmask_SSE0);
            rinv_SSE1 = _mm_and_ps(rinv_SSE1, jmask_SSE1);
            rinv_SSE2 = _mm_and_ps(rinv_SSE2, jmask_SSE2);
            rinv_SSE3 = _mm_and_ps(rinv_SSE3, jmask_SSE3);

            dr_SSE0 = _mm_mul_ps(rsq_SSE0, rinv_SSE0);
            dr_SSE1 = _mm_mul_ps(rsq_SSE1, rinv_SSE1);
            dr_SSE2 = _mm_mul_ps(rsq_SSE2, rinv_SSE2);
            dr_SSE3 = _mm_mul_ps(rsq_SSE3, rinv_SSE3);

            sk_aj_SSE   = _mm_load_ps(obc_param+j);
            raj_SSE     = _mm_load_ps(gb_radius+j);
            raj_inv_SSE = gmx_mm_inv_ps(raj_SSE);

            /* Evaluate influence of atom aj -> ai */
            t1_SSE0 = _mm_add_ps(dr_SSE0, sk_aj_SSE);
            t1_SSE1 = _mm_add_ps(dr_SSE1, sk_aj_SSE);
            t1_SSE2 = _mm_add_ps(dr_SSE2, sk_aj_SSE);
            t1_SSE3 = _mm_add_ps(dr_SSE3, sk_aj_SSE);
            t2_SSE0 = _mm_sub_ps(dr_SSE0, sk_aj_SSE);
            t2_SSE1 = _mm_sub_ps(dr_SSE1, sk_aj_SSE);
            t2_SSE2 = _mm_sub_ps(dr_SSE2, sk_aj_SSE);
            t2_SSE3 = _mm_sub_ps(dr_SSE3, sk_aj_SSE);
            t3_SSE0 = _mm_sub_ps(sk_aj_SSE, dr_SSE0);
            t3_SSE1 = _mm_sub_ps(sk_aj_SSE, dr_SSE1);
            t3_SSE2 = _mm_sub_ps(sk_aj_SSE, dr_SSE2);
            t3_SSE3 = _mm_sub_ps(sk_aj_SSE, dr_SSE3);

            obc_mask1_SSE0 = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
            obc_mask1_SSE1 = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
            obc_mask1_SSE2 = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
            obc_mask1_SSE3 = _mm_cmplt_ps(rai_SSE3, t1_SSE3);
            obc_mask2_SSE0 = _mm_cmplt_ps(rai_SSE0, t2_SSE0);
            obc_mask2_SSE1 = _mm_cmplt_ps(rai_SSE1, t2_SSE1);
            obc_mask2_SSE2 = _mm_cmplt_ps(rai_SSE2, t2_SSE2);
            obc_mask2_SSE3 = _mm_cmplt_ps(rai_SSE3, t2_SSE3);
            obc_mask3_SSE0 = _mm_cmplt_ps(rai_SSE0, t3_SSE0);
            obc_mask3_SSE1 = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
            obc_mask3_SSE2 = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
            obc_mask3_SSE3 = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
            obc_mask1_SSE0 = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0);
            obc_mask1_SSE1 = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1);
            obc_mask1_SSE2 = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2);
            obc_mask1_SSE3 = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3);
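
            /* Note (added): with t1 = r+sk, t2 = r-sk and t3 = sk-r, these
             * masks classify the HCT overlap cases: obc_mask1 selects pairs
             * that descreen at all (rai < r+sk), obc_mask2 pairs where atom
             * i lies fully outside the descreening sphere (rai < r-sk, so
             * the lower bound is 1/(r-sk)), and obc_mask3 pairs where i is
             * engulfed (rai < sk-r), which adds the 2*(1/rai - lij)
             * correction below.
             */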
            uij_SSE0 = gmx_mm_inv_ps(t1_SSE0);
            uij_SSE1 = gmx_mm_inv_ps(t1_SSE1);
            uij_SSE2 = gmx_mm_inv_ps(t1_SSE2);
            uij_SSE3 = gmx_mm_inv_ps(t1_SSE3);
            lij_SSE0 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
                                  _mm_andnot_ps(obc_mask2_SSE0, rai_inv_SSE0));
            lij_SSE1 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
                                  _mm_andnot_ps(obc_mask2_SSE1, rai_inv_SSE1));
            lij_SSE2 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
                                  _mm_andnot_ps(obc_mask2_SSE2, rai_inv_SSE2));
            lij_SSE3 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
                                  _mm_andnot_ps(obc_mask2_SSE3, rai_inv_SSE3));
            dlij_SSE0 = _mm_and_ps(one_SSE, obc_mask2_SSE0);
            dlij_SSE1 = _mm_and_ps(one_SSE, obc_mask2_SSE1);
            dlij_SSE2 = _mm_and_ps(one_SSE, obc_mask2_SSE2);
            dlij_SSE3 = _mm_and_ps(one_SSE, obc_mask2_SSE3);

            uij2_SSE0 = _mm_mul_ps(uij_SSE0, uij_SSE0);
            uij2_SSE1 = _mm_mul_ps(uij_SSE1, uij_SSE1);
            uij2_SSE2 = _mm_mul_ps(uij_SSE2, uij_SSE2);
            uij2_SSE3 = _mm_mul_ps(uij_SSE3, uij_SSE3);
            uij3_SSE0 = _mm_mul_ps(uij2_SSE0, uij_SSE0);
            uij3_SSE1 = _mm_mul_ps(uij2_SSE1, uij_SSE1);
            uij3_SSE2 = _mm_mul_ps(uij2_SSE2, uij_SSE2);
            uij3_SSE3 = _mm_mul_ps(uij2_SSE3, uij_SSE3);
            lij2_SSE0 = _mm_mul_ps(lij_SSE0, lij_SSE0);
            lij2_SSE1 = _mm_mul_ps(lij_SSE1, lij_SSE1);
            lij2_SSE2 = _mm_mul_ps(lij_SSE2, lij_SSE2);
            lij2_SSE3 = _mm_mul_ps(lij_SSE3, lij_SSE3);
            lij3_SSE0 = _mm_mul_ps(lij2_SSE0, lij_SSE0);
            lij3_SSE1 = _mm_mul_ps(lij2_SSE1, lij_SSE1);
            lij3_SSE2 = _mm_mul_ps(lij2_SSE2, lij_SSE2);
            lij3_SSE3 = _mm_mul_ps(lij2_SSE3, lij_SSE3);

            diff2_SSE0 = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
            diff2_SSE1 = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
            diff2_SSE2 = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
            diff2_SSE3 = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
            lij_inv_SSE0 = gmx_mm_invsqrt_ps(lij2_SSE0);
            lij_inv_SSE1 = gmx_mm_invsqrt_ps(lij2_SSE1);
            lij_inv_SSE2 = gmx_mm_invsqrt_ps(lij2_SSE2);
            lij_inv_SSE3 = gmx_mm_invsqrt_ps(lij2_SSE3);
            sk2_aj_SSE    = _mm_mul_ps(sk_aj_SSE, sk_aj_SSE);
            sk2_rinv_SSE0 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE0);
            sk2_rinv_SSE1 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE1);
            sk2_rinv_SSE2 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE2);
            sk2_rinv_SSE3 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE3);
            prod_SSE0     = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
            prod_SSE1     = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
            prod_SSE2     = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
            prod_SSE3     = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);

            logterm_SSE0 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
            logterm_SSE1 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
            logterm_SSE2 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
            logterm_SSE3 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));

            t1_SSE0 = _mm_sub_ps(lij_SSE0, uij_SSE0);
            t1_SSE1 = _mm_sub_ps(lij_SSE1, uij_SSE1);
            t1_SSE2 = _mm_sub_ps(lij_SSE2, uij_SSE2);
            t1_SSE3 = _mm_sub_ps(lij_SSE3, uij_SSE3);
            t2_SSE0 = _mm_mul_ps(diff2_SSE0,
                                 _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
                                            prod_SSE0));
            t2_SSE1 = _mm_mul_ps(diff2_SSE1,
                                 _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
                                            prod_SSE1));
            t2_SSE2 = _mm_mul_ps(diff2_SSE2,
                                 _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
                                            prod_SSE2));
            t2_SSE3 = _mm_mul_ps(diff2_SSE3,
                                 _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
                                            prod_SSE3));

            t3_SSE0 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
            t3_SSE1 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
            t3_SSE2 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
            t3_SSE3 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
            t1_SSE0 = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
            t1_SSE1 = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
            t1_SSE2 = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
            t1_SSE3 = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
            t4_SSE0 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE0, lij_SSE0));
            t4_SSE1 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE1, lij_SSE1));
            t4_SSE2 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE2, lij_SSE2));
            t4_SSE3 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE3, lij_SSE3));
            t4_SSE0 = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
            t4_SSE1 = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
            t4_SSE2 = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
            t4_SSE3 = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
            t1_SSE0 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
            t1_SSE1 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
            t1_SSE2 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
            t1_SSE3 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));

            sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
            sum_ai_SSE1 = _mm_add_ps(sum_ai_SSE1, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
            sum_ai_SSE2 = _mm_add_ps(sum_ai_SSE2, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
            sum_ai_SSE3 = _mm_add_ps(sum_ai_SSE3, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
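
            /* Note (added): t1 collects the analytic HCT pair integral,
             * I_ij = lij - uij + (r/4)*(uij^2 - lij^2)
             *        + (1/(2r))*log(uij/lij) + (sk^2/(4r))*(lij^2 - uij^2)
             * (lij and uij being the inverse integration bounds), halved,
             * with the engulfed-atom correction t4 added, and accumulated
             * into sum_ai only where obc_mask1 is set.
             */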
            t1_SSE0 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
                                 _mm_mul_ps(prod_SSE0, lij3_SSE0));
            t1_SSE1 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
                                 _mm_mul_ps(prod_SSE1, lij3_SSE1));
            t1_SSE2 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
                                 _mm_mul_ps(prod_SSE2, lij3_SSE2));
            t1_SSE3 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
                                 _mm_mul_ps(prod_SSE3, lij3_SSE3));
            t1_SSE0 = _mm_sub_ps(t1_SSE0,
                                 _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
                                                       _mm_mul_ps(lij3_SSE0, dr_SSE0))));
            t1_SSE1 = _mm_sub_ps(t1_SSE1,
                                 _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
                                                       _mm_mul_ps(lij3_SSE1, dr_SSE1))));
            t1_SSE2 = _mm_sub_ps(t1_SSE2,
                                 _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
                                                       _mm_mul_ps(lij3_SSE2, dr_SSE2))));
            t1_SSE3 = _mm_sub_ps(t1_SSE3,
                                 _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
                                                       _mm_mul_ps(lij3_SSE3, dr_SSE3))));

            t2_SSE0 = _mm_mul_ps(onefourth_SSE,
                                 _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
                                            _mm_mul_ps(uij3_SSE0, dr_SSE0)));
            t2_SSE1 = _mm_mul_ps(onefourth_SSE,
                                 _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
                                            _mm_mul_ps(uij3_SSE1, dr_SSE1)));
            t2_SSE2 = _mm_mul_ps(onefourth_SSE,
                                 _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
                                            _mm_mul_ps(uij3_SSE2, dr_SSE2)));
            t2_SSE3 = _mm_mul_ps(onefourth_SSE,
                                 _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
                                            _mm_mul_ps(uij3_SSE3, dr_SSE3)));
            t2_SSE0 = _mm_sub_ps(t2_SSE0,
                                 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
                                            _mm_mul_ps(prod_SSE0, uij3_SSE0)));
            t2_SSE1 = _mm_sub_ps(t2_SSE1,
                                 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
                                            _mm_mul_ps(prod_SSE1, uij3_SSE1)));
            t2_SSE2 = _mm_sub_ps(t2_SSE2,
                                 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
                                            _mm_mul_ps(prod_SSE2, uij3_SSE2)));
            t2_SSE3 = _mm_sub_ps(t2_SSE3,
                                 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
                                            _mm_mul_ps(prod_SSE3, uij3_SSE3)));
            t3_SSE0 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
                                 _mm_mul_ps(rinv_SSE0, rinv_SSE0));
            t3_SSE1 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
                                 _mm_mul_ps(rinv_SSE1, rinv_SSE1));
            t3_SSE2 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
                                 _mm_mul_ps(rinv_SSE2, rinv_SSE2));
            t3_SSE3 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
                                 _mm_mul_ps(rinv_SSE3, rinv_SSE3));
            t3_SSE0 = _mm_sub_ps(t3_SSE0,
                                 _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
                                            _mm_add_ps(one_SSE,
                                                       _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
            t3_SSE1 = _mm_sub_ps(t3_SSE1,
                                 _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
                                            _mm_add_ps(one_SSE,
                                                       _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
            t3_SSE2 = _mm_sub_ps(t3_SSE2,
                                 _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
                                            _mm_add_ps(one_SSE,
                                                       _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
            t3_SSE3 = _mm_sub_ps(t3_SSE3,
                                 _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
                                            _mm_add_ps(one_SSE,
                                                       _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));

            t1_SSE0 = _mm_mul_ps(rinv_SSE0,
                                 _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
                                            _mm_add_ps(t2_SSE0, t3_SSE0)));
            t1_SSE1 = _mm_mul_ps(rinv_SSE1,
                                 _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
                                            _mm_add_ps(t2_SSE1, t3_SSE1)));
            t1_SSE2 = _mm_mul_ps(rinv_SSE2,
                                 _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
                                            _mm_add_ps(t2_SSE2, t3_SSE2)));
            t1_SSE3 = _mm_mul_ps(rinv_SSE3,
                                 _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
                                            _mm_add_ps(t2_SSE3, t3_SSE3)));

            _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
            dadx += 4;
1895 /* Evaluate influence of atom ai -> aj */
1896 t1_SSE0 = _mm_add_ps(dr_SSE0, sk_ai_SSE0);
1897 t1_SSE1 = _mm_add_ps(dr_SSE1, sk_ai_SSE1);
1898 t1_SSE2 = _mm_add_ps(dr_SSE2, sk_ai_SSE2);
1899 t1_SSE3 = _mm_add_ps(dr_SSE3, sk_ai_SSE3);
1900 t2_SSE0 = _mm_sub_ps(dr_SSE0, sk_ai_SSE0);
1901 t2_SSE1 = _mm_sub_ps(dr_SSE1, sk_ai_SSE1);
1902 t2_SSE2 = _mm_sub_ps(dr_SSE2, sk_ai_SSE2);
1903 t2_SSE3 = _mm_sub_ps(dr_SSE3, sk_ai_SSE3);
1904 t3_SSE0 = _mm_sub_ps(sk_ai_SSE0, dr_SSE0);
1905 t3_SSE1 = _mm_sub_ps(sk_ai_SSE1, dr_SSE1);
1906 t3_SSE2 = _mm_sub_ps(sk_ai_SSE2, dr_SSE2);
1907 t3_SSE3 = _mm_sub_ps(sk_ai_SSE3, dr_SSE3);
1909 obc_mask1_SSE0 = _mm_cmplt_ps(raj_SSE, t1_SSE0);
1910 obc_mask1_SSE1 = _mm_cmplt_ps(raj_SSE, t1_SSE1);
1911 obc_mask1_SSE2 = _mm_cmplt_ps(raj_SSE, t1_SSE2);
1912 obc_mask1_SSE3 = _mm_cmplt_ps(raj_SSE, t1_SSE3);
1913 obc_mask2_SSE0 = _mm_cmplt_ps(raj_SSE, t2_SSE0);
1914 obc_mask2_SSE1 = _mm_cmplt_ps(raj_SSE, t2_SSE1);
1915 obc_mask2_SSE2 = _mm_cmplt_ps(raj_SSE, t2_SSE2);
1916 obc_mask2_SSE3 = _mm_cmplt_ps(raj_SSE, t2_SSE3);
1917 obc_mask3_SSE0 = _mm_cmplt_ps(raj_SSE, t3_SSE0);
1918 obc_mask3_SSE1 = _mm_cmplt_ps(raj_SSE, t3_SSE1);
1919 obc_mask3_SSE2 = _mm_cmplt_ps(raj_SSE, t3_SSE2);
1920 obc_mask3_SSE3 = _mm_cmplt_ps(raj_SSE, t3_SSE3);
1921 obc_mask1_SSE0 = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0);
1922 obc_mask1_SSE1 = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1);
1923 obc_mask1_SSE2 = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2);
1924 obc_mask1_SSE3 = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3);
1926 uij_SSE0 = gmx_mm_inv_ps(t1_SSE0);
1927 uij_SSE1 = gmx_mm_inv_ps(t1_SSE1);
1928 uij_SSE2 = gmx_mm_inv_ps(t1_SSE2);
1929 uij_SSE3 = gmx_mm_inv_ps(t1_SSE3);
1930 lij_SSE0 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
1931 _mm_andnot_ps(obc_mask2_SSE0, raj_inv_SSE));
1932 lij_SSE1 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
1933 _mm_andnot_ps(obc_mask2_SSE1, raj_inv_SSE));
1934 lij_SSE2 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
1935 _mm_andnot_ps(obc_mask2_SSE2, raj_inv_SSE));
1936 lij_SSE3 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
1937 _mm_andnot_ps(obc_mask2_SSE3, raj_inv_SSE));
1938 dlij_SSE0 = _mm_and_ps(one_SSE, obc_mask2_SSE0);
1939 dlij_SSE1 = _mm_and_ps(one_SSE, obc_mask2_SSE1);
1940 dlij_SSE2 = _mm_and_ps(one_SSE, obc_mask2_SSE2);
1941 dlij_SSE3 = _mm_and_ps(one_SSE, obc_mask2_SSE3);
1943 uij2_SSE0 = _mm_mul_ps(uij_SSE0, uij_SSE0);
1944 uij2_SSE1 = _mm_mul_ps(uij_SSE1, uij_SSE1);
1945 uij2_SSE2 = _mm_mul_ps(uij_SSE2, uij_SSE2);
1946 uij2_SSE3 = _mm_mul_ps(uij_SSE3, uij_SSE3);
1947 uij3_SSE0 = _mm_mul_ps(uij2_SSE0, uij_SSE0);
1948 uij3_SSE1 = _mm_mul_ps(uij2_SSE1, uij_SSE1);
1949 uij3_SSE2 = _mm_mul_ps(uij2_SSE2, uij_SSE2);
1950 uij3_SSE3 = _mm_mul_ps(uij2_SSE3, uij_SSE3);
1951 lij2_SSE0 = _mm_mul_ps(lij_SSE0, lij_SSE0);
1952 lij2_SSE1 = _mm_mul_ps(lij_SSE1, lij_SSE1);
1953 lij2_SSE2 = _mm_mul_ps(lij_SSE2, lij_SSE2);
1954 lij2_SSE3 = _mm_mul_ps(lij_SSE3, lij_SSE3);
1955 lij3_SSE0 = _mm_mul_ps(lij2_SSE0, lij_SSE0);
1956 lij3_SSE1 = _mm_mul_ps(lij2_SSE1, lij_SSE1);
1957 lij3_SSE2 = _mm_mul_ps(lij2_SSE2, lij_SSE2);
1958 lij3_SSE3 = _mm_mul_ps(lij2_SSE3, lij_SSE3);
1960 diff2_SSE0 = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
1961 diff2_SSE1 = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
1962 diff2_SSE2 = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
1963 diff2_SSE3 = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
1964 lij_inv_SSE0 = gmx_mm_invsqrt_ps(lij2_SSE0);
1965 lij_inv_SSE1 = gmx_mm_invsqrt_ps(lij2_SSE1);
1966 lij_inv_SSE2 = gmx_mm_invsqrt_ps(lij2_SSE2);
1967 lij_inv_SSE3 = gmx_mm_invsqrt_ps(lij2_SSE3);
1968 sk2_rinv_SSE0 = _mm_mul_ps(sk2_ai_SSE0, rinv_SSE0);
1969 sk2_rinv_SSE1 = _mm_mul_ps(sk2_ai_SSE1, rinv_SSE1);
1970 sk2_rinv_SSE2 = _mm_mul_ps(sk2_ai_SSE2, rinv_SSE2);
1971 sk2_rinv_SSE3 = _mm_mul_ps(sk2_ai_SSE3, rinv_SSE3);
1972 prod_SSE0 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
1973 prod_SSE1 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
1974 prod_SSE2 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
1975 prod_SSE3 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
1977 logterm_SSE0 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
1978 logterm_SSE1 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
1979 logterm_SSE2 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
1980 logterm_SSE3 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
1981 t1_SSE0 = _mm_sub_ps(lij_SSE0, uij_SSE0);
1982 t1_SSE1 = _mm_sub_ps(lij_SSE1, uij_SSE1);
1983 t1_SSE2 = _mm_sub_ps(lij_SSE2, uij_SSE2);
1984 t1_SSE3 = _mm_sub_ps(lij_SSE3, uij_SSE3);
1985 t2_SSE0 = _mm_mul_ps(diff2_SSE0,
1986 _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
1987 prod_SSE0));
1988 t2_SSE1 = _mm_mul_ps(diff2_SSE1,
1989 _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
1990 prod_SSE1));
1991 t2_SSE2 = _mm_mul_ps(diff2_SSE2,
1992 _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
1993 prod_SSE2));
1994 t2_SSE3 = _mm_mul_ps(diff2_SSE3,
1995 _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
1996 prod_SSE3));
1997 t3_SSE0 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
1998 t3_SSE1 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
1999 t3_SSE2 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
2000 t3_SSE3 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
2001 t1_SSE0 = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
2002 t1_SSE1 = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
2003 t1_SSE2 = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
2004 t1_SSE3 = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
2005 t4_SSE0 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE0));
2006 t4_SSE1 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE1));
2007 t4_SSE2 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE2));
2008 t4_SSE3 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE3));
2009 t4_SSE0 = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
2010 t4_SSE1 = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
2011 t4_SSE2 = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
2012 t4_SSE3 = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
2013 t1_SSE0 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
2014 t1_SSE1 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
2015 t1_SSE2 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
2016 t1_SSE3 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
2018 _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
2019 gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0, obc_mask1_SSE0),
2020 _mm_and_ps(t1_SSE1, obc_mask1_SSE1),
2021 _mm_and_ps(t1_SSE2, obc_mask1_SSE2),
2022 _mm_and_ps(t1_SSE3, obc_mask1_SSE3))));
2024 t1_SSE0 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
2025 _mm_mul_ps(prod_SSE0, lij3_SSE0));
2026 t1_SSE1 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
2027 _mm_mul_ps(prod_SSE1, lij3_SSE1));
2028 t1_SSE2 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
2029 _mm_mul_ps(prod_SSE2, lij3_SSE2));
2030 t1_SSE3 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
2031 _mm_mul_ps(prod_SSE3, lij3_SSE3));
2032 t1_SSE0 = _mm_sub_ps(t1_SSE0,
2033 _mm_mul_ps(onefourth_SSE,
2034 _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
2035 _mm_mul_ps(lij3_SSE0, dr_SSE0))));
2036 t1_SSE1 = _mm_sub_ps(t1_SSE1,
2037 _mm_mul_ps(onefourth_SSE,
2038 _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
2039 _mm_mul_ps(lij3_SSE1, dr_SSE1))));
2040 t1_SSE2 = _mm_sub_ps(t1_SSE2,
2041 _mm_mul_ps(onefourth_SSE,
2042 _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
2043 _mm_mul_ps(lij3_SSE2, dr_SSE2))));
2044 t1_SSE3 = _mm_sub_ps(t1_SSE3,
2045 _mm_mul_ps(onefourth_SSE,
2046 _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
2047 _mm_mul_ps(lij3_SSE3, dr_SSE3))));
2048 t2_SSE0 = _mm_mul_ps(onefourth_SSE,
2049 _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
2050 _mm_mul_ps(uij3_SSE0, dr_SSE0)));
2051 t2_SSE1 = _mm_mul_ps(onefourth_SSE,
2052 _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
2053 _mm_mul_ps(uij3_SSE1, dr_SSE1)));
2054 t2_SSE2 = _mm_mul_ps(onefourth_SSE,
2055 _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
2056 _mm_mul_ps(uij3_SSE2, dr_SSE2)));
2057 t2_SSE3 = _mm_mul_ps(onefourth_SSE,
2058 _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
2059 _mm_mul_ps(uij3_SSE3, dr_SSE3)));
2060 t2_SSE0 = _mm_sub_ps(t2_SSE0,
2061 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
2062 _mm_mul_ps(prod_SSE0, uij3_SSE0)));
2063 t2_SSE1 = _mm_sub_ps(t2_SSE1,
2064 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
2065 _mm_mul_ps(prod_SSE1, uij3_SSE1)));
2066 t2_SSE2 = _mm_sub_ps(t2_SSE2,
2067 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
2068 _mm_mul_ps(prod_SSE2, uij3_SSE2)));
2069 t2_SSE3 = _mm_sub_ps(t2_SSE3,
2070 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
2071 _mm_mul_ps(prod_SSE3, uij3_SSE3)));
2073 t3_SSE0 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
2074 _mm_mul_ps(rinv_SSE0, rinv_SSE0));
2075 t3_SSE1 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
2076 _mm_mul_ps(rinv_SSE1, rinv_SSE1));
2077 t3_SSE2 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
2078 _mm_mul_ps(rinv_SSE2, rinv_SSE2));
2079 t3_SSE3 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
2080 _mm_mul_ps(rinv_SSE3, rinv_SSE3));
2082 t3_SSE0 = _mm_sub_ps(t3_SSE0,
2083 _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
2084 _mm_add_ps(one_SSE,
2085 _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
2086 t3_SSE1 = _mm_sub_ps(t3_SSE1,
2087 _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
2088 _mm_add_ps(one_SSE,
2089 _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
2090 t3_SSE2 = _mm_sub_ps(t3_SSE2,
2091 _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
2092 _mm_add_ps(one_SSE,
2093 _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
2094 t3_SSE3 = _mm_sub_ps(t3_SSE3,
2095 _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
2096 _mm_add_ps(one_SSE,
2097 _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
2100 t1_SSE0 = _mm_mul_ps(rinv_SSE0,
2101 _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
2102 _mm_add_ps(t2_SSE0, t3_SSE0)));
2103 t1_SSE1 = _mm_mul_ps(rinv_SSE1,
2104 _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
2105 _mm_add_ps(t2_SSE1, t3_SSE1)));
2106 t1_SSE2 = _mm_mul_ps(rinv_SSE2,
2107 _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
2108 _mm_add_ps(t2_SSE2, t3_SSE2)));
2109 t1_SSE3 = _mm_mul_ps(rinv_SSE3,
2110 _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
2111 _mm_add_ps(t2_SSE3, t3_SSE3)));
2113 _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
2114 dadx += 4;
2115 _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
2116 dadx += 4;
2117 _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
2118 dadx += 4;
2119 _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
2120 dadx += 4;
2122 }
2124 /* Main part, no exclusions */
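/* (In this j range no exclusions can occur, so no per-j exclusion masks are
 *  loaded; only the i-atom masks are applied to the interactions.) */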
2125 for (j = nj1; j < nj2; j += UNROLLJ)
2126 {
2127 /* load j atom coordinates */
2128 jx_SSE = _mm_load_ps(x_align+j);
2129 jy_SSE = _mm_load_ps(y_align+j);
2130 jz_SSE = _mm_load_ps(z_align+j);
2132 /* Calculate distance */
2133 dx_SSE0 = _mm_sub_ps(ix_SSE0, jx_SSE);
2134 dy_SSE0 = _mm_sub_ps(iy_SSE0, jy_SSE);
2135 dz_SSE0 = _mm_sub_ps(iz_SSE0, jz_SSE);
2136 dx_SSE1 = _mm_sub_ps(ix_SSE1, jx_SSE);
2137 dy_SSE1 = _mm_sub_ps(iy_SSE1, jy_SSE);
2138 dz_SSE1 = _mm_sub_ps(iz_SSE1, jz_SSE);
2139 dx_SSE2 = _mm_sub_ps(ix_SSE2, jx_SSE);
2140 dy_SSE2 = _mm_sub_ps(iy_SSE2, jy_SSE);
2141 dz_SSE2 = _mm_sub_ps(iz_SSE2, jz_SSE);
2142 dx_SSE3 = _mm_sub_ps(ix_SSE3, jx_SSE);
2143 dy_SSE3 = _mm_sub_ps(iy_SSE3, jy_SSE);
2144 dz_SSE3 = _mm_sub_ps(iz_SSE3, jz_SSE);
2146 /* rsq = dx*dx+dy*dy+dz*dz */
2147 rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
2148 rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
2149 rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
2150 rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
2152 /* Calculate 1/r and 1/r2 */
2153 rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0);
2154 rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1);
2155 rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2);
2156 rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3);
2158 /* Apply mask */
2159 rinv_SSE0 = _mm_and_ps(rinv_SSE0, imask_SSE0);
2160 rinv_SSE1 = _mm_and_ps(rinv_SSE1, imask_SSE1);
2161 rinv_SSE2 = _mm_and_ps(rinv_SSE2, imask_SSE2);
2162 rinv_SSE3 = _mm_and_ps(rinv_SSE3, imask_SSE3);
2164 dr_SSE0 = _mm_mul_ps(rsq_SSE0, rinv_SSE0);
2165 dr_SSE1 = _mm_mul_ps(rsq_SSE1, rinv_SSE1);
2166 dr_SSE2 = _mm_mul_ps(rsq_SSE2, rinv_SSE2);
2167 dr_SSE3 = _mm_mul_ps(rsq_SSE3, rinv_SSE3);
2169 sk_aj_SSE = _mm_load_ps(obc_param+j);
2170 raj_SSE = _mm_load_ps(gb_radius+j);
2172 raj_inv_SSE = gmx_mm_inv_ps(raj_SSE);
2174 /* Evaluate influence of atom aj -> ai */
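/* A scalar sketch of the pair descreening term computed below (our notation;
 * the code's lij and uij are the reciprocal integration limits 1/L and 1/U,
 * with U = r + sk and L chosen by the masks below; r is the pair distance,
 * sk the scaled radius of the descreening atom):
 *
 *   I = 0.5*( (lij - uij)
 *           + (r/4 - sk*sk/(4*r))*(uij*uij - lij*lij)
 *           + (1/(2*r))*log(uij/lij)
 *           + t4 );
 *
 * where t4 = 2*(1/rai - lij) is applied only under obc_mask3. The t1..t4
 * variables below are exactly these four contributions. */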
2175 t1_SSE0 = _mm_add_ps(dr_SSE0, sk_aj_SSE);
2176 t1_SSE1 = _mm_add_ps(dr_SSE1, sk_aj_SSE);
2177 t1_SSE2 = _mm_add_ps(dr_SSE2, sk_aj_SSE);
2178 t1_SSE3 = _mm_add_ps(dr_SSE3, sk_aj_SSE);
2179 t2_SSE0 = _mm_sub_ps(dr_SSE0, sk_aj_SSE);
2180 t2_SSE1 = _mm_sub_ps(dr_SSE1, sk_aj_SSE);
2181 t2_SSE2 = _mm_sub_ps(dr_SSE2, sk_aj_SSE);
2182 t2_SSE3 = _mm_sub_ps(dr_SSE3, sk_aj_SSE);
2183 t3_SSE0 = _mm_sub_ps(sk_aj_SSE, dr_SSE0);
2184 t3_SSE1 = _mm_sub_ps(sk_aj_SSE, dr_SSE1);
2185 t3_SSE2 = _mm_sub_ps(sk_aj_SSE, dr_SSE2);
2186 t3_SSE3 = _mm_sub_ps(sk_aj_SSE, dr_SSE3);
2188 obc_mask1_SSE0 = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
2189 obc_mask1_SSE1 = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
2190 obc_mask1_SSE2 = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
2191 obc_mask1_SSE3 = _mm_cmplt_ps(rai_SSE3, t1_SSE3);
2192 obc_mask2_SSE0 = _mm_cmplt_ps(rai_SSE0, t2_SSE0);
2193 obc_mask2_SSE1 = _mm_cmplt_ps(rai_SSE1, t2_SSE1);
2194 obc_mask2_SSE2 = _mm_cmplt_ps(rai_SSE2, t2_SSE2);
2195 obc_mask2_SSE3 = _mm_cmplt_ps(rai_SSE3, t2_SSE3);
2196 obc_mask3_SSE0 = _mm_cmplt_ps(rai_SSE0, t3_SSE0);
2197 obc_mask3_SSE1 = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
2198 obc_mask3_SSE2 = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
2199 obc_mask3_SSE3 = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
2200 obc_mask1_SSE0 = _mm_and_ps(obc_mask1_SSE0, imask_SSE0);
2201 obc_mask1_SSE1 = _mm_and_ps(obc_mask1_SSE1, imask_SSE1);
2202 obc_mask1_SSE2 = _mm_and_ps(obc_mask1_SSE2, imask_SSE2);
2203 obc_mask1_SSE3 = _mm_and_ps(obc_mask1_SSE3, imask_SSE3);
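/* The comparisons above implement the usual HCT case analysis (our reading):
 *   obc_mask1: rai < r + sk  -> j descreens i at all
 *   obc_mask2: rai < r - sk  -> i lies fully outside the descreening sphere,
 *                               so the lower limit is L = r - sk (else L = rai)
 *   obc_mask3: rai < sk - r  -> close contact, enabling the t4 correction */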
2205 uij_SSE0 = gmx_mm_inv_ps(t1_SSE0);
2206 uij_SSE1 = gmx_mm_inv_ps(t1_SSE1);
2207 uij_SSE2 = gmx_mm_inv_ps(t1_SSE2);
2208 uij_SSE3 = gmx_mm_inv_ps(t1_SSE3);
2209 lij_SSE0 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
2210 _mm_andnot_ps(obc_mask2_SSE0, rai_inv_SSE0));
2211 lij_SSE1 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
2212 _mm_andnot_ps(obc_mask2_SSE1, rai_inv_SSE1));
2213 lij_SSE2 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
2214 _mm_andnot_ps(obc_mask2_SSE2, rai_inv_SSE2));
2215 lij_SSE3 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
2216 _mm_andnot_ps(obc_mask2_SSE3, rai_inv_SSE3));
2217 dlij_SSE0 = _mm_and_ps(one_SSE, obc_mask2_SSE0);
2218 dlij_SSE1 = _mm_and_ps(one_SSE, obc_mask2_SSE1);
2219 dlij_SSE2 = _mm_and_ps(one_SSE, obc_mask2_SSE2);
2220 dlij_SSE3 = _mm_and_ps(one_SSE, obc_mask2_SSE3);
2222 uij2_SSE0 = _mm_mul_ps(uij_SSE0, uij_SSE0);
2223 uij2_SSE1 = _mm_mul_ps(uij_SSE1, uij_SSE1);
2224 uij2_SSE2 = _mm_mul_ps(uij_SSE2, uij_SSE2);
2225 uij2_SSE3 = _mm_mul_ps(uij_SSE3, uij_SSE3);
2226 uij3_SSE0 = _mm_mul_ps(uij2_SSE0, uij_SSE0);
2227 uij3_SSE1 = _mm_mul_ps(uij2_SSE1, uij_SSE1);
2228 uij3_SSE2 = _mm_mul_ps(uij2_SSE2, uij_SSE2);
2229 uij3_SSE3 = _mm_mul_ps(uij2_SSE3, uij_SSE3);
2230 lij2_SSE0 = _mm_mul_ps(lij_SSE0, lij_SSE0);
2231 lij2_SSE1 = _mm_mul_ps(lij_SSE1, lij_SSE1);
2232 lij2_SSE2 = _mm_mul_ps(lij_SSE2, lij_SSE2);
2233 lij2_SSE3 = _mm_mul_ps(lij_SSE3, lij_SSE3);
2234 lij3_SSE0 = _mm_mul_ps(lij2_SSE0, lij_SSE0);
2235 lij3_SSE1 = _mm_mul_ps(lij2_SSE1, lij_SSE1);
2236 lij3_SSE2 = _mm_mul_ps(lij2_SSE2, lij_SSE2);
2237 lij3_SSE3 = _mm_mul_ps(lij2_SSE3, lij_SSE3);
2239 diff2_SSE0 = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
2240 diff2_SSE1 = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
2241 diff2_SSE2 = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
2242 diff2_SSE3 = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
2243 lij_inv_SSE0 = gmx_mm_invsqrt_ps(lij2_SSE0);
2244 lij_inv_SSE1 = gmx_mm_invsqrt_ps(lij2_SSE1);
2245 lij_inv_SSE2 = gmx_mm_invsqrt_ps(lij2_SSE2);
2246 lij_inv_SSE3 = gmx_mm_invsqrt_ps(lij2_SSE3);
2247 sk2_aj_SSE = _mm_mul_ps(sk_aj_SSE, sk_aj_SSE);
2248 sk2_rinv_SSE0 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE0);
2249 sk2_rinv_SSE1 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE1);
2250 sk2_rinv_SSE2 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE2);
2251 sk2_rinv_SSE3 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE3);
2252 prod_SSE0 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
2253 prod_SSE1 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
2254 prod_SSE2 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
2255 prod_SSE3 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
2257 logterm_SSE0 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
2258 logterm_SSE1 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
2259 logterm_SSE2 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
2260 logterm_SSE3 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
2262 t1_SSE0 = _mm_sub_ps(lij_SSE0, uij_SSE0);
2263 t1_SSE1 = _mm_sub_ps(lij_SSE1, uij_SSE1);
2264 t1_SSE2 = _mm_sub_ps(lij_SSE2, uij_SSE2);
2265 t1_SSE3 = _mm_sub_ps(lij_SSE3, uij_SSE3);
2266 t2_SSE0 = _mm_mul_ps(diff2_SSE0,
2267 _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
2268 prod_SSE0));
2269 t2_SSE1 = _mm_mul_ps(diff2_SSE1,
2270 _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
2271 prod_SSE1));
2272 t2_SSE2 = _mm_mul_ps(diff2_SSE2,
2273 _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
2274 prod_SSE2));
2275 t2_SSE3 = _mm_mul_ps(diff2_SSE3,
2276 _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
2277 prod_SSE3));
2279 t3_SSE0 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
2280 t3_SSE1 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
2281 t3_SSE2 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
2282 t3_SSE3 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
2283 t1_SSE0 = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
2284 t1_SSE1 = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
2285 t1_SSE2 = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
2286 t1_SSE3 = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
2287 t4_SSE0 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE0, lij_SSE0));
2288 t4_SSE1 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE1, lij_SSE1));
2289 t4_SSE2 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE2, lij_SSE2));
2290 t4_SSE3 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE3, lij_SSE3));
2291 t4_SSE0 = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
2292 t4_SSE1 = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
2293 t4_SSE2 = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
2294 t4_SSE3 = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
2295 t1_SSE0 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
2296 t1_SSE1 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
2297 t1_SSE2 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
2298 t1_SSE3 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
2300 sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
2301 sum_ai_SSE1 = _mm_add_ps(sum_ai_SSE1, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
2302 sum_ai_SSE2 = _mm_add_ps(sum_ai_SSE2, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
2303 sum_ai_SSE3 = _mm_add_ps(sum_ai_SSE3, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
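/* sum_ai_SSEn now holds the running descreening sum for i atom n of this
 * tile; contributions from non-interacting pairs were masked to zero. */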
2305 t1_SSE0 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
2306 _mm_mul_ps(prod_SSE0, lij3_SSE0));
2307 t1_SSE1 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
2308 _mm_mul_ps(prod_SSE1, lij3_SSE1));
2309 t1_SSE2 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
2310 _mm_mul_ps(prod_SSE2, lij3_SSE2));
2311 t1_SSE3 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
2312 _mm_mul_ps(prod_SSE3, lij3_SSE3));
2313 t1_SSE0 = _mm_sub_ps(t1_SSE0,
2314 _mm_mul_ps(onefourth_SSE,
2315 _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
2316 _mm_mul_ps(lij3_SSE0, dr_SSE0))));
2317 t1_SSE1 = _mm_sub_ps(t1_SSE1,
2318 _mm_mul_ps(onefourth_SSE,
2319 _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
2320 _mm_mul_ps(lij3_SSE1, dr_SSE1))));
2321 t1_SSE2 = _mm_sub_ps(t1_SSE2,
2322 _mm_mul_ps(onefourth_SSE,
2323 _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
2324 _mm_mul_ps(lij3_SSE2, dr_SSE2))));
2325 t1_SSE3 = _mm_sub_ps(t1_SSE3,
2326 _mm_mul_ps(onefourth_SSE,
2327 _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
2328 _mm_mul_ps(lij3_SSE3, dr_SSE3))));
2330 t2_SSE0 = _mm_mul_ps(onefourth_SSE,
2331 _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
2332 _mm_mul_ps(uij3_SSE0, dr_SSE0)));
2333 t2_SSE1 = _mm_mul_ps(onefourth_SSE,
2334 _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
2335 _mm_mul_ps(uij3_SSE1, dr_SSE1)));
2336 t2_SSE2 = _mm_mul_ps(onefourth_SSE,
2337 _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
2338 _mm_mul_ps(uij3_SSE2, dr_SSE2)));
2339 t2_SSE3 = _mm_mul_ps(onefourth_SSE,
2340 _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
2341 _mm_mul_ps(uij3_SSE3, dr_SSE3)));
2342 t2_SSE0 = _mm_sub_ps(t2_SSE0,
2343 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
2344 _mm_mul_ps(prod_SSE0, uij3_SSE0)));
2345 t2_SSE1 = _mm_sub_ps(t2_SSE1,
2346 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
2347 _mm_mul_ps(prod_SSE1, uij3_SSE1)));
2348 t2_SSE2 = _mm_sub_ps(t2_SSE2,
2349 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
2350 _mm_mul_ps(prod_SSE2, uij3_SSE2)));
2351 t2_SSE3 = _mm_sub_ps(t2_SSE3,
2352 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
2353 _mm_mul_ps(prod_SSE3, uij3_SSE3)));
2354 t3_SSE0 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
2355 _mm_mul_ps(rinv_SSE0, rinv_SSE0));
2356 t3_SSE1 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
2357 _mm_mul_ps(rinv_SSE1, rinv_SSE1));
2358 t3_SSE2 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
2359 _mm_mul_ps(rinv_SSE2, rinv_SSE2));
2360 t3_SSE3 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
2361 _mm_mul_ps(rinv_SSE3, rinv_SSE3));
2362 t3_SSE0 = _mm_sub_ps(t3_SSE0,
2363 _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
2364 _mm_add_ps(one_SSE,
2365 _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
2366 t3_SSE1 = _mm_sub_ps(t3_SSE1,
2367 _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
2368 _mm_add_ps(one_SSE,
2369 _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
2370 t3_SSE2 = _mm_sub_ps(t3_SSE2,
2371 _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
2372 _mm_add_ps(one_SSE,
2373 _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
2374 t3_SSE3 = _mm_sub_ps(t3_SSE3,
2375 _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
2376 _mm_add_ps(one_SSE,
2377 _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
2379 t1_SSE0 = _mm_mul_ps(rinv_SSE0,
2380 _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
2381 _mm_add_ps(t2_SSE0, t3_SSE0)));
2382 t1_SSE1 = _mm_mul_ps(rinv_SSE1,
2383 _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
2384 _mm_add_ps(t2_SSE1, t3_SSE1)));
2385 t1_SSE2 = _mm_mul_ps(rinv_SSE2,
2386 _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
2387 _mm_add_ps(t2_SSE2, t3_SSE2)));
2388 t1_SSE3 = _mm_mul_ps(rinv_SSE3,
2389 _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
2390 _mm_add_ps(t2_SSE3, t3_SSE3)));
2392 _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
2393 dadx += 4;
2394 _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
2395 dadx += 4;
2396 _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
2397 dadx += 4;
2398 _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
2399 dadx += 4;
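/* The t1/t2/t3 recomputation above assembles the r-derivative of the same
 * integral; the final multiply by rinv leaves the stored values in a
 * (1/r)*dI/dr form, convenient for scaling by the dx/dy/dz components in
 * the chain-rule pass. */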
2401 /* Evaluate influence of atom ai -> aj */
2402 t1_SSE0 = _mm_add_ps(dr_SSE0, sk_ai_SSE0);
2403 t1_SSE1 = _mm_add_ps(dr_SSE1, sk_ai_SSE1);
2404 t1_SSE2 = _mm_add_ps(dr_SSE2, sk_ai_SSE2);
2405 t1_SSE3 = _mm_add_ps(dr_SSE3, sk_ai_SSE3);
2406 t2_SSE0 = _mm_sub_ps(dr_SSE0, sk_ai_SSE0);
2407 t2_SSE1 = _mm_sub_ps(dr_SSE1, sk_ai_SSE1);
2408 t2_SSE2 = _mm_sub_ps(dr_SSE2, sk_ai_SSE2);
2409 t2_SSE3 = _mm_sub_ps(dr_SSE3, sk_ai_SSE3);
2410 t3_SSE0 = _mm_sub_ps(sk_ai_SSE0, dr_SSE0);
2411 t3_SSE1 = _mm_sub_ps(sk_ai_SSE1, dr_SSE1);
2412 t3_SSE2 = _mm_sub_ps(sk_ai_SSE2, dr_SSE2);
2413 t3_SSE3 = _mm_sub_ps(sk_ai_SSE3, dr_SSE3);
2415 obc_mask1_SSE0 = _mm_cmplt_ps(raj_SSE, t1_SSE0);
2416 obc_mask1_SSE1 = _mm_cmplt_ps(raj_SSE, t1_SSE1);
2417 obc_mask1_SSE2 = _mm_cmplt_ps(raj_SSE, t1_SSE2);
2418 obc_mask1_SSE3 = _mm_cmplt_ps(raj_SSE, t1_SSE3);
2419 obc_mask2_SSE0 = _mm_cmplt_ps(raj_SSE, t2_SSE0);
2420 obc_mask2_SSE1 = _mm_cmplt_ps(raj_SSE, t2_SSE1);
2421 obc_mask2_SSE2 = _mm_cmplt_ps(raj_SSE, t2_SSE2);
2422 obc_mask2_SSE3 = _mm_cmplt_ps(raj_SSE, t2_SSE3);
2423 obc_mask3_SSE0 = _mm_cmplt_ps(raj_SSE, t3_SSE0);
2424 obc_mask3_SSE1 = _mm_cmplt_ps(raj_SSE, t3_SSE1);
2425 obc_mask3_SSE2 = _mm_cmplt_ps(raj_SSE, t3_SSE2);
2426 obc_mask3_SSE3 = _mm_cmplt_ps(raj_SSE, t3_SSE3);
2427 obc_mask1_SSE0 = _mm_and_ps(obc_mask1_SSE0, imask_SSE0);
2428 obc_mask1_SSE1 = _mm_and_ps(obc_mask1_SSE1, imask_SSE1);
2429 obc_mask1_SSE2 = _mm_and_ps(obc_mask1_SSE2, imask_SSE2);
2430 obc_mask1_SSE3 = _mm_and_ps(obc_mask1_SSE3, imask_SSE3);
2432 uij_SSE0 = gmx_mm_inv_ps(t1_SSE0);
2433 uij_SSE1 = gmx_mm_inv_ps(t1_SSE1);
2434 uij_SSE2 = gmx_mm_inv_ps(t1_SSE2);
2435 uij_SSE3 = gmx_mm_inv_ps(t1_SSE3);
2436 lij_SSE0 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
2437 _mm_andnot_ps(obc_mask2_SSE0, raj_inv_SSE));
2438 lij_SSE1 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
2439 _mm_andnot_ps(obc_mask2_SSE1, raj_inv_SSE));
2440 lij_SSE2 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
2441 _mm_andnot_ps(obc_mask2_SSE2, raj_inv_SSE));
2442 lij_SSE3 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
2443 _mm_andnot_ps(obc_mask2_SSE3, raj_inv_SSE));
2444 dlij_SSE0 = _mm_and_ps(one_SSE, obc_mask2_SSE0);
2445 dlij_SSE1 = _mm_and_ps(one_SSE, obc_mask2_SSE1);
2446 dlij_SSE2 = _mm_and_ps(one_SSE, obc_mask2_SSE2);
2447 dlij_SSE3 = _mm_and_ps(one_SSE, obc_mask2_SSE3);
2449 uij2_SSE0 = _mm_mul_ps(uij_SSE0, uij_SSE0);
2450 uij2_SSE1 = _mm_mul_ps(uij_SSE1, uij_SSE1);
2451 uij2_SSE2 = _mm_mul_ps(uij_SSE2, uij_SSE2);
2452 uij2_SSE3 = _mm_mul_ps(uij_SSE3, uij_SSE3);
2453 uij3_SSE0 = _mm_mul_ps(uij2_SSE0, uij_SSE0);
2454 uij3_SSE1 = _mm_mul_ps(uij2_SSE1, uij_SSE1);
2455 uij3_SSE2 = _mm_mul_ps(uij2_SSE2, uij_SSE2);
2456 uij3_SSE3 = _mm_mul_ps(uij2_SSE3, uij_SSE3);
2457 lij2_SSE0 = _mm_mul_ps(lij_SSE0, lij_SSE0);
2458 lij2_SSE1 = _mm_mul_ps(lij_SSE1, lij_SSE1);
2459 lij2_SSE2 = _mm_mul_ps(lij_SSE2, lij_SSE2);
2460 lij2_SSE3 = _mm_mul_ps(lij_SSE3, lij_SSE3);
2461 lij3_SSE0 = _mm_mul_ps(lij2_SSE0, lij_SSE0);
2462 lij3_SSE1 = _mm_mul_ps(lij2_SSE1, lij_SSE1);
2463 lij3_SSE2 = _mm_mul_ps(lij2_SSE2, lij_SSE2);
2464 lij3_SSE3 = _mm_mul_ps(lij2_SSE3, lij_SSE3);
2466 diff2_SSE0 = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
2467 diff2_SSE1 = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
2468 diff2_SSE2 = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
2469 diff2_SSE3 = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
2470 lij_inv_SSE0 = gmx_mm_invsqrt_ps(lij2_SSE0);
2471 lij_inv_SSE1 = gmx_mm_invsqrt_ps(lij2_SSE1);
2472 lij_inv_SSE2 = gmx_mm_invsqrt_ps(lij2_SSE2);
2473 lij_inv_SSE3 = gmx_mm_invsqrt_ps(lij2_SSE3);
2474 sk2_rinv_SSE0 = _mm_mul_ps(sk2_ai_SSE0, rinv_SSE0);
2475 sk2_rinv_SSE1 = _mm_mul_ps(sk2_ai_SSE1, rinv_SSE1);
2476 sk2_rinv_SSE2 = _mm_mul_ps(sk2_ai_SSE2, rinv_SSE2);
2477 sk2_rinv_SSE3 = _mm_mul_ps(sk2_ai_SSE3, rinv_SSE3);
2478 prod_SSE0 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
2479 prod_SSE1 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
2480 prod_SSE2 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
2481 prod_SSE3 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
2483 logterm_SSE0 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
2484 logterm_SSE1 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
2485 logterm_SSE2 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
2486 logterm_SSE3 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
2487 t1_SSE0 = _mm_sub_ps(lij_SSE0, uij_SSE0);
2488 t1_SSE1 = _mm_sub_ps(lij_SSE1, uij_SSE1);
2489 t1_SSE2 = _mm_sub_ps(lij_SSE2, uij_SSE2);
2490 t1_SSE3 = _mm_sub_ps(lij_SSE3, uij_SSE3);
2491 t2_SSE0 = _mm_mul_ps(diff2_SSE0,
2492 _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
2493 prod_SSE0));
2494 t2_SSE1 = _mm_mul_ps(diff2_SSE1,
2495 _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
2496 prod_SSE1));
2497 t2_SSE2 = _mm_mul_ps(diff2_SSE2,
2498 _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
2499 prod_SSE2));
2500 t2_SSE3 = _mm_mul_ps(diff2_SSE3,
2501 _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
2502 prod_SSE3));
2503 t3_SSE0 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
2504 t3_SSE1 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
2505 t3_SSE2 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
2506 t3_SSE3 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
2507 t1_SSE0 = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
2508 t1_SSE1 = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
2509 t1_SSE2 = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
2510 t1_SSE3 = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
2511 t4_SSE0 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE0));
2512 t4_SSE1 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE1));
2513 t4_SSE2 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE2));
2514 t4_SSE3 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE3));
2515 t4_SSE0 = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
2516 t4_SSE1 = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
2517 t4_SSE2 = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
2518 t4_SSE3 = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
2519 t1_SSE0 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
2520 t1_SSE1 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
2521 t1_SSE2 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
2522 t1_SSE3 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
2524 _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
2525 gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0, obc_mask1_SSE0),
2526 _mm_and_ps(t1_SSE1, obc_mask1_SSE1),
2527 _mm_and_ps(t1_SSE2, obc_mask1_SSE2),
2528 _mm_and_ps(t1_SSE3, obc_mask1_SSE3))));
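/* gmx_mm_sum4_ps reduces the four i rows to a single four-wide vector, so
 * the ai->aj contributions can be accumulated directly into the j atoms'
 * slots of the work array. */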
2530 t1_SSE0 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
2531 _mm_mul_ps(prod_SSE0, lij3_SSE0));
2532 t1_SSE1 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
2533 _mm_mul_ps(prod_SSE1, lij3_SSE1));
2534 t1_SSE2 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
2535 _mm_mul_ps(prod_SSE2, lij3_SSE2));
2536 t1_SSE3 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
2537 _mm_mul_ps(prod_SSE3, lij3_SSE3));
2538 t1_SSE0 = _mm_sub_ps(t1_SSE0,
2539 _mm_mul_ps(onefourth_SSE,
2540 _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
2541 _mm_mul_ps(lij3_SSE0, dr_SSE0))));
2542 t1_SSE1 = _mm_sub_ps(t1_SSE1,
2543 _mm_mul_ps(onefourth_SSE,
2544 _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
2545 _mm_mul_ps(lij3_SSE1, dr_SSE1))));
2546 t1_SSE2 = _mm_sub_ps(t1_SSE2,
2547 _mm_mul_ps(onefourth_SSE,
2548 _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
2549 _mm_mul_ps(lij3_SSE2, dr_SSE2))));
2550 t1_SSE3 = _mm_sub_ps(t1_SSE3,
2551 _mm_mul_ps(onefourth_SSE,
2552 _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
2553 _mm_mul_ps(lij3_SSE3, dr_SSE3))));
2554 t2_SSE0 = _mm_mul_ps(onefourth_SSE,
2555 _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
2556 _mm_mul_ps(uij3_SSE0, dr_SSE0)));
2557 t2_SSE1 = _mm_mul_ps(onefourth_SSE,
2558 _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
2559 _mm_mul_ps(uij3_SSE1, dr_SSE1)));
2560 t2_SSE2 = _mm_mul_ps(onefourth_SSE,
2561 _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
2562 _mm_mul_ps(uij3_SSE2, dr_SSE2)));
2563 t2_SSE3 = _mm_mul_ps(onefourth_SSE,
2564 _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
2565 _mm_mul_ps(uij3_SSE3, dr_SSE3)));
2566 t2_SSE0 = _mm_sub_ps(t2_SSE0,
2567 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
2568 _mm_mul_ps(prod_SSE0, uij3_SSE0)));
2569 t2_SSE1 = _mm_sub_ps(t2_SSE1,
2570 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
2571 _mm_mul_ps(prod_SSE1, uij3_SSE1)));
2572 t2_SSE2 = _mm_sub_ps(t2_SSE2,
2573 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
2574 _mm_mul_ps(prod_SSE2, uij3_SSE2)));
2575 t2_SSE3 = _mm_sub_ps(t2_SSE3,
2576 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
2577 _mm_mul_ps(prod_SSE3, uij3_SSE3)));
2579 t3_SSE0 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
2580 _mm_mul_ps(rinv_SSE0, rinv_SSE0));
2581 t3_SSE1 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
2582 _mm_mul_ps(rinv_SSE1, rinv_SSE1));
2583 t3_SSE2 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
2584 _mm_mul_ps(rinv_SSE2, rinv_SSE2));
2585 t3_SSE3 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
2586 _mm_mul_ps(rinv_SSE3, rinv_SSE3));
2588 t3_SSE0 = _mm_sub_ps(t3_SSE0,
2589 _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
2590 _mm_add_ps(one_SSE,
2591 _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
2592 t3_SSE1 = _mm_sub_ps(t3_SSE1,
2593 _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
2594 _mm_add_ps(one_SSE,
2595 _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
2596 t3_SSE2 = _mm_sub_ps(t3_SSE2,
2597 _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
2598 _mm_add_ps(one_SSE,
2599 _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
2600 t3_SSE3 = _mm_sub_ps(t3_SSE3,
2601 _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
2602 _mm_add_ps(one_SSE,
2603 _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
2605 t1_SSE0 = _mm_mul_ps(rinv_SSE0,
2606 _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
2607 _mm_add_ps(t2_SSE0, t3_SSE0)));
2608 t1_SSE1 = _mm_mul_ps(rinv_SSE1,
2609 _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
2610 _mm_add_ps(t2_SSE1, t3_SSE1)));
2611 t1_SSE2 = _mm_mul_ps(rinv_SSE2,
2612 _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
2613 _mm_add_ps(t2_SSE2, t3_SSE2)));
2614 t1_SSE3 = _mm_mul_ps(rinv_SSE3,
2615 _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
2616 _mm_add_ps(t2_SSE3, t3_SSE3)));
2618 _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
2619 dadx += 4;
2620 _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
2621 dadx += 4;
2622 _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
2623 dadx += 4;
2624 _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
2625 dadx += 4;
2627 }
2628 /* Epilogue part, including exclusion mask */
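/* (These final tiles use the precomputed epilogue exclusion masks emask0..3,
 *  which also mask out the zero padding at the end of the coordinate arrays;
 *  they are combined with the i-atom masks below.) */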
2629 for (j = nj2; j < nj3; j += UNROLLJ)
2630 {
2631 jmask_SSE0 = _mm_load_ps((real *)emask0);
2632 jmask_SSE1 = _mm_load_ps((real *)emask1);
2633 jmask_SSE2 = _mm_load_ps((real *)emask2);
2634 jmask_SSE3 = _mm_load_ps((real *)emask3);
2635 emask0 += UNROLLJ;
2636 emask1 += UNROLLJ;
2637 emask2 += UNROLLJ;
2638 emask3 += UNROLLJ;
2640 /* load j atom coordinates */
2641 jx_SSE = _mm_load_ps(x_align+j);
2642 jy_SSE = _mm_load_ps(y_align+j);
2643 jz_SSE = _mm_load_ps(z_align+j);
2645 /* Calculate distance */
2646 dx_SSE0 = _mm_sub_ps(ix_SSE0, jx_SSE);
2647 dy_SSE0 = _mm_sub_ps(iy_SSE0, jy_SSE);
2648 dz_SSE0 = _mm_sub_ps(iz_SSE0, jz_SSE);
2649 dx_SSE1 = _mm_sub_ps(ix_SSE1, jx_SSE);
2650 dy_SSE1 = _mm_sub_ps(iy_SSE1, jy_SSE);
2651 dz_SSE1 = _mm_sub_ps(iz_SSE1, jz_SSE);
2652 dx_SSE2 = _mm_sub_ps(ix_SSE2, jx_SSE);
2653 dy_SSE2 = _mm_sub_ps(iy_SSE2, jy_SSE);
2654 dz_SSE2 = _mm_sub_ps(iz_SSE2, jz_SSE);
2655 dx_SSE3 = _mm_sub_ps(ix_SSE3, jx_SSE);
2656 dy_SSE3 = _mm_sub_ps(iy_SSE3, jy_SSE);
2657 dz_SSE3 = _mm_sub_ps(iz_SSE3, jz_SSE);
2659 /* rsq = dx*dx+dy*dy+dz*dz */
2660 rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
2661 rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
2662 rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
2663 rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
2665 /* Combine masks */
2666 jmask_SSE0 = _mm_and_ps(jmask_SSE0, imask_SSE0);
2667 jmask_SSE1 = _mm_and_ps(jmask_SSE1, imask_SSE1);
2668 jmask_SSE2 = _mm_and_ps(jmask_SSE2, imask_SSE2);
2669 jmask_SSE3 = _mm_and_ps(jmask_SSE3, imask_SSE3);
2671 /* Calculate 1/r and 1/r2 */
2672 rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0);
2673 rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1);
2674 rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2);
2675 rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3);
2677 /* Apply mask */
2678 rinv_SSE0 = _mm_and_ps(rinv_SSE0, jmask_SSE0);
2679 rinv_SSE1 = _mm_and_ps(rinv_SSE1, jmask_SSE1);
2680 rinv_SSE2 = _mm_and_ps(rinv_SSE2, jmask_SSE2);
2681 rinv_SSE3 = _mm_and_ps(rinv_SSE3, jmask_SSE3);
2683 dr_SSE0 = _mm_mul_ps(rsq_SSE0, rinv_SSE0);
2684 dr_SSE1 = _mm_mul_ps(rsq_SSE1, rinv_SSE1);
2685 dr_SSE2 = _mm_mul_ps(rsq_SSE2, rinv_SSE2);
2686 dr_SSE3 = _mm_mul_ps(rsq_SSE3, rinv_SSE3);
2688 sk_aj_SSE = _mm_load_ps(obc_param+j);
2689 raj_SSE = _mm_load_ps(gb_radius+j);
2691 raj_inv_SSE = gmx_mm_inv_ps(raj_SSE);
2693 /* Evaluate influence of atom aj -> ai */
2694 t1_SSE0 = _mm_add_ps(dr_SSE0, sk_aj_SSE);
2695 t1_SSE1 = _mm_add_ps(dr_SSE1, sk_aj_SSE);
2696 t1_SSE2 = _mm_add_ps(dr_SSE2, sk_aj_SSE);
2697 t1_SSE3 = _mm_add_ps(dr_SSE3, sk_aj_SSE);
2698 t2_SSE0 = _mm_sub_ps(dr_SSE0, sk_aj_SSE);
2699 t2_SSE1 = _mm_sub_ps(dr_SSE1, sk_aj_SSE);
2700 t2_SSE2 = _mm_sub_ps(dr_SSE2, sk_aj_SSE);
2701 t2_SSE3 = _mm_sub_ps(dr_SSE3, sk_aj_SSE);
2702 t3_SSE0 = _mm_sub_ps(sk_aj_SSE, dr_SSE0);
2703 t3_SSE1 = _mm_sub_ps(sk_aj_SSE, dr_SSE1);
2704 t3_SSE2 = _mm_sub_ps(sk_aj_SSE, dr_SSE2);
2705 t3_SSE3 = _mm_sub_ps(sk_aj_SSE, dr_SSE3);
2707 obc_mask1_SSE0 = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
2708 obc_mask1_SSE1 = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
2709 obc_mask1_SSE2 = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
2710 obc_mask1_SSE3 = _mm_cmplt_ps(rai_SSE3, t1_SSE3);
2711 obc_mask2_SSE0 = _mm_cmplt_ps(rai_SSE0, t2_SSE0);
2712 obc_mask2_SSE1 = _mm_cmplt_ps(rai_SSE1, t2_SSE1);
2713 obc_mask2_SSE2 = _mm_cmplt_ps(rai_SSE2, t2_SSE2);
2714 obc_mask2_SSE3 = _mm_cmplt_ps(rai_SSE3, t2_SSE3);
2715 obc_mask3_SSE0 = _mm_cmplt_ps(rai_SSE0, t3_SSE0);
2716 obc_mask3_SSE1 = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
2717 obc_mask3_SSE2 = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
2718 obc_mask3_SSE3 = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
2719 obc_mask1_SSE0 = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0);
2720 obc_mask1_SSE1 = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1);
2721 obc_mask1_SSE2 = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2);
2722 obc_mask1_SSE3 = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3);
2724 uij_SSE0 = gmx_mm_inv_ps(t1_SSE0);
2725 uij_SSE1 = gmx_mm_inv_ps(t1_SSE1);
2726 uij_SSE2 = gmx_mm_inv_ps(t1_SSE2);
2727 uij_SSE3 = gmx_mm_inv_ps(t1_SSE3);
2728 lij_SSE0 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
2729 _mm_andnot_ps(obc_mask2_SSE0, rai_inv_SSE0));
2730 lij_SSE1 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
2731 _mm_andnot_ps(obc_mask2_SSE1, rai_inv_SSE1));
2732 lij_SSE2 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
2733 _mm_andnot_ps(obc_mask2_SSE2, rai_inv_SSE2));
2734 lij_SSE3 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
2735 _mm_andnot_ps(obc_mask2_SSE3, rai_inv_SSE3));
2736 dlij_SSE0 = _mm_and_ps(one_SSE, obc_mask2_SSE0);
2737 dlij_SSE1 = _mm_and_ps(one_SSE, obc_mask2_SSE1);
2738 dlij_SSE2 = _mm_and_ps(one_SSE, obc_mask2_SSE2);
2739 dlij_SSE3 = _mm_and_ps(one_SSE, obc_mask2_SSE3);
2741 uij2_SSE0 = _mm_mul_ps(uij_SSE0, uij_SSE0);
2742 uij2_SSE1 = _mm_mul_ps(uij_SSE1, uij_SSE1);
2743 uij2_SSE2 = _mm_mul_ps(uij_SSE2, uij_SSE2);
2744 uij2_SSE3 = _mm_mul_ps(uij_SSE3, uij_SSE3);
2745 uij3_SSE0 = _mm_mul_ps(uij2_SSE0, uij_SSE0);
2746 uij3_SSE1 = _mm_mul_ps(uij2_SSE1, uij_SSE1);
2747 uij3_SSE2 = _mm_mul_ps(uij2_SSE2, uij_SSE2);
2748 uij3_SSE3 = _mm_mul_ps(uij2_SSE3, uij_SSE3);
2749 lij2_SSE0 = _mm_mul_ps(lij_SSE0, lij_SSE0);
2750 lij2_SSE1 = _mm_mul_ps(lij_SSE1, lij_SSE1);
2751 lij2_SSE2 = _mm_mul_ps(lij_SSE2, lij_SSE2);
2752 lij2_SSE3 = _mm_mul_ps(lij_SSE3, lij_SSE3);
2753 lij3_SSE0 = _mm_mul_ps(lij2_SSE0, lij_SSE0);
2754 lij3_SSE1 = _mm_mul_ps(lij2_SSE1, lij_SSE1);
2755 lij3_SSE2 = _mm_mul_ps(lij2_SSE2, lij_SSE2);
2756 lij3_SSE3 = _mm_mul_ps(lij2_SSE3, lij_SSE3);
2758 diff2_SSE0 = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
2759 diff2_SSE1 = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
2760 diff2_SSE2 = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
2761 diff2_SSE3 = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
2762 lij_inv_SSE0 = gmx_mm_invsqrt_ps(lij2_SSE0);
2763 lij_inv_SSE1 = gmx_mm_invsqrt_ps(lij2_SSE1);
2764 lij_inv_SSE2 = gmx_mm_invsqrt_ps(lij2_SSE2);
2765 lij_inv_SSE3 = gmx_mm_invsqrt_ps(lij2_SSE3);
2766 sk2_aj_SSE = _mm_mul_ps(sk_aj_SSE, sk_aj_SSE);
2767 sk2_rinv_SSE0 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE0);
2768 sk2_rinv_SSE1 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE1);
2769 sk2_rinv_SSE2 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE2);
2770 sk2_rinv_SSE3 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE3);
2771 prod_SSE0 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
2772 prod_SSE1 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
2773 prod_SSE2 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
2774 prod_SSE3 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
2776 logterm_SSE0 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
2777 logterm_SSE1 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
2778 logterm_SSE2 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
2779 logterm_SSE3 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
2781 t1_SSE0 = _mm_sub_ps(lij_SSE0, uij_SSE0);
2782 t1_SSE1 = _mm_sub_ps(lij_SSE1, uij_SSE1);
2783 t1_SSE2 = _mm_sub_ps(lij_SSE2, uij_SSE2);
2784 t1_SSE3 = _mm_sub_ps(lij_SSE3, uij_SSE3);
2785 t2_SSE0 = _mm_mul_ps(diff2_SSE0,
2786 _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
2787 prod_SSE0));
2788 t2_SSE1 = _mm_mul_ps(diff2_SSE1,
2789 _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
2790 prod_SSE1));
2791 t2_SSE2 = _mm_mul_ps(diff2_SSE2,
2792 _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
2793 prod_SSE2));
2794 t2_SSE3 = _mm_mul_ps(diff2_SSE3,
2795 _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
2796 prod_SSE3));
2798 t3_SSE0 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
2799 t3_SSE1 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
2800 t3_SSE2 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
2801 t3_SSE3 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
2802 t1_SSE0 = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
2803 t1_SSE1 = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
2804 t1_SSE2 = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
2805 t1_SSE3 = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
2806 t4_SSE0 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE0, lij_SSE0));
2807 t4_SSE1 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE1, lij_SSE1));
2808 t4_SSE2 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE2, lij_SSE2));
2809 t4_SSE3 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE3, lij_SSE3));
2810 t4_SSE0 = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
2811 t4_SSE1 = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
2812 t4_SSE2 = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
2813 t4_SSE3 = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
2814 t1_SSE0 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
2815 t1_SSE1 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
2816 t1_SSE2 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
2817 t1_SSE3 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
2819 sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
2820 sum_ai_SSE1 = _mm_add_ps(sum_ai_SSE1, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
2821 sum_ai_SSE2 = _mm_add_ps(sum_ai_SSE2, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
2822 sum_ai_SSE3 = _mm_add_ps(sum_ai_SSE3, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
2824 t1_SSE0 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
2825 _mm_mul_ps(prod_SSE0, lij3_SSE0));
2826 t1_SSE1 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
2827 _mm_mul_ps(prod_SSE1, lij3_SSE1));
2828 t1_SSE2 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
2829 _mm_mul_ps(prod_SSE2, lij3_SSE2));
2830 t1_SSE3 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
2831 _mm_mul_ps(prod_SSE3, lij3_SSE3));
2832 t1_SSE0 = _mm_sub_ps(t1_SSE0,
2833 _mm_mul_ps(onefourth_SSE,
2834 _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
2835 _mm_mul_ps(lij3_SSE0, dr_SSE0))));
2836 t1_SSE1 = _mm_sub_ps(t1_SSE1,
2837 _mm_mul_ps(onefourth_SSE,
2838 _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
2839 _mm_mul_ps(lij3_SSE1, dr_SSE1))));
2840 t1_SSE2 = _mm_sub_ps(t1_SSE2,
2841 _mm_mul_ps(onefourth_SSE,
2842 _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
2843 _mm_mul_ps(lij3_SSE2, dr_SSE2))));
2844 t1_SSE3 = _mm_sub_ps(t1_SSE3,
2845 _mm_mul_ps(onefourth_SSE,
2846 _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
2847 _mm_mul_ps(lij3_SSE3, dr_SSE3))));
2849 t2_SSE0 = _mm_mul_ps(onefourth_SSE,
2850 _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
2851 _mm_mul_ps(uij3_SSE0, dr_SSE0)));
2852 t2_SSE1 = _mm_mul_ps(onefourth_SSE,
2853 _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
2854 _mm_mul_ps(uij3_SSE1, dr_SSE1)));
2855 t2_SSE2 = _mm_mul_ps(onefourth_SSE,
2856 _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
2857 _mm_mul_ps(uij3_SSE2, dr_SSE2)));
2858 t2_SSE3 = _mm_mul_ps(onefourth_SSE,
2859 _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
2860 _mm_mul_ps(uij3_SSE3, dr_SSE3)));
2861 t2_SSE0 = _mm_sub_ps(t2_SSE0,
2862 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
2863 _mm_mul_ps(prod_SSE0, uij3_SSE0)));
2864 t2_SSE1 = _mm_sub_ps(t2_SSE1,
2865 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
2866 _mm_mul_ps(prod_SSE1, uij3_SSE1)));
2867 t2_SSE2 = _mm_sub_ps(t2_SSE2,
2868 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
2869 _mm_mul_ps(prod_SSE2, uij3_SSE2)));
2870 t2_SSE3 = _mm_sub_ps(t2_SSE3,
2871 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
2872 _mm_mul_ps(prod_SSE3, uij3_SSE3)));
2873 t3_SSE0 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
2874 _mm_mul_ps(rinv_SSE0, rinv_SSE0));
2875 t3_SSE1 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
2876 _mm_mul_ps(rinv_SSE1, rinv_SSE1));
2877 t3_SSE2 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
2878 _mm_mul_ps(rinv_SSE2, rinv_SSE2));
2879 t3_SSE3 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
2880 _mm_mul_ps(rinv_SSE3, rinv_SSE3));
2881 t3_SSE0 = _mm_sub_ps(t3_SSE0,
2882 _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
2883 _mm_add_ps(one_SSE,
2884 _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
2885 t3_SSE1 = _mm_sub_ps(t3_SSE1,
2886 _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
2887 _mm_add_ps(one_SSE,
2888 _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
2889 t3_SSE2 = _mm_sub_ps(t3_SSE2,
2890 _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
2891 _mm_add_ps(one_SSE,
2892 _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
2893 t3_SSE3 = _mm_sub_ps(t3_SSE3,
2894 _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
2895 _mm_add_ps(one_SSE,
2896 _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
2898 t1_SSE0 = _mm_mul_ps(rinv_SSE0,
2899 _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
2900 _mm_add_ps(t2_SSE0, t3_SSE0)));
2901 t1_SSE1 = _mm_mul_ps(rinv_SSE1,
2902 _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
2903 _mm_add_ps(t2_SSE1, t3_SSE1)));
2904 t1_SSE2 = _mm_mul_ps(rinv_SSE2,
2905 _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
2906 _mm_add_ps(t2_SSE2, t3_SSE2)));
2907 t1_SSE3 = _mm_mul_ps(rinv_SSE3,
2908 _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
2909 _mm_add_ps(t2_SSE3, t3_SSE3)));
2911 _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
2912 dadx += 4;
2913 _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
2914 dadx += 4;
2915 _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
2916 dadx += 4;
2917 _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
2918 dadx += 4;
2920 /* Evaluate influence of atom ai -> aj */
2921 t1_SSE0 = _mm_add_ps(dr_SSE0, sk_ai_SSE0);
2922 t1_SSE1 = _mm_add_ps(dr_SSE1, sk_ai_SSE1);
2923 t1_SSE2 = _mm_add_ps(dr_SSE2, sk_ai_SSE2);
2924 t1_SSE3 = _mm_add_ps(dr_SSE3, sk_ai_SSE3);
2925 t2_SSE0 = _mm_sub_ps(dr_SSE0, sk_ai_SSE0);
2926 t2_SSE1 = _mm_sub_ps(dr_SSE1, sk_ai_SSE1);
2927 t2_SSE2 = _mm_sub_ps(dr_SSE2, sk_ai_SSE2);
2928 t2_SSE3 = _mm_sub_ps(dr_SSE3, sk_ai_SSE3);
2929 t3_SSE0 = _mm_sub_ps(sk_ai_SSE0, dr_SSE0);
2930 t3_SSE1 = _mm_sub_ps(sk_ai_SSE1, dr_SSE1);
2931 t3_SSE2 = _mm_sub_ps(sk_ai_SSE2, dr_SSE2);
2932 t3_SSE3 = _mm_sub_ps(sk_ai_SSE3, dr_SSE3);
2934 obc_mask1_SSE0 = _mm_cmplt_ps(raj_SSE, t1_SSE0);
2935 obc_mask1_SSE1 = _mm_cmplt_ps(raj_SSE, t1_SSE1);
2936 obc_mask1_SSE2 = _mm_cmplt_ps(raj_SSE, t1_SSE2);
2937 obc_mask1_SSE3 = _mm_cmplt_ps(raj_SSE, t1_SSE3);
2938 obc_mask2_SSE0 = _mm_cmplt_ps(raj_SSE, t2_SSE0);
2939 obc_mask2_SSE1 = _mm_cmplt_ps(raj_SSE, t2_SSE1);
2940 obc_mask2_SSE2 = _mm_cmplt_ps(raj_SSE, t2_SSE2);
2941 obc_mask2_SSE3 = _mm_cmplt_ps(raj_SSE, t2_SSE3);
2942 obc_mask3_SSE0 = _mm_cmplt_ps(raj_SSE, t3_SSE0);
2943 obc_mask3_SSE1 = _mm_cmplt_ps(raj_SSE, t3_SSE1);
2944 obc_mask3_SSE2 = _mm_cmplt_ps(raj_SSE, t3_SSE2);
2945 obc_mask3_SSE3 = _mm_cmplt_ps(raj_SSE, t3_SSE3);
2946 obc_mask1_SSE0 = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0);
2947 obc_mask1_SSE1 = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1);
2948 obc_mask1_SSE2 = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2);
2949 obc_mask1_SSE3 = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3);
2951 uij_SSE0 = gmx_mm_inv_ps(t1_SSE0);
2952 uij_SSE1 = gmx_mm_inv_ps(t1_SSE1);
2953 uij_SSE2 = gmx_mm_inv_ps(t1_SSE2);
2954 uij_SSE3 = gmx_mm_inv_ps(t1_SSE3);
2955 lij_SSE0 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
2956 _mm_andnot_ps(obc_mask2_SSE0, raj_inv_SSE));
2957 lij_SSE1 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
2958 _mm_andnot_ps(obc_mask2_SSE1, raj_inv_SSE));
2959 lij_SSE2 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
2960 _mm_andnot_ps(obc_mask2_SSE2, raj_inv_SSE));
2961 lij_SSE3 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
2962 _mm_andnot_ps(obc_mask2_SSE3, raj_inv_SSE));
2963 dlij_SSE0 = _mm_and_ps(one_SSE, obc_mask2_SSE0);
2964 dlij_SSE1 = _mm_and_ps(one_SSE, obc_mask2_SSE1);
2965 dlij_SSE2 = _mm_and_ps(one_SSE, obc_mask2_SSE2);
2966 dlij_SSE3 = _mm_and_ps(one_SSE, obc_mask2_SSE3);
2968 uij2_SSE0 = _mm_mul_ps(uij_SSE0, uij_SSE0);
2969 uij2_SSE1 = _mm_mul_ps(uij_SSE1, uij_SSE1);
2970 uij2_SSE2 = _mm_mul_ps(uij_SSE2, uij_SSE2);
2971 uij2_SSE3 = _mm_mul_ps(uij_SSE3, uij_SSE3);
2972 uij3_SSE0 = _mm_mul_ps(uij2_SSE0, uij_SSE0);
2973 uij3_SSE1 = _mm_mul_ps(uij2_SSE1, uij_SSE1);
2974 uij3_SSE2 = _mm_mul_ps(uij2_SSE2, uij_SSE2);
2975 uij3_SSE3 = _mm_mul_ps(uij2_SSE3, uij_SSE3);
2976 lij2_SSE0 = _mm_mul_ps(lij_SSE0, lij_SSE0);
2977 lij2_SSE1 = _mm_mul_ps(lij_SSE1, lij_SSE1);
2978 lij2_SSE2 = _mm_mul_ps(lij_SSE2, lij_SSE2);
2979 lij2_SSE3 = _mm_mul_ps(lij_SSE3, lij_SSE3);
2980 lij3_SSE0 = _mm_mul_ps(lij2_SSE0, lij_SSE0);
2981 lij3_SSE1 = _mm_mul_ps(lij2_SSE1, lij_SSE1);
2982 lij3_SSE2 = _mm_mul_ps(lij2_SSE2, lij_SSE2);
2983 lij3_SSE3 = _mm_mul_ps(lij2_SSE3, lij_SSE3);
2985 diff2_SSE0 = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
2986 diff2_SSE1 = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
2987 diff2_SSE2 = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
2988 diff2_SSE3 = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
2989 lij_inv_SSE0 = gmx_mm_invsqrt_ps(lij2_SSE0);
2990 lij_inv_SSE1 = gmx_mm_invsqrt_ps(lij2_SSE1);
2991 lij_inv_SSE2 = gmx_mm_invsqrt_ps(lij2_SSE2);
2992 lij_inv_SSE3 = gmx_mm_invsqrt_ps(lij2_SSE3);
2993 sk2_rinv_SSE0 = _mm_mul_ps(sk2_ai_SSE0, rinv_SSE0);
2994 sk2_rinv_SSE1 = _mm_mul_ps(sk2_ai_SSE1, rinv_SSE1);
2995 sk2_rinv_SSE2 = _mm_mul_ps(sk2_ai_SSE2, rinv_SSE2);
2996 sk2_rinv_SSE3 = _mm_mul_ps(sk2_ai_SSE3, rinv_SSE3);
2997 prod_SSE0 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
2998 prod_SSE1 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
2999 prod_SSE2 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
3000 prod_SSE3 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
3002 logterm_SSE0 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
3003 logterm_SSE1 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
3004 logterm_SSE2 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
3005 logterm_SSE3 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
3006 t1_SSE0 = _mm_sub_ps(lij_SSE0, uij_SSE0);
3007 t1_SSE1 = _mm_sub_ps(lij_SSE1, uij_SSE1);
3008 t1_SSE2 = _mm_sub_ps(lij_SSE2, uij_SSE2);
3009 t1_SSE3 = _mm_sub_ps(lij_SSE3, uij_SSE3);
3010 t2_SSE0 = _mm_mul_ps(diff2_SSE0,
3011 _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
3012 prod_SSE0));
3013 t2_SSE1 = _mm_mul_ps(diff2_SSE1,
3014 _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
3015 prod_SSE1));
3016 t2_SSE2 = _mm_mul_ps(diff2_SSE2,
3017 _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
3018 prod_SSE2));
3019 t2_SSE3 = _mm_mul_ps(diff2_SSE3,
3020 _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
3021 prod_SSE3));
3022 t3_SSE0 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
3023 t3_SSE1 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
3024 t3_SSE2 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
3025 t3_SSE3 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
3026 t1_SSE0 = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
3027 t1_SSE1 = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
3028 t1_SSE2 = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
3029 t1_SSE3 = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
3030 t4_SSE0 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE0));
3031 t4_SSE1 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE1));
3032 t4_SSE2 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE2));
3033 t4_SSE3 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE3));
3034 t4_SSE0 = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
3035 t4_SSE1 = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
3036 t4_SSE2 = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
3037 t4_SSE3 = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
3038 t1_SSE0 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
3039 t1_SSE1 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
3040 t1_SSE2 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
3041 t1_SSE3 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
3043 _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
3044 gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0, obc_mask1_SSE0),
3045 _mm_and_ps(t1_SSE1, obc_mask1_SSE1),
3046 _mm_and_ps(t1_SSE2, obc_mask1_SSE2),
3047 _mm_and_ps(t1_SSE3, obc_mask1_SSE3))));
3049 t1_SSE0 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
3050 _mm_mul_ps(prod_SSE0, lij3_SSE0));
3051 t1_SSE1 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
3052 _mm_mul_ps(prod_SSE1, lij3_SSE1));
3053 t1_SSE2 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
3054 _mm_mul_ps(prod_SSE2, lij3_SSE2));
3055 t1_SSE3 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
3056 _mm_mul_ps(prod_SSE3, lij3_SSE3));
3057 t1_SSE0 = _mm_sub_ps(t1_SSE0,
3058 _mm_mul_ps(onefourth_SSE,
3059 _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
3060 _mm_mul_ps(lij3_SSE0, dr_SSE0))));
3061 t1_SSE1 = _mm_sub_ps(t1_SSE1,
3062 _mm_mul_ps(onefourth_SSE,
3063 _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
3064 _mm_mul_ps(lij3_SSE1, dr_SSE1))));
3065 t1_SSE2 = _mm_sub_ps(t1_SSE2,
3066 _mm_mul_ps(onefourth_SSE,
3067 _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
3068 _mm_mul_ps(lij3_SSE2, dr_SSE2))));
3069 t1_SSE3 = _mm_sub_ps(t1_SSE3,
3070 _mm_mul_ps(onefourth_SSE,
3071 _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
3072 _mm_mul_ps(lij3_SSE3, dr_SSE3))));
3073 t2_SSE0 = _mm_mul_ps(onefourth_SSE,
3074 _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
3075 _mm_mul_ps(uij3_SSE0, dr_SSE0)));
3076 t2_SSE1 = _mm_mul_ps(onefourth_SSE,
3077 _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
3078 _mm_mul_ps(uij3_SSE1, dr_SSE1)));
3079 t2_SSE2 = _mm_mul_ps(onefourth_SSE,
3080 _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
3081 _mm_mul_ps(uij3_SSE2, dr_SSE2)));
3082 t2_SSE3 = _mm_mul_ps(onefourth_SSE,
3083 _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
3084 _mm_mul_ps(uij3_SSE3, dr_SSE3)));
3085 t2_SSE0 = _mm_sub_ps(t2_SSE0,
3086 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
3087 _mm_mul_ps(prod_SSE0, uij3_SSE0)));
3088 t2_SSE1 = _mm_sub_ps(t2_SSE1,
3089 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
3090 _mm_mul_ps(prod_SSE1, uij3_SSE1)));
3091 t2_SSE2 = _mm_sub_ps(t2_SSE2,
3092 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
3093 _mm_mul_ps(prod_SSE2, uij3_SSE2)));
3094 t2_SSE3 = _mm_sub_ps(t2_SSE3,
3095 _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
3096 _mm_mul_ps(prod_SSE3, uij3_SSE3)));
3098 t3_SSE0 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
3099 _mm_mul_ps(rinv_SSE0, rinv_SSE0));
3100 t3_SSE1 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
3101 _mm_mul_ps(rinv_SSE1, rinv_SSE1));
3102 t3_SSE2 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
3103 _mm_mul_ps(rinv_SSE2, rinv_SSE2));
3104 t3_SSE3 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
3105 _mm_mul_ps(rinv_SSE3, rinv_SSE3));
3107 t3_SSE0 = _mm_sub_ps(t3_SSE0,
3108 _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
3109 _mm_add_ps(one_SSE,
3110 _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
3111 t3_SSE1 = _mm_sub_ps(t3_SSE1,
3112 _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
3113 _mm_add_ps(one_SSE,
3114 _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
3115 t3_SSE2 = _mm_sub_ps(t3_SSE2,
3116 _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
3117 _mm_add_ps(one_SSE,
3118 _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
3119 t3_SSE3 = _mm_sub_ps(t3_SSE3,
3120 _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
3121 _mm_add_ps(one_SSE,
3122 _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
3125 t1_SSE0 = _mm_mul_ps(rinv_SSE0,
3126 _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
3127 _mm_add_ps(t2_SSE0, t3_SSE0)));
3128 t1_SSE1 = _mm_mul_ps(rinv_SSE1,
3129 _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
3130 _mm_add_ps(t2_SSE1, t3_SSE1)));
3131 t1_SSE2 = _mm_mul_ps(rinv_SSE2,
3132 _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
3133 _mm_add_ps(t2_SSE2, t3_SSE2)));
3134 t1_SSE3 = _mm_mul_ps(rinv_SSE3,
3135 _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
3136 _mm_add_ps(t2_SSE3, t3_SSE3)));
3138 _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
3139 dadx += 4;
3140 _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
3141 dadx += 4;
3142 _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
3143 dadx += 4;
3144 _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
3145 dadx += 4;
3146 }
3147 _MM_TRANSPOSE4_PS(sum_ai_SSE0, sum_ai_SSE1, sum_ai_SSE2, sum_ai_SSE3);
3148 sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0, sum_ai_SSE1);
3149 sum_ai_SSE2 = _mm_add_ps(sum_ai_SSE2, sum_ai_SSE3);
3150 sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0, sum_ai_SSE2);
3151 _mm_store_ps(work+i, _mm_add_ps(sum_ai_SSE0, _mm_load_ps(work+i)));
3152 }
3155 for (i = 0; i < natoms/2+1; i++)
3156 {
3157 work[i] += work[natoms+i];
3158 }
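/* work[] has 2*natoms entries because j runs over a wrapped copy of the atom
 * list; only the first natoms/2+1 slots can have received wrapped
 * contributions (the j offset never exceeds natoms/2), hence this fold. */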
3160 /* Parallel summations would go here if ever implemented with DD */
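/* The loops below turn the accumulated sums in work[] into Born radii: for
 * HCT the radius is 1/(1/rai' - work[i]), clamped from below at the minimum
 * radius, while OBC first damps the sum through the polynomial-in-tanh form
 * tanh(alpha*psi - beta*psi^2 + gamma*psi^3) before inverting. */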
3162 if (gb_algorithm == egbHCT)
3163 {
3164 /* HCT */
3165 for (i = 0; i < natoms; i++)
3166 {
3167 if (born->use[i] != 0)
3168 {
3169 rai = top->atomtypes.gb_radius[mdatoms->typeA[i]]-born->gb_doffset;
3170 sum_ai = 1.0/rai - work[i];
3171 min_rad = rai + born->gb_doffset;
3172 rad = 1.0/sum_ai;
3174 born->bRad[i] = rad > min_rad ? rad : min_rad;
3175 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
3176 }
3177 }
3178 }
3180 else
3181 {
3182 /* OBC */
3184 /* Calculate the radii */
3185 for (i = 0; i < natoms; i++)
3186 {
3188 if (born->use[i] != 0)
3189 {
3190 rai = top->atomtypes.gb_radius[mdatoms->typeA[i]];
3191 rai_inv2 = 1.0/rai;
3192 rai = rai-born->gb_doffset;
3193 rai_inv = 1.0/rai;
3194 sum_ai = rai * work[i];
3195 sum_ai2 = sum_ai * sum_ai;
3196 sum_ai3 = sum_ai2 * sum_ai;
3198 tsum = tanh(born->obc_alpha*sum_ai-born->obc_beta*sum_ai2+born->obc_gamma*sum_ai3);
3199 born->bRad[i] = rai_inv - tsum*rai_inv2;
3200 born->bRad[i] = 1.0 / born->bRad[i];
3202 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
3204 tchain = rai * (born->obc_alpha-2*born->obc_beta*sum_ai+3*born->obc_gamma*sum_ai2);
3205 born->drobc[i] = (1.0-tsum*tsum)*tchain*rai_inv2;
3206 }
3207 }
3208 }
3210 return 0;
3211 }
3220 int
3221 genborn_allvsall_calc_chainrule_sse2_single(t_forcerec * fr,
3222 t_mdatoms * mdatoms,
3223 gmx_genborn_t * born,
3224 real * x,
3225 real * f,
3226 int gb_algorithm,
3227 void * paadata)
3228 {
3229 gmx_allvsallgb2_data_t *aadata;
3230 int natoms;
3231 int ni0, ni1;
3232 int nj0, nj1, nj2, nj3;
3233 int i, j, k, n, idx;
3234 int * jindex;
3235 real * x_align;
3236 real * y_align;
3237 real * z_align;
3238 real * fx_align;
3239 real * fy_align;
3240 real * fz_align;
3241 real * rb;
3242 real * dadx;
3245 real rbai, rbaj, fgb, fgb_ai, rbi;
    __m128                  jmask_SSE0, jmask_SSE1, jmask_SSE2, jmask_SSE3;
    __m128                  ix_SSE0, iy_SSE0, iz_SSE0;
    __m128                  ix_SSE1, iy_SSE1, iz_SSE1;
    __m128                  ix_SSE2, iy_SSE2, iz_SSE2;
    __m128                  ix_SSE3, iy_SSE3, iz_SSE3;
    __m128                  fix_SSE0, fiy_SSE0, fiz_SSE0;
    __m128                  fix_SSE1, fiy_SSE1, fiz_SSE1;
    __m128                  fix_SSE2, fiy_SSE2, fiz_SSE2;
    __m128                  fix_SSE3, fiy_SSE3, fiz_SSE3;
    __m128                  rbai_SSE0, rbai_SSE1, rbai_SSE2, rbai_SSE3;
    __m128                  imask_SSE0, imask_SSE1, imask_SSE2, imask_SSE3;
    __m128                  jx_SSE, jy_SSE, jz_SSE, rbaj_SSE;
    __m128                  dx_SSE0, dy_SSE0, dz_SSE0;
    __m128                  dx_SSE1, dy_SSE1, dz_SSE1;
    __m128                  dx_SSE2, dy_SSE2, dz_SSE2;
    __m128                  dx_SSE3, dy_SSE3, dz_SSE3;
    __m128                  fgb_SSE0, fgb_ai_SSE0;
    __m128                  fgb_SSE1, fgb_ai_SSE1;
    __m128                  fgb_SSE2, fgb_ai_SSE2;
    __m128                  fgb_SSE3, fgb_ai_SSE3;
    __m128                  tx_SSE0, ty_SSE0, tz_SSE0;
    __m128                  tx_SSE1, ty_SSE1, tz_SSE1;
    __m128                  tx_SSE2, ty_SSE2, tz_SSE2;
    __m128                  tx_SSE3, ty_SSE3, tz_SSE3;
    natoms = mdatoms->nr;
    ni0    = 0;
    ni1    = mdatoms->homenr;

    aadata = (gmx_allvsallgb2_data_t *)paadata;

    x_align  = aadata->x_align;
    y_align  = aadata->y_align;
    z_align  = aadata->z_align;
    fx_align = aadata->fx_align;
    fy_align = aadata->fy_align;
    fz_align = aadata->fz_align;

    jindex = aadata->jindex_gb;
    dadx   = fr->dadx;
    rb     = aadata->work;
    /* Loop to get the proper form for the Born radius term */
    if (gb_algorithm == egbSTILL)
    {
        for (i = 0; i < natoms; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = (2 * rbi * rbi * fr->dvda[i])/ONE_4PI_EPS0;
        }
    }
    else if (gb_algorithm == egbHCT)
    {
        for (i = 0; i < natoms; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * fr->dvda[i];
        }
    }
    else if (gb_algorithm == egbOBC)
    {
        for (idx = 0; idx < natoms; idx++)
        {
            rbi     = born->bRad[idx];
            rb[idx] = rbi * rbi * born->drobc[idx] * fr->dvda[idx];
        }
    }
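    /*
     * Interpretive note, not from the original comments: in all three cases
     * rb[i] is dV/d(bRad_i) recast as a prefactor on the stored d(bRad)/dr
     * derivatives. Since the Born radius is an inverse of the integral term,
     * d(bRad)/d(integral) scales as bRad^2; the extra drobc factor carries
     * the tanh chain rule for OBC, and the Still form divides out the
     * ONE_4PI_EPS0 charge scaling present in dvda.
     */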
    for (i = 0; i < 2*natoms; i++)
    {
        fx_align[i] = 0;
        fy_align[i] = 0;
        fz_align[i] = 0;
    }
    for (i = 0; i < natoms; i++)
    {
        rb[i+natoms] = rb[i];
    }
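    /*
     * The aligned arrays carry a mirrored copy in [natoms, 2*natoms) so the
     * wrap-around all-vs-all inner loop can run past the last atom without
     * branching; rb is duplicated here, and the two halves of the force
     * arrays are summed back into f[] at the end of this routine.
     */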
    for (i = ni0; i < ni1; i += UNROLLI)
    {
        /* We assume shifts are NOT used for all-vs-all interactions */

        /* Load i atom data */
        ix_SSE0 = _mm_load1_ps(x_align+i);
        iy_SSE0 = _mm_load1_ps(y_align+i);
        iz_SSE0 = _mm_load1_ps(z_align+i);
        ix_SSE1 = _mm_load1_ps(x_align+i+1);
        iy_SSE1 = _mm_load1_ps(y_align+i+1);
        iz_SSE1 = _mm_load1_ps(z_align+i+1);
        ix_SSE2 = _mm_load1_ps(x_align+i+2);
        iy_SSE2 = _mm_load1_ps(y_align+i+2);
        iz_SSE2 = _mm_load1_ps(z_align+i+2);
        ix_SSE3 = _mm_load1_ps(x_align+i+3);
        iy_SSE3 = _mm_load1_ps(y_align+i+3);
        iz_SSE3 = _mm_load1_ps(z_align+i+3);

        fix_SSE0 = _mm_setzero_ps();
        fiy_SSE0 = _mm_setzero_ps();
        fiz_SSE0 = _mm_setzero_ps();
        fix_SSE1 = _mm_setzero_ps();
        fiy_SSE1 = _mm_setzero_ps();
        fiz_SSE1 = _mm_setzero_ps();
        fix_SSE2 = _mm_setzero_ps();
        fiy_SSE2 = _mm_setzero_ps();
        fiz_SSE2 = _mm_setzero_ps();
        fix_SSE3 = _mm_setzero_ps();
        fiy_SSE3 = _mm_setzero_ps();
        fiz_SSE3 = _mm_setzero_ps();

        rbai_SSE0 = _mm_load1_ps(rb+i);
        rbai_SSE1 = _mm_load1_ps(rb+i+1);
        rbai_SSE2 = _mm_load1_ps(rb+i+2);
        rbai_SSE3 = _mm_load1_ps(rb+i+3);
        /* Load limits for loop over neighbors */
        nj0 = jindex[4*i];
        nj3 = jindex[4*i+3];

        /* No masks necessary, since the stored chain rule derivatives will be zero in those cases!
         * (They were AND-ed with the interaction mask when dadx was written in the radii kernel.)
         */
        for (j = nj0; j < nj3; j += UNROLLJ)
        {
            /* load j atom coordinates */
            jx_SSE = _mm_load_ps(x_align+j);
            jy_SSE = _mm_load_ps(y_align+j);
            jz_SSE = _mm_load_ps(z_align+j);

            /* Calculate distance */
            dx_SSE0 = _mm_sub_ps(ix_SSE0, jx_SSE);
            dy_SSE0 = _mm_sub_ps(iy_SSE0, jy_SSE);
            dz_SSE0 = _mm_sub_ps(iz_SSE0, jz_SSE);
            dx_SSE1 = _mm_sub_ps(ix_SSE1, jx_SSE);
            dy_SSE1 = _mm_sub_ps(iy_SSE1, jy_SSE);
            dz_SSE1 = _mm_sub_ps(iz_SSE1, jz_SSE);
            dx_SSE2 = _mm_sub_ps(ix_SSE2, jx_SSE);
            dy_SSE2 = _mm_sub_ps(iy_SSE2, jy_SSE);
            dz_SSE2 = _mm_sub_ps(iz_SSE2, jz_SSE);
            dx_SSE3 = _mm_sub_ps(ix_SSE3, jx_SSE);
            dy_SSE3 = _mm_sub_ps(iy_SSE3, jy_SSE);
            dz_SSE3 = _mm_sub_ps(iz_SSE3, jz_SSE);

            rbaj_SSE = _mm_load_ps(rb+j);
            fgb_SSE0    = _mm_mul_ps(rbai_SSE0, _mm_load_ps(dadx));
            dadx       += 4;
            fgb_SSE1    = _mm_mul_ps(rbai_SSE1, _mm_load_ps(dadx));
            dadx       += 4;
            fgb_SSE2    = _mm_mul_ps(rbai_SSE2, _mm_load_ps(dadx));
            dadx       += 4;
            fgb_SSE3    = _mm_mul_ps(rbai_SSE3, _mm_load_ps(dadx));
            dadx       += 4;

            fgb_ai_SSE0 = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx));
            dadx       += 4;
            fgb_ai_SSE1 = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx));
            dadx       += 4;
            fgb_ai_SSE2 = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx));
            dadx       += 4;
            fgb_ai_SSE3 = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx));
            dadx       += 4;
            /* Total force between ai and aj is the sum of ai->aj and aj->ai */
            fgb_SSE0 = _mm_add_ps(fgb_SSE0, fgb_ai_SSE0);
            fgb_SSE1 = _mm_add_ps(fgb_SSE1, fgb_ai_SSE1);
            fgb_SSE2 = _mm_add_ps(fgb_SSE2, fgb_ai_SSE2);
            fgb_SSE3 = _mm_add_ps(fgb_SSE3, fgb_ai_SSE3);

            /* Calculate temporary vectorial force */
            tx_SSE0 = _mm_mul_ps(fgb_SSE0, dx_SSE0);
            ty_SSE0 = _mm_mul_ps(fgb_SSE0, dy_SSE0);
            tz_SSE0 = _mm_mul_ps(fgb_SSE0, dz_SSE0);
            tx_SSE1 = _mm_mul_ps(fgb_SSE1, dx_SSE1);
            ty_SSE1 = _mm_mul_ps(fgb_SSE1, dy_SSE1);
            tz_SSE1 = _mm_mul_ps(fgb_SSE1, dz_SSE1);
            tx_SSE2 = _mm_mul_ps(fgb_SSE2, dx_SSE2);
            ty_SSE2 = _mm_mul_ps(fgb_SSE2, dy_SSE2);
            tz_SSE2 = _mm_mul_ps(fgb_SSE2, dz_SSE2);
            tx_SSE3 = _mm_mul_ps(fgb_SSE3, dx_SSE3);
            ty_SSE3 = _mm_mul_ps(fgb_SSE3, dy_SSE3);
            tz_SSE3 = _mm_mul_ps(fgb_SSE3, dz_SSE3);

            /* Increment i atom force */
            fix_SSE0 = _mm_add_ps(fix_SSE0, tx_SSE0);
            fiy_SSE0 = _mm_add_ps(fiy_SSE0, ty_SSE0);
            fiz_SSE0 = _mm_add_ps(fiz_SSE0, tz_SSE0);
            fix_SSE1 = _mm_add_ps(fix_SSE1, tx_SSE1);
            fiy_SSE1 = _mm_add_ps(fiy_SSE1, ty_SSE1);
            fiz_SSE1 = _mm_add_ps(fiz_SSE1, tz_SSE1);
            fix_SSE2 = _mm_add_ps(fix_SSE2, tx_SSE2);
            fiy_SSE2 = _mm_add_ps(fiy_SSE2, ty_SSE2);
            fiz_SSE2 = _mm_add_ps(fiz_SSE2, tz_SSE2);
            fix_SSE3 = _mm_add_ps(fix_SSE3, tx_SSE3);
            fiy_SSE3 = _mm_add_ps(fiy_SSE3, ty_SSE3);
            fiz_SSE3 = _mm_add_ps(fiz_SSE3, tz_SSE3);
            /* Decrement j atom force */
            _mm_store_ps(fx_align+j,
                         _mm_sub_ps( _mm_load_ps(fx_align+j), gmx_mm_sum4_ps(tx_SSE0, tx_SSE1, tx_SSE2, tx_SSE3) ));
            _mm_store_ps(fy_align+j,
                         _mm_sub_ps( _mm_load_ps(fy_align+j), gmx_mm_sum4_ps(ty_SSE0, ty_SSE1, ty_SSE2, ty_SSE3) ));
            _mm_store_ps(fz_align+j,
                         _mm_sub_ps( _mm_load_ps(fz_align+j), gmx_mm_sum4_ps(tz_SSE0, tz_SSE1, tz_SSE2, tz_SSE3) ));
        }
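        /*
         * Scalar sketch of one pair handled in the loop above (illustrative
         * only, not compiled code; dadx_ij and dadx_ji name the two stored
         * Born-radius derivatives for the pair):
         *
         *   fgb    = rb[i]*dadx_ij + rb[j]*dadx_ji;
         *   fx[i] += fgb * (x[i] - x[j]);
         *   fx[j] -= fgb * (x[i] - x[j]);
         *   (and likewise for y and z)
         */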
        /* Add i forces to mem and shifted force list */
        _MM_TRANSPOSE4_PS(fix_SSE0, fix_SSE1, fix_SSE2, fix_SSE3);
        fix_SSE0 = _mm_add_ps(fix_SSE0, fix_SSE1);
        fix_SSE2 = _mm_add_ps(fix_SSE2, fix_SSE3);
        fix_SSE0 = _mm_add_ps(fix_SSE0, fix_SSE2);
        _mm_store_ps(fx_align+i, _mm_add_ps(fix_SSE0, _mm_load_ps(fx_align+i)));

        _MM_TRANSPOSE4_PS(fiy_SSE0, fiy_SSE1, fiy_SSE2, fiy_SSE3);
        fiy_SSE0 = _mm_add_ps(fiy_SSE0, fiy_SSE1);
        fiy_SSE2 = _mm_add_ps(fiy_SSE2, fiy_SSE3);
        fiy_SSE0 = _mm_add_ps(fiy_SSE0, fiy_SSE2);
        _mm_store_ps(fy_align+i, _mm_add_ps(fiy_SSE0, _mm_load_ps(fy_align+i)));

        _MM_TRANSPOSE4_PS(fiz_SSE0, fiz_SSE1, fiz_SSE2, fiz_SSE3);
        fiz_SSE0 = _mm_add_ps(fiz_SSE0, fiz_SSE1);
        fiz_SSE2 = _mm_add_ps(fiz_SSE2, fiz_SSE3);
        fiz_SSE0 = _mm_add_ps(fiz_SSE0, fiz_SSE2);
        _mm_store_ps(fz_align+i, _mm_add_ps(fiz_SSE0, _mm_load_ps(fz_align+i)));
    }
    for (i = 0; i < natoms; i++)
    {
        f[3*i]   += fx_align[i] + fx_align[natoms+i];
        f[3*i+1] += fy_align[i] + fy_align[natoms+i];
        f[3*i+2] += fz_align[i] + fz_align[natoms+i];
    }

    return 0;
}
#else

/* dummy variable when not using SSE */
int genborn_allvsall_sse2_single_dummy;

#endif