From: Erik Lindahl
Date: Tue, 7 Jul 2015 15:02:36 +0000 (+0200)
Subject: Cleanup and remove unused SSE2 generalized born code
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?p=alexxy%2Fgromacs.git;a=commitdiff_plain;h=b5c083e561c2786759540227e8905e5456ab7ef6

Cleanup and remove unused SSE2 generalized born code

This code has been disabled for quite a while due to a bug. Since we
should move to Verlet-style kernels anyway, there is no point in
keeping these files around.

Change-Id: Idfd65ac2d0d9f304d548c97e4dbabbaf72df7a7b
---
diff --git a/src/gromacs/mdlib/genborn.c b/src/gromacs/mdlib/genborn.c
index a4f34b8ad5..dcfee25ed2 100644
--- a/src/gromacs/mdlib/genborn.c
+++ b/src/gromacs/mdlib/genborn.c
@@ -3,7 +3,7 @@ * * Copyright (c) 1991-2000, University of Groningen, The Netherlands. * Copyright (c) 2001-2008, The GROMACS development team. - * Copyright (c) 2013,2014, by the GROMACS development team, led by + * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -51,6 +51,7 @@ #include "gromacs/legacyheaders/types/commrec.h" #include "gromacs/math/units.h" #include "gromacs/math/vec.h" +#include "gromacs/mdlib/genborn_allvsall.h" #include "gromacs/pbcutil/ishift.h" #include "gromacs/pbcutil/mshift.h" #include "gromacs/pbcutil/pbc.h" @@ -59,19 +60,6 @@ #include "gromacs/utility/gmxmpi.h" #include "gromacs/utility/smalloc.h" -#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER -# ifdef GMX_DOUBLE -# include "gromacs/mdlib/genborn_allvsall_sse2_double.h" -# include "gromacs/mdlib/genborn_sse2_double.h" -# else -# include "gromacs/mdlib/genborn_allvsall_sse2_single.h" -# include "gromacs/mdlib/genborn_sse2_single.h" -# endif /* GMX_DOUBLE */ -#endif /* SSE or AVX present */ - -#include "gromacs/mdlib/genborn_allvsall.h" - -/*#define DISABLE_SSE*/ typedef struct { int shift; @@ -978,43 +966,13 @@ int calc_gb_rad(t_commrec *cr, t_forcerec *fr, t_inputrec *ir, gmx_localtop_t *t if (ir->gb_algorithm == egbSTILL) { -#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER) - if (fr->use_simd_kernels) - { -# ifdef GMX_DOUBLE - genborn_allvsall_calc_still_radii_sse2_double(fr, md, born, top, x[0], cr, &fr->AllvsAll_workgb); -# else - genborn_allvsall_calc_still_radii_sse2_single(fr, md, born, top, x[0], cr, &fr->AllvsAll_workgb); -# endif - } - else - { - genborn_allvsall_calc_still_radii(fr, md, born, top, x[0], cr, &fr->AllvsAll_workgb); - } -#else genborn_allvsall_calc_still_radii(fr, md, born, top, x[0], &fr->AllvsAll_workgb); -#endif /* 13 flops in outer loop, 47 flops in inner loop */ inc_nrnb(nrnb, eNR_BORN_AVA_RADII_STILL, md->homenr*13+cnt*47); } else if (ir->gb_algorithm == egbHCT || ir->gb_algorithm == egbOBC) { -#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER) - if (fr->use_simd_kernels) - { -# ifdef GMX_DOUBLE - genborn_allvsall_calc_hct_obc_radii_sse2_double(fr, md, born, ir->gb_algorithm, top, x[0], cr, &fr->AllvsAll_workgb); -# else - genborn_allvsall_calc_hct_obc_radii_sse2_single(fr, md, born, ir->gb_algorithm, top, x[0], cr, &fr->AllvsAll_workgb); -# endif - } - else - { - genborn_allvsall_calc_hct_obc_radii(fr, md, born, ir->gb_algorithm, top, x[0], cr, &fr->AllvsAll_workgb); - } -#else genborn_allvsall_calc_hct_obc_radii(fr, md, born, ir->gb_algorithm, top, x[0], &fr->AllvsAll_workgb); -#endif /* 24 flops in outer loop, 183 in inner */ inc_nrnb(nrnb, eNR_BORN_AVA_RADII_HCT_OBC, md->homenr*24+cnt*183); } @@
-1028,45 +986,6 @@ int calc_gb_rad(t_commrec *cr, t_forcerec *fr, t_inputrec *ir, gmx_localtop_t *t /* Switch for determining which algorithm to use for Born radii calculation */ #ifdef GMX_DOUBLE -#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER) - /* x86 or x86-64 with GCC inline assembly and/or SSE intrinsics */ - switch (ir->gb_algorithm) - { - case egbSTILL: - if (fr->use_simd_kernels) - { - calc_gb_rad_still_sse2_double(cr, fr, born->nr, top, atype, x[0], nl, born); - } - else - { - calc_gb_rad_still(cr, fr, top, x, nl, born, md); - } - break; - case egbHCT: - if (fr->use_simd_kernels) - { - calc_gb_rad_hct_obc_sse2_double(cr, fr, born->nr, top, atype, x[0], nl, born, md, ir->gb_algorithm); - } - else - { - calc_gb_rad_hct(cr, fr, top, x, nl, born, md); - } - break; - case egbOBC: - if (fr->use_simd_kernels) - { - calc_gb_rad_hct_obc_sse2_double(cr, fr, born->nr, top, atype, x[0], nl, born, md, ir->gb_algorithm); - } - else - { - calc_gb_rad_obc(cr, fr, born->nr, top, x, nl, born, md); - } - break; - - default: - gmx_fatal(FARGS, "Unknown double precision sse-enabled algorithm for Born radii calculation: %d", ir->gb_algorithm); - } -#else switch (ir->gb_algorithm) { case egbSTILL: @@ -1083,51 +1002,8 @@ int calc_gb_rad(t_commrec *cr, t_forcerec *fr, t_inputrec *ir, gmx_localtop_t *t gmx_fatal(FARGS, "Unknown double precision algorithm for Born radii calculation: %d", ir->gb_algorithm); } -#endif - #else -#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER) - /* x86 or x86-64 with GCC inline assembly and/or SSE intrinsics */ - switch (ir->gb_algorithm) - { - case egbSTILL: - if (fr->use_simd_kernels) - { - calc_gb_rad_still_sse2_single(cr, fr, born->nr, top, x[0], nl, born); - } - else - { - calc_gb_rad_still(cr, fr, top, x, nl, born, md); - } - break; - case egbHCT: - if (fr->use_simd_kernels) - { - calc_gb_rad_hct_obc_sse2_single(cr, fr, born->nr, top, x[0], nl, born, md, ir->gb_algorithm); - } - else - { - calc_gb_rad_hct(cr, fr, top, x, nl, born, md); - } - break; - - case egbOBC: - if (fr->use_simd_kernels) - { - calc_gb_rad_hct_obc_sse2_single(cr, fr, born->nr, top, x[0], nl, born, md, ir->gb_algorithm); - } - else - { - calc_gb_rad_obc(cr, fr, born->nr, top, x, nl, born, md); - } - break; - - default: - gmx_fatal(FARGS, "Unknown sse-enabled algorithm for Born radii calculation: %d", ir->gb_algorithm); - } - -#else switch (ir->gb_algorithm) { case egbSTILL: @@ -1144,8 +1020,6 @@ int calc_gb_rad(t_commrec *cr, t_forcerec *fr, t_inputrec *ir, gmx_localtop_t *t gmx_fatal(FARGS, "Unknown algorithm for Born radii calculation: %d", ir->gb_algorithm); } -#endif /* Single precision sse */ - #endif /* Double or single precision */ if (fr->bAllvsAll == FALSE) @@ -1530,48 +1404,15 @@ calc_gb_forces(t_commrec *cr, t_mdatoms *md, gmx_genborn_t *born, gmx_localtop_t if (fr->bAllvsAll) { -#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER) - if (fr->use_simd_kernels) - { -# ifdef GMX_DOUBLE - genborn_allvsall_calc_chainrule_sse2_double(fr, md, born, x[0], f[0], gb_algorithm, fr->AllvsAll_workgb); -# else - genborn_allvsall_calc_chainrule_sse2_single(fr, md, born, x[0], f[0], gb_algorithm, fr->AllvsAll_workgb); -# endif - } - else - { - genborn_allvsall_calc_chainrule(fr, md, born, x[0], f[0], gb_algorithm, fr->AllvsAll_workgb); - } -#else genborn_allvsall_calc_chainrule(fr, md, born, x[0], f[0], gb_algorithm, fr->AllvsAll_workgb); -#endif cnt = md->homenr*(md->nr/2+1); /* 9 flops for outer loop, 15 for inner */ inc_nrnb(nrnb, eNR_BORN_AVA_CHAINRULE, md->homenr*9+cnt*15); return; } -#if 0 && defined 
(GMX_SIMD_X86_SSE2_OR_HIGHER) - if (fr->use_simd_kernels) - { -# ifdef GMX_DOUBLE - calc_gb_chainrule_sse2_double(fr->natoms_force, &(fr->gblist), fr->dadx, fr->dvda, x[0], - f[0], fr->fshift[0], fr->shift_vec[0], gb_algorithm, born, md); -# else - calc_gb_chainrule_sse2_single(fr->natoms_force, &(fr->gblist), fr->dadx, fr->dvda, x[0], - f[0], fr->fshift[0], fr->shift_vec[0], gb_algorithm, born, md); -# endif - } - else - { - calc_gb_chainrule(fr->natoms_force, &(fr->gblist), fr->dadx, fr->dvda, - x, f, fr->fshift, fr->shift_vec, gb_algorithm, born, md); - } -#else calc_gb_chainrule(fr->natoms_force, &(fr->gblist), fr->dadx, fr->dvda, x, f, fr->fshift, fr->shift_vec, gb_algorithm, born); -#endif if (!fr->bAllvsAll) { diff --git a/src/gromacs/mdlib/genborn_allvsall_sse2_double.c b/src/gromacs/mdlib/genborn_allvsall_sse2_double.c deleted file mode 100644 index 5847525465..0000000000 --- a/src/gromacs/mdlib/genborn_allvsall_sse2_double.c +++ /dev/null @@ -1,2506 +0,0 @@ -/* - * This file is part of the GROMACS molecular simulation package. - * - * Copyright (c) 1991-2000, University of Groningen, The Netherlands. - * Copyright (c) 2001-2009, The GROMACS Development Team. - * Copyright (c) 2012,2014, by the GROMACS development team, led by - * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, - * and including many others, as listed in the AUTHORS file in the - * top-level source directory and at http://www.gromacs.org. - * - * GROMACS is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public License - * as published by the Free Software Foundation; either version 2.1 - * of the License, or (at your option) any later version. - * - * GROMACS is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with GROMACS; if not, see - * http://www.gnu.org/licenses, or write to the Free Software Foundation, - * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - * - * If you want to redistribute modifications to GROMACS, please - * consider that scientific software is very special. Version - * control is crucial - bugs must be traceable. We will be happy to - * consider code for inclusion in the official distribution, but - * derived work must not be called official GROMACS. Details are found - * in the README & COPYING files - if they are missing, get the - * official version at http://www.gromacs.org. - * - * To help us fund GROMACS development, we humbly ask that you cite - * the research papers on the package. Check out http://www.gromacs.org. 
- */ -#include "gmxpre.h" - -#include - -#include "gromacs/legacyheaders/genborn.h" -#include "gromacs/legacyheaders/network.h" -#include "gromacs/legacyheaders/types/simple.h" -#include "gromacs/math/units.h" -#include "gromacs/math/vec.h" -#include "gromacs/mdlib/genborn_allvsall.h" -#include "gromacs/utility/smalloc.h" - - -#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER) - -#include - - -#define SIMD_WIDTH 2 -#define UNROLLI 2 -#define UNROLLJ 2 - - - - - - - - - -typedef struct -{ - int * jindex_gb; - int ** prologue_mask_gb; - int ** epilogue_mask; - int * imask; - double * gb_radius; - double * workparam; - double * work; - double * x_align; - double * y_align; - double * z_align; - double * fx_align; - double * fy_align; - double * fz_align; -} -gmx_allvsallgb2_data_t; - - -static int -calc_maxoffset(int i, int natoms) -{ - int maxoffset; - - if ((natoms % 2) == 1) - { - /* Odd number of atoms, easy */ - maxoffset = natoms/2; - } - else if ((natoms % 4) == 0) - { - /* Multiple of four is hard */ - if (i < natoms/2) - { - if ((i % 2) == 0) - { - maxoffset = natoms/2; - } - else - { - maxoffset = natoms/2-1; - } - } - else - { - if ((i % 2) == 1) - { - maxoffset = natoms/2; - } - else - { - maxoffset = natoms/2-1; - } - } - } - else - { - /* natoms/2 = odd */ - if ((i % 2) == 0) - { - maxoffset = natoms/2; - } - else - { - maxoffset = natoms/2-1; - } - } - - return maxoffset; -} - -static void -setup_gb_exclusions_and_indices(gmx_allvsallgb2_data_t * aadata, - t_ilist * ilist, - int start, - int end, - int natoms, - gmx_bool bInclude12, - gmx_bool bInclude13, - gmx_bool bInclude14) -{ - int i, j, k, tp; - int a1, a2; - int ni0, ni1, nj0, nj1, nj; - int imin, imax, iexcl; - int max_offset; - int max_excl_offset; - int firstinteraction; - int ibase; - int *pi; - - /* This routine can appear to be a bit complex, but it is mostly book-keeping. - * To enable the fast all-vs-all kernel we need to be able to stream through all coordinates - * whether they should interact or not. - * - * To avoid looping over the exclusions, we create a simple mask that is 1 if the interaction - * should be present, otherwise 0. Since exclusions typically only occur when i & j are close, - * we create a jindex array with three elements per i atom: the starting point, the point to - * which we need to check exclusions, and the end point. - * This way we only have to allocate a short exclusion mask per i atom. - */ - - ni0 = (start/UNROLLI)*UNROLLI; - ni1 = ((end+UNROLLI-1)/UNROLLI)*UNROLLI; - - /* Set the interaction mask to only enable the i atoms we want to include */ - snew(pi, 2*(natoms+UNROLLI+2*SIMD_WIDTH)); - aadata->imask = (int *) (((size_t) pi + 16) & (~((size_t) 15))); - for (i = 0; i < natoms+UNROLLI; i++) - { - aadata->imask[2*i] = (i >= start && i < end) ? 0xFFFFFFFF : 0; - aadata->imask[2*i+1] = (i >= start && i < end) ? 0xFFFFFFFF : 0; - } - - /* Allocate memory for our modified jindex array */ - snew(aadata->jindex_gb, 4*(natoms+UNROLLI)); - for (i = 0; i < 4*(natoms+UNROLLI); i++) - { - aadata->jindex_gb[i] = 0; - } - - /* Create the exclusion masks for the prologue part */ - snew(aadata->prologue_mask_gb, natoms+UNROLLI); /* list of pointers */ - - /* First zero everything to avoid uninitialized data */ - for (i = 0; i < natoms+UNROLLI; i++) - { - aadata->prologue_mask_gb[i] = NULL; - } - - /* Calculate the largest exclusion range we need for each UNROLLI-tuplet of i atoms. 
*/ - for (ibase = ni0; ibase < ni1; ibase += UNROLLI) - { - max_excl_offset = -1; - - /* First find maxoffset for the next 4 atoms (or fewer if we are close to end) */ - imax = ((ibase+UNROLLI) < end) ? (ibase+UNROLLI) : end; - - /* Which atom is the first we (might) interact with? */ - imin = natoms; /* Guaranteed to be overwritten by one of 'firstinteraction' */ - for (i = ibase; i < imax; i++) - { - /* Before exclusions, which atom is the first we (might) interact with? */ - firstinteraction = i+1; - max_offset = calc_maxoffset(i, natoms); - - if (!bInclude12) - { - for (j = 0; j < ilist[F_GB12].nr; j += 3) - { - a1 = ilist[F_GB12].iatoms[j+1]; - a2 = ilist[F_GB12].iatoms[j+2]; - - if (a1 == i) - { - k = a2; - } - else if (a2 == i) - { - k = a1; - } - else - { - continue; - } - - if (k == firstinteraction) - { - firstinteraction++; - } - } - } - if (!bInclude13) - { - for (j = 0; j < ilist[F_GB13].nr; j += 3) - { - a1 = ilist[F_GB13].iatoms[j+1]; - a2 = ilist[F_GB13].iatoms[j+2]; - - if (a1 == i) - { - k = a2; - } - else if (a2 == i) - { - k = a1; - } - else - { - continue; - } - - if (k == firstinteraction) - { - firstinteraction++; - } - } - } - if (!bInclude14) - { - for (j = 0; j < ilist[F_GB14].nr; j += 3) - { - a1 = ilist[F_GB14].iatoms[j+1]; - a2 = ilist[F_GB14].iatoms[j+2]; - if (a1 == i) - { - k = a2; - } - else if (a2 == i) - { - k = a1; - } - else - { - continue; - } - - if (k == firstinteraction) - { - firstinteraction++; - } - } - } - imin = (firstinteraction < imin) ? firstinteraction : imin; - } - /* round down to j unrolling factor */ - imin = (imin/UNROLLJ)*UNROLLJ; - - for (i = ibase; i < imax; i++) - { - max_offset = calc_maxoffset(i, natoms); - - if (!bInclude12) - { - for (j = 0; j < ilist[F_GB12].nr; j += 3) - { - a1 = ilist[F_GB12].iatoms[j+1]; - a2 = ilist[F_GB12].iatoms[j+2]; - - if (a1 == i) - { - k = a2; - } - else if (a2 == i) - { - k = a1; - } - else - { - continue; - } - - if (k < imin) - { - k += natoms; - } - - if (k > i+max_offset) - { - continue; - } - - k = k - imin; - - if (k+natoms <= max_offset) - { - k += natoms; - } - max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset; - } - } - if (!bInclude13) - { - for (j = 0; j < ilist[F_GB13].nr; j += 3) - { - a1 = ilist[F_GB13].iatoms[j+1]; - a2 = ilist[F_GB13].iatoms[j+2]; - - if (a1 == i) - { - k = a2; - } - else if (a2 == i) - { - k = a1; - } - else - { - continue; - } - - if (k < imin) - { - k += natoms; - } - - if (k > i+max_offset) - { - continue; - } - - k = k - imin; - - if (k+natoms <= max_offset) - { - k += natoms; - } - max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset; - } - } - if (!bInclude14) - { - for (j = 0; j < ilist[F_GB14].nr; j += 3) - { - a1 = ilist[F_GB14].iatoms[j+1]; - a2 = ilist[F_GB14].iatoms[j+2]; - - if (a1 == i) - { - k = a2; - } - else if (a2 == i) - { - k = a1; - } - else - { - continue; - } - - if (k < imin) - { - k += natoms; - } - - if (k > i+max_offset) - { - continue; - } - - k = k - imin; - - if (k+natoms <= max_offset) - { - k += natoms; - } - max_excl_offset = (k > max_excl_offset) ? 
k : max_excl_offset; - } - } - } - - /* The offset specifies the last atom to be excluded, so add one unit to get an upper loop limit */ - max_excl_offset++; - /* round up to j unrolling factor */ - max_excl_offset = (max_excl_offset/UNROLLJ+1)*UNROLLJ; - - /* Set all the prologue masks length to this value (even for i>end) */ - for (i = ibase; i < ibase+UNROLLI; i++) - { - aadata->jindex_gb[4*i] = imin; - aadata->jindex_gb[4*i+1] = imin+max_excl_offset; - } - } - - /* Now the hard part, loop over it all again to calculate the actual contents of the prologue masks */ - for (ibase = ni0; ibase < ni1; ibase += UNROLLI) - { - for (i = ibase; i < ibase+UNROLLI; i++) - { - nj = aadata->jindex_gb[4*i+1] - aadata->jindex_gb[4*i]; - imin = aadata->jindex_gb[4*i]; - - /* Allocate aligned memory */ - snew(pi, 2*(nj+2*SIMD_WIDTH)); - aadata->prologue_mask_gb[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15))); - - max_offset = calc_maxoffset(i, natoms); - - /* Include interactions i+1 <= j < i+maxoffset */ - for (k = 0; k < nj; k++) - { - j = imin + k; - - if ( (j > i) && (j <= i+max_offset) ) - { - aadata->prologue_mask_gb[i][2*k] = 0xFFFFFFFF; - aadata->prologue_mask_gb[i][2*k+1] = 0xFFFFFFFF; - } - else - { - aadata->prologue_mask_gb[i][2*k] = 0; - aadata->prologue_mask_gb[i][2*k+1] = 0; - } - } - - /* Clear out the explicit exclusions */ - if (i < end) - { - if (!bInclude12) - { - for (j = 0; j < ilist[F_GB12].nr; j += 3) - { - a1 = ilist[F_GB12].iatoms[j+1]; - a2 = ilist[F_GB12].iatoms[j+2]; - - if (a1 == i) - { - k = a2; - } - else if (a2 == i) - { - k = a1; - } - else - { - continue; - } - - if (k > i+max_offset) - { - continue; - } - k = k-i; - - if (k+natoms <= max_offset) - { - k += natoms; - } - - k = k+i-imin; - if (k >= 0) - { - aadata->prologue_mask_gb[i][2*k] = 0; - aadata->prologue_mask_gb[i][2*k+1] = 0; - } - } - } - if (!bInclude13) - { - for (j = 0; j < ilist[F_GB13].nr; j += 3) - { - a1 = ilist[F_GB13].iatoms[j+1]; - a2 = ilist[F_GB13].iatoms[j+2]; - - if (a1 == i) - { - k = a2; - } - else if (a2 == i) - { - k = a1; - } - else - { - continue; - } - - if (k > i+max_offset) - { - continue; - } - k = k-i; - - if (k+natoms <= max_offset) - { - k += natoms; - } - - k = k+i-imin; - if (k >= 0) - { - aadata->prologue_mask_gb[i][2*k] = 0; - aadata->prologue_mask_gb[i][2*k+1] = 0; - } - } - } - if (!bInclude14) - { - for (j = 0; j < ilist[F_GB14].nr; j += 3) - { - a1 = ilist[F_GB14].iatoms[j+1]; - a2 = ilist[F_GB14].iatoms[j+2]; - - if (a1 == i) - { - k = a2; - } - else if (a2 == i) - { - k = a1; - } - else - { - continue; - } - - if (k > i+max_offset) - { - continue; - } - k = k-i; - - if (k+natoms <= max_offset) - { - k += natoms; - } - - k = k+i-imin; - if (k >= 0) - { - aadata->prologue_mask_gb[i][2*k] = 0; - aadata->prologue_mask_gb[i][2*k+1] = 0; - } - } - } - } - } - } - - /* Construct the epilogue mask - this just contains the check for maxoffset */ - snew(aadata->epilogue_mask, natoms+UNROLLI); - - /* First zero everything to avoid uninitialized data */ - for (i = 0; i < natoms+UNROLLI; i++) - { - aadata->jindex_gb[4*i+2] = aadata->jindex_gb[4*i+1]; - aadata->jindex_gb[4*i+3] = aadata->jindex_gb[4*i+1]; - aadata->epilogue_mask[i] = NULL; - } - - for (ibase = ni0; ibase < ni1; ibase += UNROLLI) - { - /* Find the lowest index for which we need to use the epilogue */ - imin = ibase; - max_offset = calc_maxoffset(imin, natoms); - - imin = imin + 1 + max_offset; - - /* Find largest index for which we need to use the epilogue */ - imax = ibase + UNROLLI-1; - imax = (imax < end) ? 
imax : end; - - max_offset = calc_maxoffset(imax, natoms); - imax = imax + 1 + max_offset + UNROLLJ - 1; - - for (i = ibase; i < ibase+UNROLLI; i++) - { - /* Start of epilogue - round down to j tile limit */ - aadata->jindex_gb[4*i+2] = (imin/UNROLLJ)*UNROLLJ; - /* Make sure we dont overlap - for small systems everything is done in the prologue */ - aadata->jindex_gb[4*i+2] = (aadata->jindex_gb[4*i+1] > aadata->jindex_gb[4*i+2]) ? aadata->jindex_gb[4*i+1] : aadata->jindex_gb[4*i+2]; - /* Round upwards to j tile limit */ - aadata->jindex_gb[4*i+3] = (imax/UNROLLJ)*UNROLLJ; - /* Make sure we dont have a negative range for the epilogue */ - aadata->jindex_gb[4*i+3] = (aadata->jindex_gb[4*i+2] > aadata->jindex_gb[4*i+3]) ? aadata->jindex_gb[4*i+2] : aadata->jindex_gb[4*i+3]; - } - } - - /* And fill it with data... */ - - for (ibase = ni0; ibase < ni1; ibase += UNROLLI) - { - for (i = ibase; i < ibase+UNROLLI; i++) - { - - nj = aadata->jindex_gb[4*i+3] - aadata->jindex_gb[4*i+2]; - - /* Allocate aligned memory */ - snew(pi, 2*(nj+2*SIMD_WIDTH)); - aadata->epilogue_mask[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15))); - - max_offset = calc_maxoffset(i, natoms); - - for (k = 0; k < nj; k++) - { - j = aadata->jindex_gb[4*i+2] + k; - aadata->epilogue_mask[i][2*k] = (j <= i+max_offset) ? 0xFFFFFFFF : 0; - aadata->epilogue_mask[i][2*k+1] = (j <= i+max_offset) ? 0xFFFFFFFF : 0; - } - } - } -} - - -static void -genborn_allvsall_setup(gmx_allvsallgb2_data_t ** p_aadata, - gmx_localtop_t * top, - gmx_genborn_t * born, - t_mdatoms * mdatoms, - double radius_offset, - int gb_algorithm, - gmx_bool bInclude12, - gmx_bool bInclude13, - gmx_bool bInclude14) -{ - int i, j, idx; - int natoms; - gmx_allvsallgb2_data_t *aadata; - double *p; - - natoms = mdatoms->nr; - - snew(aadata, 1); - *p_aadata = aadata; - - snew(p, 2*natoms+2*SIMD_WIDTH); - aadata->x_align = (double *) (((size_t) p + 16) & (~((size_t) 15))); - snew(p, 2*natoms+2*SIMD_WIDTH); - aadata->y_align = (double *) (((size_t) p + 16) & (~((size_t) 15))); - snew(p, 2*natoms+2*SIMD_WIDTH); - aadata->z_align = (double *) (((size_t) p + 16) & (~((size_t) 15))); - snew(p, 2*natoms+2*SIMD_WIDTH); - aadata->fx_align = (double *) (((size_t) p + 16) & (~((size_t) 15))); - snew(p, 2*natoms+2*SIMD_WIDTH); - aadata->fy_align = (double *) (((size_t) p + 16) & (~((size_t) 15))); - snew(p, 2*natoms+2*SIMD_WIDTH); - aadata->fz_align = (double *) (((size_t) p + 16) & (~((size_t) 15))); - - snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH); - aadata->gb_radius = (double *) (((size_t) p + 16) & (~((size_t) 15))); - - snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH); - aadata->workparam = (double *) (((size_t) p + 16) & (~((size_t) 15))); - - snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH); - aadata->work = (double *) (((size_t) p + 16) & (~((size_t) 15))); - - for (i = 0; i < mdatoms->nr; i++) - { - aadata->gb_radius[i] = top->atomtypes.gb_radius[mdatoms->typeA[i]] - radius_offset; - if (gb_algorithm == egbSTILL) - { - aadata->workparam[i] = born->vsolv[i]; - } - else if (gb_algorithm == egbOBC) - { - aadata->workparam[i] = born->param[i]; - } - aadata->work[i] = 0.0; - } - for (i = 0; i < mdatoms->nr; i++) - { - aadata->gb_radius[natoms+i] = aadata->gb_radius[i]; - aadata->workparam[natoms+i] = aadata->workparam[i]; - aadata->work[natoms+i] = aadata->work[i]; - } - - for (i = 0; i < 2*natoms+SIMD_WIDTH; i++) - { - aadata->x_align[i] = 0.0; - aadata->y_align[i] = 0.0; - aadata->z_align[i] = 0.0; - aadata->fx_align[i] = 0.0; - aadata->fy_align[i] = 0.0; - aadata->fz_align[i] = 0.0; - } - - 
setup_gb_exclusions_and_indices(aadata, top->idef.il, 0, mdatoms->homenr, mdatoms->nr, - bInclude12, bInclude13, bInclude14); -} - - -/* - * This routine apparently hits a compiler bug visual studio has had 'forever'. - * It is present both in VS2005 and VS2008, and the only way around it is to - * decrease optimization. We do that with at pragma, and only for MSVC, so it - * will not hurt any of the well-behaving and supported compilers out there. - * MS: Fix your compiler, it sucks like a black hole! - */ -#ifdef _MSC_VER -#pragma optimize("t",off) -#endif - -int -genborn_allvsall_calc_still_radii_sse2_double(t_forcerec * fr, - t_mdatoms * mdatoms, - gmx_genborn_t * born, - gmx_localtop_t * top, - double * x, - t_commrec * cr, - void * paadata) -{ - gmx_allvsallgb2_data_t *aadata; - int natoms; - int ni0, ni1; - int nj0, nj1, nj2, nj3; - int i, j, k, n; - int * mask; - int * pmask0; - int * pmask1; - int * emask0; - int * emask1; - double ix, iy, iz; - double jx, jy, jz; - double dx, dy, dz; - double rsq, rinv; - double gpi, rai, vai; - double prod_ai; - double irsq, idr4, idr6; - double raj, rvdw, ratio; - double vaj, ccf, dccf, theta, cosq; - double term, prod, icf4, icf6, gpi2, factor, sinq; - double * gb_radius; - double * vsolv; - double * work; - double tmpsum[2]; - double * x_align; - double * y_align; - double * z_align; - int * jindex; - double * dadx; - - __m128d ix_SSE0, iy_SSE0, iz_SSE0; - __m128d ix_SSE1, iy_SSE1, iz_SSE1; - __m128d gpi_SSE0, rai_SSE0, prod_ai_SSE0; - __m128d gpi_SSE1, rai_SSE1, prod_ai_SSE1; - __m128d imask_SSE0, jmask_SSE0; - __m128d imask_SSE1, jmask_SSE1; - __m128d jx_SSE, jy_SSE, jz_SSE; - __m128d dx_SSE0, dy_SSE0, dz_SSE0; - __m128d dx_SSE1, dy_SSE1, dz_SSE1; - __m128d rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0; - __m128d rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1; - __m128d raj_SSE, vaj_SSE, prod_SSE; - __m128d rvdw_SSE0, ratio_SSE0; - __m128d rvdw_SSE1, ratio_SSE1; - __m128d theta_SSE0, sinq_SSE0, cosq_SSE0, term_SSE0; - __m128d theta_SSE1, sinq_SSE1, cosq_SSE1, term_SSE1; - __m128d ccf_SSE0, dccf_SSE0; - __m128d ccf_SSE1, dccf_SSE1; - __m128d icf4_SSE0, icf6_SSE0; - __m128d icf4_SSE1, icf6_SSE1; - __m128d half_SSE, one_SSE, two_SSE, four_SSE; - __m128d still_p4_SSE, still_p5inv_SSE, still_pip5_SSE; - - natoms = mdatoms->nr; - ni0 = 0; - ni1 = mdatoms->homenr; - - n = 0; - - aadata = *((gmx_allvsallgb2_data_t **)paadata); - - - if (aadata == NULL) - { - genborn_allvsall_setup(&aadata, top, born, mdatoms, 0.0, - egbSTILL, FALSE, FALSE, TRUE); - *((gmx_allvsallgb2_data_t **)paadata) = aadata; - } - - x_align = aadata->x_align; - y_align = aadata->y_align; - z_align = aadata->z_align; - - gb_radius = aadata->gb_radius; - vsolv = aadata->workparam; - work = aadata->work; - jindex = aadata->jindex_gb; - dadx = fr->dadx; - - still_p4_SSE = _mm_set1_pd(STILL_P4); - still_p5inv_SSE = _mm_set1_pd(STILL_P5INV); - still_pip5_SSE = _mm_set1_pd(STILL_PIP5); - half_SSE = _mm_set1_pd(0.5); - one_SSE = _mm_set1_pd(1.0); - two_SSE = _mm_set1_pd(2.0); - four_SSE = _mm_set1_pd(4.0); - - /* This will be summed, so it has to extend to natoms + buffer */ - for (i = 0; i < natoms+1+natoms/2; i++) - { - work[i] = 0; - } - - for (i = ni0; i < ni1+1+natoms/2; i++) - { - k = i%natoms; - x_align[i] = x[3*k]; - y_align[i] = x[3*k+1]; - z_align[i] = x[3*k+2]; - work[i] = 0; - } - - for (i = ni0; i < ni1; i += UNROLLI) - { - /* We assume shifts are NOT used for all-vs-all interactions */ - /* Load i atom data */ - ix_SSE0 = _mm_load1_pd(x_align+i); - iy_SSE0 = 
_mm_load1_pd(y_align+i); - iz_SSE0 = _mm_load1_pd(z_align+i); - ix_SSE1 = _mm_load1_pd(x_align+i+1); - iy_SSE1 = _mm_load1_pd(y_align+i+1); - iz_SSE1 = _mm_load1_pd(z_align+i+1); - - gpi_SSE0 = _mm_setzero_pd(); - gpi_SSE1 = _mm_setzero_pd(); - - rai_SSE0 = _mm_load1_pd(gb_radius+i); - rai_SSE1 = _mm_load1_pd(gb_radius+i+1); - - prod_ai_SSE0 = _mm_set1_pd(STILL_P4*vsolv[i]); - prod_ai_SSE1 = _mm_set1_pd(STILL_P4*vsolv[i+1]); - - /* Load limits for loop over neighbors */ - nj0 = jindex[4*i]; - nj1 = jindex[4*i+1]; - nj2 = jindex[4*i+2]; - nj3 = jindex[4*i+3]; - - pmask0 = aadata->prologue_mask_gb[i]; - pmask1 = aadata->prologue_mask_gb[i+1]; - emask0 = aadata->epilogue_mask[i]; - emask1 = aadata->epilogue_mask[i+1]; - - imask_SSE0 = _mm_load1_pd((double *)(aadata->imask+2*i)); - imask_SSE1 = _mm_load1_pd((double *)(aadata->imask+2*i+2)); - - /* Prologue part, including exclusion mask */ - for (j = nj0; j < nj1; j += UNROLLJ) - { - jmask_SSE0 = _mm_load_pd((double *)pmask0); - jmask_SSE1 = _mm_load_pd((double *)pmask1); - pmask0 += 2*UNROLLJ; - pmask1 += 2*UNROLLJ; - - /* load j atom coordinates */ - jx_SSE = _mm_load_pd(x_align+j); - jy_SSE = _mm_load_pd(y_align+j); - jz_SSE = _mm_load_pd(z_align+j); - - /* Calculate distance */ - dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE); - dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE); - dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE); - dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE); - dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE); - dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE); - - /* rsq = dx*dx+dy*dy+dz*dz */ - rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0); - rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1); - - /* Combine masks */ - jmask_SSE0 = _mm_and_pd(jmask_SSE0, imask_SSE0); - jmask_SSE1 = _mm_and_pd(jmask_SSE1, imask_SSE1); - - /* Calculate 1/r and 1/r2 */ - rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0); - rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1); - - /* Apply mask */ - rinv_SSE0 = _mm_and_pd(rinv_SSE0, jmask_SSE0); - rinv_SSE1 = _mm_and_pd(rinv_SSE1, jmask_SSE1); - - irsq_SSE0 = _mm_mul_pd(rinv_SSE0, rinv_SSE0); - irsq_SSE1 = _mm_mul_pd(rinv_SSE1, rinv_SSE1); - idr4_SSE0 = _mm_mul_pd(irsq_SSE0, irsq_SSE0); - idr4_SSE1 = _mm_mul_pd(irsq_SSE1, irsq_SSE1); - idr6_SSE0 = _mm_mul_pd(idr4_SSE0, irsq_SSE0); - idr6_SSE1 = _mm_mul_pd(idr4_SSE1, irsq_SSE1); - - raj_SSE = _mm_load_pd(gb_radius+j); - vaj_SSE = _mm_load_pd(vsolv+j); - - rvdw_SSE0 = _mm_add_pd(rai_SSE0, raj_SSE); - rvdw_SSE1 = _mm_add_pd(rai_SSE1, raj_SSE); - - ratio_SSE0 = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0, rvdw_SSE0))); - ratio_SSE1 = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1, rvdw_SSE1))); - - ratio_SSE0 = _mm_min_pd(ratio_SSE0, still_p5inv_SSE); - ratio_SSE1 = _mm_min_pd(ratio_SSE1, still_p5inv_SSE); - theta_SSE0 = _mm_mul_pd(ratio_SSE0, still_pip5_SSE); - theta_SSE1 = _mm_mul_pd(ratio_SSE1, still_pip5_SSE); - gmx_mm_sincos_pd(theta_SSE0, &sinq_SSE0, &cosq_SSE0); - gmx_mm_sincos_pd(theta_SSE1, &sinq_SSE1, &cosq_SSE1); - term_SSE0 = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE0)); - term_SSE1 = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE1)); - ccf_SSE0 = _mm_mul_pd(term_SSE0, term_SSE0); - ccf_SSE1 = _mm_mul_pd(term_SSE1, term_SSE1); - dccf_SSE0 = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE0), - _mm_mul_pd(sinq_SSE0, theta_SSE0)); - dccf_SSE1 = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE1), - _mm_mul_pd(sinq_SSE1, theta_SSE1)); - - prod_SSE = _mm_mul_pd(still_p4_SSE, vaj_SSE); - icf4_SSE0 = _mm_mul_pd(ccf_SSE0, idr4_SSE0); - icf4_SSE1 = _mm_mul_pd(ccf_SSE1, idr4_SSE1); - 
icf6_SSE0 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0); - icf6_SSE1 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1); - - _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j), - _mm_add_pd(_mm_mul_pd(prod_ai_SSE0, icf4_SSE0), - _mm_mul_pd(prod_ai_SSE1, icf4_SSE1)))); - - - gpi_SSE0 = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE, icf4_SSE0)); - gpi_SSE1 = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE, icf4_SSE1)); - - /* Save ai->aj and aj->ai chain rule terms */ - _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE0)); - dadx += 2; - _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE1)); - dadx += 2; - - _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE0, icf6_SSE0)); - dadx += 2; - _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE1, icf6_SSE1)); - dadx += 2; - } - - /* Main part, no exclusions */ - for (j = nj1; j < nj2; j += UNROLLJ) - { - - /* load j atom coordinates */ - jx_SSE = _mm_load_pd(x_align+j); - jy_SSE = _mm_load_pd(y_align+j); - jz_SSE = _mm_load_pd(z_align+j); - - /* Calculate distance */ - dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE); - dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE); - dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE); - dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE); - dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE); - dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE); - - /* rsq = dx*dx+dy*dy+dz*dz */ - rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0); - rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1); - - /* Calculate 1/r and 1/r2 */ - rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0); - rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1); - - /* Apply mask */ - rinv_SSE0 = _mm_and_pd(rinv_SSE0, imask_SSE0); - rinv_SSE1 = _mm_and_pd(rinv_SSE1, imask_SSE1); - - irsq_SSE0 = _mm_mul_pd(rinv_SSE0, rinv_SSE0); - irsq_SSE1 = _mm_mul_pd(rinv_SSE1, rinv_SSE1); - idr4_SSE0 = _mm_mul_pd(irsq_SSE0, irsq_SSE0); - idr4_SSE1 = _mm_mul_pd(irsq_SSE1, irsq_SSE1); - idr6_SSE0 = _mm_mul_pd(idr4_SSE0, irsq_SSE0); - idr6_SSE1 = _mm_mul_pd(idr4_SSE1, irsq_SSE1); - - raj_SSE = _mm_load_pd(gb_radius+j); - - rvdw_SSE0 = _mm_add_pd(rai_SSE0, raj_SSE); - rvdw_SSE1 = _mm_add_pd(rai_SSE1, raj_SSE); - vaj_SSE = _mm_load_pd(vsolv+j); - - ratio_SSE0 = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0, rvdw_SSE0))); - ratio_SSE1 = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1, rvdw_SSE1))); - - ratio_SSE0 = _mm_min_pd(ratio_SSE0, still_p5inv_SSE); - ratio_SSE1 = _mm_min_pd(ratio_SSE1, still_p5inv_SSE); - theta_SSE0 = _mm_mul_pd(ratio_SSE0, still_pip5_SSE); - theta_SSE1 = _mm_mul_pd(ratio_SSE1, still_pip5_SSE); - gmx_mm_sincos_pd(theta_SSE0, &sinq_SSE0, &cosq_SSE0); - gmx_mm_sincos_pd(theta_SSE1, &sinq_SSE1, &cosq_SSE1); - term_SSE0 = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE0)); - term_SSE1 = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE1)); - ccf_SSE0 = _mm_mul_pd(term_SSE0, term_SSE0); - ccf_SSE1 = _mm_mul_pd(term_SSE1, term_SSE1); - dccf_SSE0 = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE0), - _mm_mul_pd(sinq_SSE0, theta_SSE0)); - dccf_SSE1 = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE1), - _mm_mul_pd(sinq_SSE1, theta_SSE1)); - - prod_SSE = _mm_mul_pd(still_p4_SSE, vaj_SSE ); - icf4_SSE0 = _mm_mul_pd(ccf_SSE0, idr4_SSE0); - icf4_SSE1 = _mm_mul_pd(ccf_SSE1, idr4_SSE1); - icf6_SSE0 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0); - icf6_SSE1 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1); - - _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j), - _mm_add_pd(_mm_mul_pd(prod_ai_SSE0, icf4_SSE0), - _mm_mul_pd(prod_ai_SSE1, 
icf4_SSE1)))); - - gpi_SSE0 = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE, icf4_SSE0)); - gpi_SSE1 = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE, icf4_SSE1)); - - /* Save ai->aj and aj->ai chain rule terms */ - _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE0)); - dadx += 2; - _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE1)); - dadx += 2; - - _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE0, icf6_SSE0)); - dadx += 2; - _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE1, icf6_SSE1)); - dadx += 2; - } - /* Epilogue part, including exclusion mask */ - for (j = nj2; j < nj3; j += UNROLLJ) - { - jmask_SSE0 = _mm_load_pd((double *)emask0); - jmask_SSE1 = _mm_load_pd((double *)emask1); - emask0 += 2*UNROLLJ; - emask1 += 2*UNROLLJ; - - /* load j atom coordinates */ - jx_SSE = _mm_load_pd(x_align+j); - jy_SSE = _mm_load_pd(y_align+j); - jz_SSE = _mm_load_pd(z_align+j); - - /* Calculate distance */ - dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE); - dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE); - dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE); - dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE); - dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE); - dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE); - - /* rsq = dx*dx+dy*dy+dz*dz */ - rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0); - rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1); - - /* Combine masks */ - jmask_SSE0 = _mm_and_pd(jmask_SSE0, imask_SSE0); - jmask_SSE1 = _mm_and_pd(jmask_SSE1, imask_SSE1); - - /* Calculate 1/r and 1/r2 */ - rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0); - rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1); - - /* Apply mask */ - rinv_SSE0 = _mm_and_pd(rinv_SSE0, jmask_SSE0); - rinv_SSE1 = _mm_and_pd(rinv_SSE1, jmask_SSE1); - - irsq_SSE0 = _mm_mul_pd(rinv_SSE0, rinv_SSE0); - irsq_SSE1 = _mm_mul_pd(rinv_SSE1, rinv_SSE1); - idr4_SSE0 = _mm_mul_pd(irsq_SSE0, irsq_SSE0); - idr4_SSE1 = _mm_mul_pd(irsq_SSE1, irsq_SSE1); - idr6_SSE0 = _mm_mul_pd(idr4_SSE0, irsq_SSE0); - idr6_SSE1 = _mm_mul_pd(idr4_SSE1, irsq_SSE1); - - raj_SSE = _mm_load_pd(gb_radius+j); - vaj_SSE = _mm_load_pd(vsolv+j); - - rvdw_SSE0 = _mm_add_pd(rai_SSE0, raj_SSE); - rvdw_SSE1 = _mm_add_pd(rai_SSE1, raj_SSE); - - ratio_SSE0 = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0, rvdw_SSE0))); - ratio_SSE1 = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1, rvdw_SSE1))); - - ratio_SSE0 = _mm_min_pd(ratio_SSE0, still_p5inv_SSE); - ratio_SSE1 = _mm_min_pd(ratio_SSE1, still_p5inv_SSE); - theta_SSE0 = _mm_mul_pd(ratio_SSE0, still_pip5_SSE); - theta_SSE1 = _mm_mul_pd(ratio_SSE1, still_pip5_SSE); - gmx_mm_sincos_pd(theta_SSE0, &sinq_SSE0, &cosq_SSE0); - gmx_mm_sincos_pd(theta_SSE1, &sinq_SSE1, &cosq_SSE1); - term_SSE0 = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE0)); - term_SSE1 = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE1)); - ccf_SSE0 = _mm_mul_pd(term_SSE0, term_SSE0); - ccf_SSE1 = _mm_mul_pd(term_SSE1, term_SSE1); - dccf_SSE0 = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE0), - _mm_mul_pd(sinq_SSE0, theta_SSE0)); - dccf_SSE1 = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE1), - _mm_mul_pd(sinq_SSE1, theta_SSE1)); - - prod_SSE = _mm_mul_pd(still_p4_SSE, vaj_SSE); - icf4_SSE0 = _mm_mul_pd(ccf_SSE0, idr4_SSE0); - icf4_SSE1 = _mm_mul_pd(ccf_SSE1, idr4_SSE1); - icf6_SSE0 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0); - icf6_SSE1 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1); - - _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j), - _mm_add_pd(_mm_mul_pd(prod_ai_SSE0, icf4_SSE0), - _mm_mul_pd(prod_ai_SSE1, icf4_SSE1)))); - - gpi_SSE0 = _mm_add_pd(gpi_SSE0, 
_mm_mul_pd(prod_SSE, icf4_SSE0)); - gpi_SSE1 = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE, icf4_SSE1)); - - /* Save ai->aj and aj->ai chain rule terms */ - _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE0)); - dadx += 2; - _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE1)); - dadx += 2; - - _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE0, icf6_SSE0)); - dadx += 2; - _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE1, icf6_SSE1)); - dadx += 2; - } - GMX_MM_TRANSPOSE2_PD(gpi_SSE0, gpi_SSE1); - gpi_SSE0 = _mm_add_pd(gpi_SSE0, gpi_SSE1); - _mm_store_pd(work+i, _mm_add_pd(gpi_SSE0, _mm_load_pd(work+i))); - } - - /* In case we have written anything beyond natoms, move it back. - * Never mind that we leave stuff above natoms; that will not - * be accessed later in the routine. - * In principle this should be a move rather than sum, but this - * way we dont have to worry about even/odd offsets... - */ - for (i = natoms; i < ni1+1+natoms/2; i++) - { - work[i-natoms] += work[i]; - } - - /* Parallel summations would go here if ever implemented with DD */ - - factor = 0.5 * ONE_4PI_EPS0; - /* Calculate the radii - should we do all atoms, or just our local ones? */ - for (i = 0; i < natoms; i++) - { - if (born->use[i] != 0) - { - gpi = born->gpol[i]+work[i]; - gpi2 = gpi * gpi; - born->bRad[i] = factor*gmx_invsqrt(gpi2); - fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]); - } - } - - return 0; -} -/* Reinstate MSVC optimization */ -#ifdef _MSC_VER -#pragma optimize("",on) -#endif - - -int -genborn_allvsall_calc_hct_obc_radii_sse2_double(t_forcerec * fr, - t_mdatoms * mdatoms, - gmx_genborn_t * born, - int gb_algorithm, - gmx_localtop_t * top, - double * x, - t_commrec * cr, - void * paadata) -{ - gmx_allvsallgb2_data_t *aadata; - int natoms; - int ni0, ni1; - int nj0, nj1, nj2, nj3; - int i, j, k, n; - int * mask; - int * pmask0; - int * pmask1; - int * emask0; - int * emask1; - double * gb_radius; - double * vsolv; - double * work; - double tmpsum[2]; - double * x_align; - double * y_align; - double * z_align; - int * jindex; - double * dadx; - double * obc_param; - double rad, min_rad; - double rai, rai_inv, rai_inv2, sum_ai, sum_ai2, sum_ai3, tsum, tchain; - - __m128d ix_SSE0, iy_SSE0, iz_SSE0; - __m128d ix_SSE1, iy_SSE1, iz_SSE1; - __m128d gpi_SSE0, rai_SSE0, prod_ai_SSE0; - __m128d gpi_SSE1, rai_SSE1, prod_ai_SSE1; - __m128d imask_SSE0, jmask_SSE0; - __m128d imask_SSE1, jmask_SSE1; - __m128d jx_SSE, jy_SSE, jz_SSE; - __m128d dx_SSE0, dy_SSE0, dz_SSE0; - __m128d dx_SSE1, dy_SSE1, dz_SSE1; - __m128d rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0; - __m128d rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1; - __m128d raj_SSE, raj_inv_SSE, sk_aj_SSE, sk2_aj_SSE; - __m128d ccf_SSE0, dccf_SSE0, prod_SSE0; - __m128d ccf_SSE1, dccf_SSE1, prod_SSE1; - __m128d icf4_SSE0, icf6_SSE0; - __m128d icf4_SSE1, icf6_SSE1; - __m128d oneeighth_SSE, onefourth_SSE, half_SSE, one_SSE, two_SSE, four_SSE; - __m128d still_p4_SSE, still_p5inv_SSE, still_pip5_SSE; - __m128d rai_inv_SSE0; - __m128d rai_inv_SSE1; - __m128d sk_ai_SSE0, sk2_ai_SSE0, sum_ai_SSE0; - __m128d sk_ai_SSE1, sk2_ai_SSE1, sum_ai_SSE1; - __m128d lij_inv_SSE0, sk2_rinv_SSE0; - __m128d lij_inv_SSE1, sk2_rinv_SSE1; - __m128d dr_SSE0; - __m128d dr_SSE1; - __m128d t1_SSE0, t2_SSE0, t3_SSE0, t4_SSE0; - __m128d t1_SSE1, t2_SSE1, t3_SSE1, t4_SSE1; - __m128d obc_mask1_SSE0, obc_mask2_SSE0, obc_mask3_SSE0; - __m128d obc_mask1_SSE1, obc_mask2_SSE1, obc_mask3_SSE1; - __m128d uij_SSE0, uij2_SSE0, uij3_SSE0; - __m128d uij_SSE1, uij2_SSE1, uij3_SSE1; - __m128d lij_SSE0, 
lij2_SSE0, lij3_SSE0; - __m128d lij_SSE1, lij2_SSE1, lij3_SSE1; - __m128d dlij_SSE0, diff2_SSE0, logterm_SSE0; - __m128d dlij_SSE1, diff2_SSE1, logterm_SSE1; - __m128d doffset_SSE, tmpSSE; - - natoms = mdatoms->nr; - ni0 = 0; - ni1 = mdatoms->homenr; - - n = 0; - - aadata = *((gmx_allvsallgb2_data_t **)paadata); - - - if (aadata == NULL) - { - genborn_allvsall_setup(&aadata, top, born, mdatoms, born->gb_doffset, - egbOBC, TRUE, TRUE, TRUE); - *((gmx_allvsallgb2_data_t **)paadata) = aadata; - } - - x_align = aadata->x_align; - y_align = aadata->y_align; - z_align = aadata->z_align; - - gb_radius = aadata->gb_radius; - work = aadata->work; - jindex = aadata->jindex_gb; - dadx = fr->dadx; - obc_param = aadata->workparam; - - oneeighth_SSE = _mm_set1_pd(0.125); - onefourth_SSE = _mm_set1_pd(0.25); - half_SSE = _mm_set1_pd(0.5); - one_SSE = _mm_set1_pd(1.0); - two_SSE = _mm_set1_pd(2.0); - four_SSE = _mm_set1_pd(4.0); - doffset_SSE = _mm_set1_pd(born->gb_doffset); - - for (i = 0; i < natoms; i++) - { - x_align[i] = x[3*i]; - y_align[i] = x[3*i+1]; - z_align[i] = x[3*i+2]; - } - - /* Copy again */ - for (i = 0; i < natoms/2+1; i++) - { - x_align[natoms+i] = x_align[i]; - y_align[natoms+i] = y_align[i]; - z_align[natoms+i] = z_align[i]; - } - - for (i = 0; i < natoms+natoms/2+1; i++) - { - work[i] = 0; - } - - for (i = ni0; i < ni1; i += UNROLLI) - { - /* We assume shifts are NOT used for all-vs-all interactions */ - - /* Load i atom data */ - ix_SSE0 = _mm_load1_pd(x_align+i); - iy_SSE0 = _mm_load1_pd(y_align+i); - iz_SSE0 = _mm_load1_pd(z_align+i); - ix_SSE1 = _mm_load1_pd(x_align+i+1); - iy_SSE1 = _mm_load1_pd(y_align+i+1); - iz_SSE1 = _mm_load1_pd(z_align+i+1); - - rai_SSE0 = _mm_load1_pd(gb_radius+i); - rai_SSE1 = _mm_load1_pd(gb_radius+i+1); - rai_inv_SSE0 = gmx_mm_inv_pd(rai_SSE0); - rai_inv_SSE1 = gmx_mm_inv_pd(rai_SSE1); - - sk_ai_SSE0 = _mm_load1_pd(obc_param+i); - sk_ai_SSE1 = _mm_load1_pd(obc_param+i+1); - sk2_ai_SSE0 = _mm_mul_pd(sk_ai_SSE0, sk_ai_SSE0); - sk2_ai_SSE1 = _mm_mul_pd(sk_ai_SSE1, sk_ai_SSE1); - - sum_ai_SSE0 = _mm_setzero_pd(); - sum_ai_SSE1 = _mm_setzero_pd(); - - /* Load limits for loop over neighbors */ - nj0 = jindex[4*i]; - nj1 = jindex[4*i+1]; - nj2 = jindex[4*i+2]; - nj3 = jindex[4*i+3]; - - pmask0 = aadata->prologue_mask_gb[i]; - pmask1 = aadata->prologue_mask_gb[i+1]; - emask0 = aadata->epilogue_mask[i]; - emask1 = aadata->epilogue_mask[i+1]; - - imask_SSE0 = _mm_load1_pd((double *)(aadata->imask+2*i)); - imask_SSE1 = _mm_load1_pd((double *)(aadata->imask+2*i+2)); - - /* Prologue part, including exclusion mask */ - for (j = nj0; j < nj1; j += UNROLLJ) - { - jmask_SSE0 = _mm_load_pd((double *)pmask0); - jmask_SSE1 = _mm_load_pd((double *)pmask1); - pmask0 += 2*UNROLLJ; - pmask1 += 2*UNROLLJ; - - /* load j atom coordinates */ - jx_SSE = _mm_load_pd(x_align+j); - jy_SSE = _mm_load_pd(y_align+j); - jz_SSE = _mm_load_pd(z_align+j); - - /* Calculate distance */ - dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE); - dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE); - dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE); - dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE); - dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE); - dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE); - - /* rsq = dx*dx+dy*dy+dz*dz */ - rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0); - rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1); - - /* Combine masks */ - jmask_SSE0 = _mm_and_pd(jmask_SSE0, imask_SSE0); - jmask_SSE1 = _mm_and_pd(jmask_SSE1, imask_SSE1); - - /* Calculate 1/r and 1/r2 */ - rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0); - rinv_SSE1 = 
gmx_mm_invsqrt_pd(rsq_SSE1); - - /* Apply mask */ - rinv_SSE0 = _mm_and_pd(rinv_SSE0, jmask_SSE0); - rinv_SSE1 = _mm_and_pd(rinv_SSE1, jmask_SSE1); - - dr_SSE0 = _mm_mul_pd(rsq_SSE0, rinv_SSE0); - dr_SSE1 = _mm_mul_pd(rsq_SSE1, rinv_SSE1); - - sk_aj_SSE = _mm_load_pd(obc_param+j); - raj_SSE = _mm_load_pd(gb_radius+j); - raj_inv_SSE = gmx_mm_inv_pd(raj_SSE); - - /* Evaluate influence of atom aj -> ai */ - t1_SSE0 = _mm_add_pd(dr_SSE0, sk_aj_SSE); - t1_SSE1 = _mm_add_pd(dr_SSE1, sk_aj_SSE); - t2_SSE0 = _mm_sub_pd(dr_SSE0, sk_aj_SSE); - t2_SSE1 = _mm_sub_pd(dr_SSE1, sk_aj_SSE); - t3_SSE0 = _mm_sub_pd(sk_aj_SSE, dr_SSE0); - t3_SSE1 = _mm_sub_pd(sk_aj_SSE, dr_SSE1); - - obc_mask1_SSE0 = _mm_cmplt_pd(rai_SSE0, t1_SSE0); - obc_mask1_SSE1 = _mm_cmplt_pd(rai_SSE1, t1_SSE1); - obc_mask2_SSE0 = _mm_cmplt_pd(rai_SSE0, t2_SSE0); - obc_mask2_SSE1 = _mm_cmplt_pd(rai_SSE1, t2_SSE1); - obc_mask3_SSE0 = _mm_cmplt_pd(rai_SSE0, t3_SSE0); - obc_mask3_SSE1 = _mm_cmplt_pd(rai_SSE1, t3_SSE1); - obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0, jmask_SSE0); - obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1, jmask_SSE1); - - uij_SSE0 = gmx_mm_inv_pd(t1_SSE0); - uij_SSE1 = gmx_mm_inv_pd(t1_SSE1); - lij_SSE0 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)), - _mm_andnot_pd(obc_mask2_SSE0, rai_inv_SSE0)); - lij_SSE1 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)), - _mm_andnot_pd(obc_mask2_SSE1, rai_inv_SSE1)); - dlij_SSE0 = _mm_and_pd(one_SSE, obc_mask2_SSE0); - dlij_SSE1 = _mm_and_pd(one_SSE, obc_mask2_SSE1); - - uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0); - uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1); - uij3_SSE0 = _mm_mul_pd(uij2_SSE0, uij_SSE0); - uij3_SSE1 = _mm_mul_pd(uij2_SSE1, uij_SSE1); - lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0); - lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1); - lij3_SSE0 = _mm_mul_pd(lij2_SSE0, lij_SSE0); - lij3_SSE1 = _mm_mul_pd(lij2_SSE1, lij_SSE1); - - diff2_SSE0 = _mm_sub_pd(uij2_SSE0, lij2_SSE0); - diff2_SSE1 = _mm_sub_pd(uij2_SSE1, lij2_SSE1); - lij_inv_SSE0 = gmx_mm_invsqrt_pd(lij2_SSE0); - lij_inv_SSE1 = gmx_mm_invsqrt_pd(lij2_SSE1); - sk2_aj_SSE = _mm_mul_pd(sk_aj_SSE, sk_aj_SSE); - sk2_rinv_SSE0 = _mm_mul_pd(sk2_aj_SSE, rinv_SSE0); - sk2_rinv_SSE1 = _mm_mul_pd(sk2_aj_SSE, rinv_SSE1); - prod_SSE0 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0); - prod_SSE1 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1); - - logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0)); - logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1)); - - t1_SSE0 = _mm_sub_pd(lij_SSE0, uij_SSE0); - t1_SSE1 = _mm_sub_pd(lij_SSE1, uij_SSE1); - t2_SSE0 = _mm_mul_pd(diff2_SSE0, - _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0), - prod_SSE0)); - t2_SSE1 = _mm_mul_pd(diff2_SSE1, - _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1), - prod_SSE1)); - - t3_SSE0 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0)); - t3_SSE1 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1)); - t1_SSE0 = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0)); - t1_SSE1 = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1)); - t4_SSE0 = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE0, lij_SSE0)); - t4_SSE1 = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE1, lij_SSE1)); - t4_SSE0 = _mm_and_pd(t4_SSE0, obc_mask3_SSE0); - t4_SSE1 = _mm_and_pd(t4_SSE1, obc_mask3_SSE1); - t1_SSE0 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0)); - t1_SSE1 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1)); - - sum_ai_SSE0 = _mm_add_pd(sum_ai_SSE0, _mm_and_pd(t1_SSE0, obc_mask1_SSE0)); - sum_ai_SSE1 = _mm_add_pd(sum_ai_SSE1, 
_mm_and_pd(t1_SSE1, obc_mask1_SSE1)); - - t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0), - _mm_mul_pd(prod_SSE0, lij3_SSE0)); - t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1), - _mm_mul_pd(prod_SSE1, lij3_SSE1)); - t1_SSE0 = _mm_sub_pd(t1_SSE0, - _mm_mul_pd(onefourth_SSE, - _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0), - _mm_mul_pd(lij3_SSE0, dr_SSE0)))); - t1_SSE1 = _mm_sub_pd(t1_SSE1, - _mm_mul_pd(onefourth_SSE, - _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1), - _mm_mul_pd(lij3_SSE1, dr_SSE1)))); - - t2_SSE0 = _mm_mul_pd(onefourth_SSE, - _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0), - _mm_mul_pd(uij3_SSE0, dr_SSE0))); - t2_SSE1 = _mm_mul_pd(onefourth_SSE, - _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1), - _mm_mul_pd(uij3_SSE1, dr_SSE1))); - t2_SSE0 = _mm_sub_pd(t2_SSE0, - _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0), - _mm_mul_pd(prod_SSE0, uij3_SSE0))); - t2_SSE1 = _mm_sub_pd(t2_SSE1, - _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1), - _mm_mul_pd(prod_SSE1, uij3_SSE1))); - t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0), - _mm_mul_pd(rinv_SSE0, rinv_SSE0)); - t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1), - _mm_mul_pd(rinv_SSE1, rinv_SSE1)); - t3_SSE0 = _mm_sub_pd(t3_SSE0, - _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE), - _mm_add_pd(one_SSE, - _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0)))); - t3_SSE1 = _mm_sub_pd(t3_SSE1, - _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE), - _mm_add_pd(one_SSE, - _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1)))); - - t1_SSE0 = _mm_mul_pd(rinv_SSE0, - _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0), - _mm_add_pd(t2_SSE0, t3_SSE0))); - t1_SSE1 = _mm_mul_pd(rinv_SSE1, - _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1), - _mm_add_pd(t2_SSE1, t3_SSE1))); - - _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0)); - dadx += 2; - _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1)); - dadx += 2; - - /* Evaluate influence of atom ai -> aj */ - t1_SSE0 = _mm_add_pd(dr_SSE0, sk_ai_SSE0); - t1_SSE1 = _mm_add_pd(dr_SSE1, sk_ai_SSE1); - t2_SSE0 = _mm_sub_pd(dr_SSE0, sk_ai_SSE0); - t2_SSE1 = _mm_sub_pd(dr_SSE1, sk_ai_SSE1); - t3_SSE0 = _mm_sub_pd(sk_ai_SSE0, dr_SSE0); - t3_SSE1 = _mm_sub_pd(sk_ai_SSE1, dr_SSE1); - - obc_mask1_SSE0 = _mm_cmplt_pd(raj_SSE, t1_SSE0); - obc_mask1_SSE1 = _mm_cmplt_pd(raj_SSE, t1_SSE1); - obc_mask2_SSE0 = _mm_cmplt_pd(raj_SSE, t2_SSE0); - obc_mask2_SSE1 = _mm_cmplt_pd(raj_SSE, t2_SSE1); - obc_mask3_SSE0 = _mm_cmplt_pd(raj_SSE, t3_SSE0); - obc_mask3_SSE1 = _mm_cmplt_pd(raj_SSE, t3_SSE1); - obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0, jmask_SSE0); - obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1, jmask_SSE1); - - uij_SSE0 = gmx_mm_inv_pd(t1_SSE0); - uij_SSE1 = gmx_mm_inv_pd(t1_SSE1); - lij_SSE0 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)), - _mm_andnot_pd(obc_mask2_SSE0, raj_inv_SSE)); - lij_SSE1 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)), - _mm_andnot_pd(obc_mask2_SSE1, raj_inv_SSE)); - dlij_SSE0 = _mm_and_pd(one_SSE, obc_mask2_SSE0); - dlij_SSE1 = _mm_and_pd(one_SSE, obc_mask2_SSE1); - - uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0); - uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1); - uij3_SSE0 = _mm_mul_pd(uij2_SSE0, uij_SSE0); - uij3_SSE1 = _mm_mul_pd(uij2_SSE1, uij_SSE1); - lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0); - lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1); - lij3_SSE0 = _mm_mul_pd(lij2_SSE0, lij_SSE0); - lij3_SSE1 = _mm_mul_pd(lij2_SSE1, lij_SSE1); - - diff2_SSE0 = _mm_sub_pd(uij2_SSE0, lij2_SSE0); - diff2_SSE1 = _mm_sub_pd(uij2_SSE1, lij2_SSE1); - lij_inv_SSE0 = gmx_mm_invsqrt_pd(lij2_SSE0); - 
lij_inv_SSE1 = gmx_mm_invsqrt_pd(lij2_SSE1); - sk2_rinv_SSE0 = _mm_mul_pd(sk2_ai_SSE0, rinv_SSE0); - sk2_rinv_SSE1 = _mm_mul_pd(sk2_ai_SSE1, rinv_SSE1); - prod_SSE0 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0); - prod_SSE1 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1); - - logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0)); - logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1)); - t1_SSE0 = _mm_sub_pd(lij_SSE0, uij_SSE0); - t1_SSE1 = _mm_sub_pd(lij_SSE1, uij_SSE1); - t2_SSE0 = _mm_mul_pd(diff2_SSE0, - _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0), - prod_SSE0)); - t2_SSE1 = _mm_mul_pd(diff2_SSE1, - _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1), - prod_SSE1)); - t3_SSE0 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0)); - t3_SSE1 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1)); - t1_SSE0 = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0)); - t1_SSE1 = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1)); - t4_SSE0 = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE0)); - t4_SSE1 = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE1)); - t4_SSE0 = _mm_and_pd(t4_SSE0, obc_mask3_SSE0); - t4_SSE1 = _mm_and_pd(t4_SSE1, obc_mask3_SSE1); - t1_SSE0 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0)); - t1_SSE1 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1)); - - _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j), - _mm_add_pd(_mm_and_pd(t1_SSE0, obc_mask1_SSE0), - _mm_and_pd(t1_SSE1, obc_mask1_SSE1)))); - - t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0), - _mm_mul_pd(prod_SSE0, lij3_SSE0)); - t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1), - _mm_mul_pd(prod_SSE1, lij3_SSE1)); - t1_SSE0 = _mm_sub_pd(t1_SSE0, - _mm_mul_pd(onefourth_SSE, - _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0), - _mm_mul_pd(lij3_SSE0, dr_SSE0)))); - t1_SSE1 = _mm_sub_pd(t1_SSE1, - _mm_mul_pd(onefourth_SSE, - _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1), - _mm_mul_pd(lij3_SSE1, dr_SSE1)))); - t2_SSE0 = _mm_mul_pd(onefourth_SSE, - _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0), - _mm_mul_pd(uij3_SSE0, dr_SSE0))); - t2_SSE1 = _mm_mul_pd(onefourth_SSE, - _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1), - _mm_mul_pd(uij3_SSE1, dr_SSE1))); - t2_SSE0 = _mm_sub_pd(t2_SSE0, - _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0), - _mm_mul_pd(prod_SSE0, uij3_SSE0))); - t2_SSE1 = _mm_sub_pd(t2_SSE1, - _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1), - _mm_mul_pd(prod_SSE1, uij3_SSE1))); - - t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0), - _mm_mul_pd(rinv_SSE0, rinv_SSE0)); - t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1), - _mm_mul_pd(rinv_SSE1, rinv_SSE1)); - - t3_SSE0 = _mm_sub_pd(t3_SSE0, - _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE), - _mm_add_pd(one_SSE, - _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0)))); - t3_SSE1 = _mm_sub_pd(t3_SSE1, - _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE), - _mm_add_pd(one_SSE, - _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1)))); - - - t1_SSE0 = _mm_mul_pd(rinv_SSE0, - _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0), - _mm_add_pd(t2_SSE0, t3_SSE0))); - t1_SSE1 = _mm_mul_pd(rinv_SSE1, - _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1), - _mm_add_pd(t2_SSE1, t3_SSE1))); - - _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0)); - dadx += 2; - _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1)); - dadx += 2; - } - - /* Main part, no exclusions */ - for (j = nj1; j < nj2; j += UNROLLJ) - { - /* load j atom coordinates */ - jx_SSE = _mm_load_pd(x_align+j); - jy_SSE = _mm_load_pd(y_align+j); - jz_SSE = _mm_load_pd(z_align+j); - - /* Calculate distance */ - dx_SSE0 = 
_mm_sub_pd(ix_SSE0, jx_SSE); - dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE); - dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE); - dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE); - dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE); - dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE); - - /* rsq = dx*dx+dy*dy+dz*dz */ - rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0); - rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1); - - /* Calculate 1/r and 1/r2 */ - rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0); - rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1); - - /* Apply mask */ - rinv_SSE0 = _mm_and_pd(rinv_SSE0, imask_SSE0); - rinv_SSE1 = _mm_and_pd(rinv_SSE1, imask_SSE1); - - dr_SSE0 = _mm_mul_pd(rsq_SSE0, rinv_SSE0); - dr_SSE1 = _mm_mul_pd(rsq_SSE1, rinv_SSE1); - - sk_aj_SSE = _mm_load_pd(obc_param+j); - raj_SSE = _mm_load_pd(gb_radius+j); - - raj_inv_SSE = gmx_mm_inv_pd(raj_SSE); - - /* Evaluate influence of atom aj -> ai */ - t1_SSE0 = _mm_add_pd(dr_SSE0, sk_aj_SSE); - t1_SSE1 = _mm_add_pd(dr_SSE1, sk_aj_SSE); - t2_SSE0 = _mm_sub_pd(dr_SSE0, sk_aj_SSE); - t2_SSE1 = _mm_sub_pd(dr_SSE1, sk_aj_SSE); - t3_SSE0 = _mm_sub_pd(sk_aj_SSE, dr_SSE0); - t3_SSE1 = _mm_sub_pd(sk_aj_SSE, dr_SSE1); - - obc_mask1_SSE0 = _mm_cmplt_pd(rai_SSE0, t1_SSE0); - obc_mask1_SSE1 = _mm_cmplt_pd(rai_SSE1, t1_SSE1); - obc_mask2_SSE0 = _mm_cmplt_pd(rai_SSE0, t2_SSE0); - obc_mask2_SSE1 = _mm_cmplt_pd(rai_SSE1, t2_SSE1); - obc_mask3_SSE0 = _mm_cmplt_pd(rai_SSE0, t3_SSE0); - obc_mask3_SSE1 = _mm_cmplt_pd(rai_SSE1, t3_SSE1); - obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0, imask_SSE0); - obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1, imask_SSE1); - - uij_SSE0 = gmx_mm_inv_pd(t1_SSE0); - uij_SSE1 = gmx_mm_inv_pd(t1_SSE1); - lij_SSE0 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)), - _mm_andnot_pd(obc_mask2_SSE0, rai_inv_SSE0)); - lij_SSE1 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)), - _mm_andnot_pd(obc_mask2_SSE1, rai_inv_SSE1)); - dlij_SSE0 = _mm_and_pd(one_SSE, obc_mask2_SSE0); - dlij_SSE1 = _mm_and_pd(one_SSE, obc_mask2_SSE1); - - uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0); - uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1); - uij3_SSE0 = _mm_mul_pd(uij2_SSE0, uij_SSE0); - uij3_SSE1 = _mm_mul_pd(uij2_SSE1, uij_SSE1); - lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0); - lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1); - lij3_SSE0 = _mm_mul_pd(lij2_SSE0, lij_SSE0); - lij3_SSE1 = _mm_mul_pd(lij2_SSE1, lij_SSE1); - - diff2_SSE0 = _mm_sub_pd(uij2_SSE0, lij2_SSE0); - diff2_SSE1 = _mm_sub_pd(uij2_SSE1, lij2_SSE1); - lij_inv_SSE0 = gmx_mm_invsqrt_pd(lij2_SSE0); - lij_inv_SSE1 = gmx_mm_invsqrt_pd(lij2_SSE1); - sk2_aj_SSE = _mm_mul_pd(sk_aj_SSE, sk_aj_SSE); - sk2_rinv_SSE0 = _mm_mul_pd(sk2_aj_SSE, rinv_SSE0); - sk2_rinv_SSE1 = _mm_mul_pd(sk2_aj_SSE, rinv_SSE1); - prod_SSE0 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0); - prod_SSE1 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1); - - logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0)); - logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1)); - - t1_SSE0 = _mm_sub_pd(lij_SSE0, uij_SSE0); - t1_SSE1 = _mm_sub_pd(lij_SSE1, uij_SSE1); - t2_SSE0 = _mm_mul_pd(diff2_SSE0, - _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0), - prod_SSE0)); - t2_SSE1 = _mm_mul_pd(diff2_SSE1, - _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1), - prod_SSE1)); - - t3_SSE0 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0)); - t3_SSE1 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1)); - t1_SSE0 = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0)); - t1_SSE1 = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1)); - t4_SSE0 = 
_mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE0, lij_SSE0)); - t4_SSE1 = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE1, lij_SSE1)); - t4_SSE0 = _mm_and_pd(t4_SSE0, obc_mask3_SSE0); - t4_SSE1 = _mm_and_pd(t4_SSE1, obc_mask3_SSE1); - t1_SSE0 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0)); - t1_SSE1 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1)); - - sum_ai_SSE0 = _mm_add_pd(sum_ai_SSE0, _mm_and_pd(t1_SSE0, obc_mask1_SSE0)); - sum_ai_SSE1 = _mm_add_pd(sum_ai_SSE1, _mm_and_pd(t1_SSE1, obc_mask1_SSE1)); - - t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0), - _mm_mul_pd(prod_SSE0, lij3_SSE0)); - t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1), - _mm_mul_pd(prod_SSE1, lij3_SSE1)); - - t1_SSE0 = _mm_sub_pd(t1_SSE0, - _mm_mul_pd(onefourth_SSE, - _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0), - _mm_mul_pd(lij3_SSE0, dr_SSE0)))); - t1_SSE1 = _mm_sub_pd(t1_SSE1, - _mm_mul_pd(onefourth_SSE, - _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1), - _mm_mul_pd(lij3_SSE1, dr_SSE1)))); - - t2_SSE0 = _mm_mul_pd(onefourth_SSE, - _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0), - _mm_mul_pd(uij3_SSE0, dr_SSE0))); - t2_SSE1 = _mm_mul_pd(onefourth_SSE, - _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1), - _mm_mul_pd(uij3_SSE1, dr_SSE1))); - t2_SSE0 = _mm_sub_pd(t2_SSE0, - _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0), - _mm_mul_pd(prod_SSE0, uij3_SSE0))); - t2_SSE1 = _mm_sub_pd(t2_SSE1, - _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1), - _mm_mul_pd(prod_SSE1, uij3_SSE1))); - t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0), - _mm_mul_pd(rinv_SSE0, rinv_SSE0)); - t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1), - _mm_mul_pd(rinv_SSE1, rinv_SSE1)); - t3_SSE0 = _mm_sub_pd(t3_SSE0, - _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE), - _mm_add_pd(one_SSE, - _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0)))); - t3_SSE1 = _mm_sub_pd(t3_SSE1, - _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE), - _mm_add_pd(one_SSE, - _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1)))); - - t1_SSE0 = _mm_mul_pd(rinv_SSE0, - _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0), - _mm_add_pd(t2_SSE0, t3_SSE0))); - t1_SSE1 = _mm_mul_pd(rinv_SSE1, - _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1), - _mm_add_pd(t2_SSE1, t3_SSE1))); - - _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0)); - dadx += 2; - _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1)); - dadx += 2; - - /* Evaluate influence of atom ai -> aj */ - t1_SSE0 = _mm_add_pd(dr_SSE0, sk_ai_SSE0); - t1_SSE1 = _mm_add_pd(dr_SSE1, sk_ai_SSE1); - t2_SSE0 = _mm_sub_pd(dr_SSE0, sk_ai_SSE0); - t2_SSE1 = _mm_sub_pd(dr_SSE1, sk_ai_SSE1); - t3_SSE0 = _mm_sub_pd(sk_ai_SSE0, dr_SSE0); - t3_SSE1 = _mm_sub_pd(sk_ai_SSE1, dr_SSE1); - - obc_mask1_SSE0 = _mm_cmplt_pd(raj_SSE, t1_SSE0); - obc_mask1_SSE1 = _mm_cmplt_pd(raj_SSE, t1_SSE1); - obc_mask2_SSE0 = _mm_cmplt_pd(raj_SSE, t2_SSE0); - obc_mask2_SSE1 = _mm_cmplt_pd(raj_SSE, t2_SSE1); - obc_mask3_SSE0 = _mm_cmplt_pd(raj_SSE, t3_SSE0); - obc_mask3_SSE1 = _mm_cmplt_pd(raj_SSE, t3_SSE1); - obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0, imask_SSE0); - obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1, imask_SSE1); - - uij_SSE0 = gmx_mm_inv_pd(t1_SSE0); - uij_SSE1 = gmx_mm_inv_pd(t1_SSE1); - lij_SSE0 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)), - _mm_andnot_pd(obc_mask2_SSE0, raj_inv_SSE)); - lij_SSE1 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)), - _mm_andnot_pd(obc_mask2_SSE1, raj_inv_SSE)); - dlij_SSE0 = _mm_and_pd(one_SSE, obc_mask2_SSE0); - dlij_SSE1 = _mm_and_pd(one_SSE, obc_mask2_SSE1); - - uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0); - 
uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1); - uij3_SSE0 = _mm_mul_pd(uij2_SSE0, uij_SSE0); - uij3_SSE1 = _mm_mul_pd(uij2_SSE1, uij_SSE1); - lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0); - lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1); - lij3_SSE0 = _mm_mul_pd(lij2_SSE0, lij_SSE0); - lij3_SSE1 = _mm_mul_pd(lij2_SSE1, lij_SSE1); - - diff2_SSE0 = _mm_sub_pd(uij2_SSE0, lij2_SSE0); - diff2_SSE1 = _mm_sub_pd(uij2_SSE1, lij2_SSE1); - lij_inv_SSE0 = gmx_mm_invsqrt_pd(lij2_SSE0); - lij_inv_SSE1 = gmx_mm_invsqrt_pd(lij2_SSE1); - sk2_rinv_SSE0 = _mm_mul_pd(sk2_ai_SSE0, rinv_SSE0); - sk2_rinv_SSE1 = _mm_mul_pd(sk2_ai_SSE1, rinv_SSE1); - prod_SSE0 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0); - prod_SSE1 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1); - - logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0)); - logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1)); - t1_SSE0 = _mm_sub_pd(lij_SSE0, uij_SSE0); - t1_SSE1 = _mm_sub_pd(lij_SSE1, uij_SSE1); - t2_SSE0 = _mm_mul_pd(diff2_SSE0, - _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0), - prod_SSE0)); - t2_SSE1 = _mm_mul_pd(diff2_SSE1, - _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1), - prod_SSE1)); - t3_SSE0 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0)); - t3_SSE1 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1)); - t1_SSE0 = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0)); - t1_SSE1 = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1)); - t4_SSE0 = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE0)); - t4_SSE1 = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE1)); - t4_SSE0 = _mm_and_pd(t4_SSE0, obc_mask3_SSE0); - t4_SSE1 = _mm_and_pd(t4_SSE1, obc_mask3_SSE1); - t1_SSE0 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0)); - t1_SSE1 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1)); - - _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j), - _mm_add_pd(_mm_and_pd(t1_SSE0, obc_mask1_SSE0), - _mm_and_pd(t1_SSE1, obc_mask1_SSE1)))); - - t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0), - _mm_mul_pd(prod_SSE0, lij3_SSE0)); - t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1), - _mm_mul_pd(prod_SSE1, lij3_SSE1)); - t1_SSE0 = _mm_sub_pd(t1_SSE0, - _mm_mul_pd(onefourth_SSE, - _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0), - _mm_mul_pd(lij3_SSE0, dr_SSE0)))); - t1_SSE1 = _mm_sub_pd(t1_SSE1, - _mm_mul_pd(onefourth_SSE, - _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1), - _mm_mul_pd(lij3_SSE1, dr_SSE1)))); - t2_SSE0 = _mm_mul_pd(onefourth_SSE, - _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0), - _mm_mul_pd(uij3_SSE0, dr_SSE0))); - t2_SSE1 = _mm_mul_pd(onefourth_SSE, - _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1), - _mm_mul_pd(uij3_SSE1, dr_SSE1))); - t2_SSE0 = _mm_sub_pd(t2_SSE0, - _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0), - _mm_mul_pd(prod_SSE0, uij3_SSE0))); - t2_SSE1 = _mm_sub_pd(t2_SSE1, - _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1), - _mm_mul_pd(prod_SSE1, uij3_SSE1))); - - t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0), - _mm_mul_pd(rinv_SSE0, rinv_SSE0)); - t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1), - _mm_mul_pd(rinv_SSE1, rinv_SSE1)); - - t3_SSE0 = _mm_sub_pd(t3_SSE0, - _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE), - _mm_add_pd(one_SSE, - _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0)))); - t3_SSE1 = _mm_sub_pd(t3_SSE1, - _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE), - _mm_add_pd(one_SSE, - _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1)))); - - t1_SSE0 = _mm_mul_pd(rinv_SSE0, - _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0), - _mm_add_pd(t2_SSE0, t3_SSE0))); - t1_SSE1 = _mm_mul_pd(rinv_SSE1, - 
_mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1), - _mm_add_pd(t2_SSE1, t3_SSE1))); - - _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0)); - dadx += 2; - _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1)); - dadx += 2; - } - - /* Epilogue part, including exclusion mask */ - for (j = nj2; j < nj3; j += UNROLLJ) - { - jmask_SSE0 = _mm_load_pd((double *)emask0); - jmask_SSE1 = _mm_load_pd((double *)emask1); - emask0 += 2*UNROLLJ; - emask1 += 2*UNROLLJ; - - /* load j atom coordinates */ - jx_SSE = _mm_load_pd(x_align+j); - jy_SSE = _mm_load_pd(y_align+j); - jz_SSE = _mm_load_pd(z_align+j); - - /* Calculate distance */ - dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE); - dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE); - dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE); - dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE); - dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE); - dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE); - - /* rsq = dx*dx+dy*dy+dz*dz */ - rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0); - rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1); - - /* Combine masks */ - jmask_SSE0 = _mm_and_pd(jmask_SSE0, imask_SSE0); - jmask_SSE1 = _mm_and_pd(jmask_SSE1, imask_SSE1); - - /* Calculate 1/r and 1/r2 */ - rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0); - rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1); - - /* Apply mask */ - rinv_SSE0 = _mm_and_pd(rinv_SSE0, jmask_SSE0); - rinv_SSE1 = _mm_and_pd(rinv_SSE1, jmask_SSE1); - - dr_SSE0 = _mm_mul_pd(rsq_SSE0, rinv_SSE0); - dr_SSE1 = _mm_mul_pd(rsq_SSE1, rinv_SSE1); - - sk_aj_SSE = _mm_load_pd(obc_param+j); - raj_SSE = _mm_load_pd(gb_radius+j); - - raj_inv_SSE = gmx_mm_inv_pd(raj_SSE); - - /* Evaluate influence of atom aj -> ai */ - t1_SSE0 = _mm_add_pd(dr_SSE0, sk_aj_SSE); - t1_SSE1 = _mm_add_pd(dr_SSE1, sk_aj_SSE); - t2_SSE0 = _mm_sub_pd(dr_SSE0, sk_aj_SSE); - t2_SSE1 = _mm_sub_pd(dr_SSE1, sk_aj_SSE); - t3_SSE0 = _mm_sub_pd(sk_aj_SSE, dr_SSE0); - t3_SSE1 = _mm_sub_pd(sk_aj_SSE, dr_SSE1); - - obc_mask1_SSE0 = _mm_cmplt_pd(rai_SSE0, t1_SSE0); - obc_mask1_SSE1 = _mm_cmplt_pd(rai_SSE1, t1_SSE1); - obc_mask2_SSE0 = _mm_cmplt_pd(rai_SSE0, t2_SSE0); - obc_mask2_SSE1 = _mm_cmplt_pd(rai_SSE1, t2_SSE1); - obc_mask3_SSE0 = _mm_cmplt_pd(rai_SSE0, t3_SSE0); - obc_mask3_SSE1 = _mm_cmplt_pd(rai_SSE1, t3_SSE1); - obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0, jmask_SSE0); - obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1, jmask_SSE1); - - uij_SSE0 = gmx_mm_inv_pd(t1_SSE0); - uij_SSE1 = gmx_mm_inv_pd(t1_SSE1); - lij_SSE0 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)), - _mm_andnot_pd(obc_mask2_SSE0, rai_inv_SSE0)); - lij_SSE1 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)), - _mm_andnot_pd(obc_mask2_SSE1, rai_inv_SSE1)); - - dlij_SSE0 = _mm_and_pd(one_SSE, obc_mask2_SSE0); - dlij_SSE1 = _mm_and_pd(one_SSE, obc_mask2_SSE1); - - uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0); - uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1); - uij3_SSE0 = _mm_mul_pd(uij2_SSE0, uij_SSE0); - uij3_SSE1 = _mm_mul_pd(uij2_SSE1, uij_SSE1); - lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0); - lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1); - lij3_SSE0 = _mm_mul_pd(lij2_SSE0, lij_SSE0); - lij3_SSE1 = _mm_mul_pd(lij2_SSE1, lij_SSE1); - - diff2_SSE0 = _mm_sub_pd(uij2_SSE0, lij2_SSE0); - diff2_SSE1 = _mm_sub_pd(uij2_SSE1, lij2_SSE1); - lij_inv_SSE0 = gmx_mm_invsqrt_pd(lij2_SSE0); - lij_inv_SSE1 = gmx_mm_invsqrt_pd(lij2_SSE1); - sk2_aj_SSE = _mm_mul_pd(sk_aj_SSE, sk_aj_SSE); - sk2_rinv_SSE0 = _mm_mul_pd(sk2_aj_SSE, rinv_SSE0); - sk2_rinv_SSE1 = _mm_mul_pd(sk2_aj_SSE, rinv_SSE1); - prod_SSE0 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0); - 
prod_SSE1 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1); - - logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0)); - logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1)); - - t1_SSE0 = _mm_sub_pd(lij_SSE0, uij_SSE0); - t1_SSE1 = _mm_sub_pd(lij_SSE1, uij_SSE1); - t2_SSE0 = _mm_mul_pd(diff2_SSE0, - _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0), - prod_SSE0)); - t2_SSE1 = _mm_mul_pd(diff2_SSE1, - _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1), - prod_SSE1)); - - t3_SSE0 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0)); - t3_SSE1 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1)); - t1_SSE0 = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0)); - t1_SSE1 = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1)); - t4_SSE0 = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE0, lij_SSE0)); - t4_SSE1 = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE1, lij_SSE1)); - t4_SSE0 = _mm_and_pd(t4_SSE0, obc_mask3_SSE0); - t4_SSE1 = _mm_and_pd(t4_SSE1, obc_mask3_SSE1); - t1_SSE0 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0)); - t1_SSE1 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1)); - - sum_ai_SSE0 = _mm_add_pd(sum_ai_SSE0, _mm_and_pd(t1_SSE0, obc_mask1_SSE0)); - sum_ai_SSE1 = _mm_add_pd(sum_ai_SSE1, _mm_and_pd(t1_SSE1, obc_mask1_SSE1)); - - t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0), - _mm_mul_pd(prod_SSE0, lij3_SSE0)); - t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1), - _mm_mul_pd(prod_SSE1, lij3_SSE1)); - t1_SSE0 = _mm_sub_pd(t1_SSE0, - _mm_mul_pd(onefourth_SSE, - _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0), - _mm_mul_pd(lij3_SSE0, dr_SSE0)))); - t1_SSE1 = _mm_sub_pd(t1_SSE1, - _mm_mul_pd(onefourth_SSE, - _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1), - _mm_mul_pd(lij3_SSE1, dr_SSE1)))); - - t2_SSE0 = _mm_mul_pd(onefourth_SSE, - _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0), - _mm_mul_pd(uij3_SSE0, dr_SSE0))); - t2_SSE1 = _mm_mul_pd(onefourth_SSE, - _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1), - _mm_mul_pd(uij3_SSE1, dr_SSE1))); - t2_SSE0 = _mm_sub_pd(t2_SSE0, - _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0), - _mm_mul_pd(prod_SSE0, uij3_SSE0))); - t2_SSE1 = _mm_sub_pd(t2_SSE1, - _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1), - _mm_mul_pd(prod_SSE1, uij3_SSE1))); - t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0), - _mm_mul_pd(rinv_SSE0, rinv_SSE0)); - t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1), - _mm_mul_pd(rinv_SSE1, rinv_SSE1)); - t3_SSE0 = _mm_sub_pd(t3_SSE0, - _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE), - _mm_add_pd(one_SSE, - _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0)))); - t3_SSE1 = _mm_sub_pd(t3_SSE1, - _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE), - _mm_add_pd(one_SSE, - _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1)))); - - t1_SSE0 = _mm_mul_pd(rinv_SSE0, - _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0), - _mm_add_pd(t2_SSE0, t3_SSE0))); - t1_SSE1 = _mm_mul_pd(rinv_SSE1, - _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1), - _mm_add_pd(t2_SSE1, t3_SSE1))); - - _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0)); - dadx += 2; - _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1)); - dadx += 2; - - /* Evaluate influence of atom ai -> aj */ - t1_SSE0 = _mm_add_pd(dr_SSE0, sk_ai_SSE0); - t1_SSE1 = _mm_add_pd(dr_SSE1, sk_ai_SSE1); - t2_SSE0 = _mm_sub_pd(dr_SSE0, sk_ai_SSE0); - t2_SSE1 = _mm_sub_pd(dr_SSE1, sk_ai_SSE1); - t3_SSE0 = _mm_sub_pd(sk_ai_SSE0, dr_SSE0); - t3_SSE1 = _mm_sub_pd(sk_ai_SSE1, dr_SSE1); - - obc_mask1_SSE0 = _mm_cmplt_pd(raj_SSE, t1_SSE0); - obc_mask1_SSE1 = _mm_cmplt_pd(raj_SSE, t1_SSE1); - obc_mask2_SSE0 = _mm_cmplt_pd(raj_SSE, 
t2_SSE0); - obc_mask2_SSE1 = _mm_cmplt_pd(raj_SSE, t2_SSE1); - obc_mask3_SSE0 = _mm_cmplt_pd(raj_SSE, t3_SSE0); - obc_mask3_SSE1 = _mm_cmplt_pd(raj_SSE, t3_SSE1); - obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0, jmask_SSE0); - obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1, jmask_SSE1); - - uij_SSE0 = gmx_mm_inv_pd(t1_SSE0); - uij_SSE1 = gmx_mm_inv_pd(t1_SSE1); - lij_SSE0 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)), - _mm_andnot_pd(obc_mask2_SSE0, raj_inv_SSE)); - lij_SSE1 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)), - _mm_andnot_pd(obc_mask2_SSE1, raj_inv_SSE)); - - dlij_SSE0 = _mm_and_pd(one_SSE, obc_mask2_SSE0); - dlij_SSE1 = _mm_and_pd(one_SSE, obc_mask2_SSE1); - - uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0); - uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1); - uij3_SSE0 = _mm_mul_pd(uij2_SSE0, uij_SSE0); - uij3_SSE1 = _mm_mul_pd(uij2_SSE1, uij_SSE1); - lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0); - lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1); - lij3_SSE0 = _mm_mul_pd(lij2_SSE0, lij_SSE0); - lij3_SSE1 = _mm_mul_pd(lij2_SSE1, lij_SSE1); - - diff2_SSE0 = _mm_sub_pd(uij2_SSE0, lij2_SSE0); - diff2_SSE1 = _mm_sub_pd(uij2_SSE1, lij2_SSE1); - lij_inv_SSE0 = gmx_mm_invsqrt_pd(lij2_SSE0); - lij_inv_SSE1 = gmx_mm_invsqrt_pd(lij2_SSE1); - sk2_rinv_SSE0 = _mm_mul_pd(sk2_ai_SSE0, rinv_SSE0); - sk2_rinv_SSE1 = _mm_mul_pd(sk2_ai_SSE1, rinv_SSE1); - prod_SSE0 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0); - prod_SSE1 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1); - - logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0)); - logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1)); - t1_SSE0 = _mm_sub_pd(lij_SSE0, uij_SSE0); - t1_SSE1 = _mm_sub_pd(lij_SSE1, uij_SSE1); - t2_SSE0 = _mm_mul_pd(diff2_SSE0, - _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0), - prod_SSE0)); - t2_SSE1 = _mm_mul_pd(diff2_SSE1, - _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1), - prod_SSE1)); - t3_SSE0 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0)); - t3_SSE1 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1)); - t1_SSE0 = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0)); - t1_SSE1 = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1)); - t4_SSE0 = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE0)); - t4_SSE1 = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE1)); - t4_SSE0 = _mm_and_pd(t4_SSE0, obc_mask3_SSE0); - t4_SSE1 = _mm_and_pd(t4_SSE1, obc_mask3_SSE1); - t1_SSE0 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0)); - t1_SSE1 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1)); - - _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j), - _mm_add_pd(_mm_and_pd(t1_SSE0, obc_mask1_SSE0), - _mm_and_pd(t1_SSE1, obc_mask1_SSE1)))); - - t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0), - _mm_mul_pd(prod_SSE0, lij3_SSE0)); - t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1), - _mm_mul_pd(prod_SSE1, lij3_SSE1)); - - t1_SSE0 = _mm_sub_pd(t1_SSE0, - _mm_mul_pd(onefourth_SSE, - _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0), - _mm_mul_pd(lij3_SSE0, dr_SSE0)))); - t1_SSE1 = _mm_sub_pd(t1_SSE1, - _mm_mul_pd(onefourth_SSE, - _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1), - _mm_mul_pd(lij3_SSE1, dr_SSE1)))); - t2_SSE0 = _mm_mul_pd(onefourth_SSE, - _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0), - _mm_mul_pd(uij3_SSE0, dr_SSE0))); - t2_SSE1 = _mm_mul_pd(onefourth_SSE, - _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1), - _mm_mul_pd(uij3_SSE1, dr_SSE1))); - t2_SSE0 = _mm_sub_pd(t2_SSE0, - _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0), - _mm_mul_pd(prod_SSE0, uij3_SSE0))); - t2_SSE1 = 
_mm_sub_pd(t2_SSE1, - _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1), - _mm_mul_pd(prod_SSE1, uij3_SSE1))); - - t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0), - _mm_mul_pd(rinv_SSE0, rinv_SSE0)); - t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1), - _mm_mul_pd(rinv_SSE1, rinv_SSE1)); - - t3_SSE0 = _mm_sub_pd(t3_SSE0, - _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE), - _mm_add_pd(one_SSE, - _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0)))); - t3_SSE1 = _mm_sub_pd(t3_SSE1, - _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE), - _mm_add_pd(one_SSE, - _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1)))); - - t1_SSE0 = _mm_mul_pd(rinv_SSE0, - _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0), - _mm_add_pd(t2_SSE0, t3_SSE0))); - t1_SSE1 = _mm_mul_pd(rinv_SSE1, - _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1), - _mm_add_pd(t2_SSE1, t3_SSE1))); - - _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0)); - dadx += 2; - _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1)); - dadx += 2; - } - GMX_MM_TRANSPOSE2_PD(sum_ai_SSE0, sum_ai_SSE1); - sum_ai_SSE0 = _mm_add_pd(sum_ai_SSE0, sum_ai_SSE1); - _mm_store_pd(work+i, _mm_add_pd(sum_ai_SSE0, _mm_load_pd(work+i))); - } - - - for (i = 0; i < natoms/2+1; i++) - { - work[i] += work[natoms+i]; - } - - /* Parallel summations would go here if ever implemented in DD */ - - if (gb_algorithm == egbHCT) - { - /* HCT */ - for (i = 0; i < natoms; i++) - { - if (born->use[i] != 0) - { - rai = top->atomtypes.gb_radius[mdatoms->typeA[i]]-born->gb_doffset; - sum_ai = 1.0/rai - work[i]; - min_rad = rai + born->gb_doffset; - rad = 1.0/sum_ai; - - born->bRad[i] = rad > min_rad ? rad : min_rad; - fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]); - } - } - - } - else - { - /* OBC */ - - /* Calculate the radii */ - for (i = 0; i < natoms; i++) - { - - if (born->use[i] != 0) - { - rai = top->atomtypes.gb_radius[mdatoms->typeA[i]]; - rai_inv2 = 1.0/rai; - rai = rai-born->gb_doffset; - rai_inv = 1.0/rai; - sum_ai = rai * work[i]; - sum_ai2 = sum_ai * sum_ai; - sum_ai3 = sum_ai2 * sum_ai; - - tsum = tanh(born->obc_alpha*sum_ai-born->obc_beta*sum_ai2+born->obc_gamma*sum_ai3); - born->bRad[i] = rai_inv - tsum*rai_inv2; - born->bRad[i] = 1.0 / born->bRad[i]; - - fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]); - - tchain = rai * (born->obc_alpha-2*born->obc_beta*sum_ai+3*born->obc_gamma*sum_ai2); - born->drobc[i] = (1.0-tsum*tsum)*tchain*rai_inv2; - } - } - } - - return 0; -} - - - - - - - - -int -genborn_allvsall_calc_chainrule_sse2_double(t_forcerec * fr, - t_mdatoms * mdatoms, - gmx_genborn_t * born, - double * x, - double * f, - int gb_algorithm, - void * paadata) -{ - gmx_allvsallgb2_data_t *aadata; - int natoms; - int ni0, ni1; - int nj0, nj1, nj2, nj3; - int i, j, k, n; - int idx; - int * mask; - int * pmask0; - int * emask0; - int * jindex; - - double ix, iy, iz; - double fix, fiy, fiz; - double jx, jy, jz; - double dx, dy, dz; - double tx, ty, tz; - double rbai, rbaj, fgb, fgb_ai, rbi; - double * rb; - double * dadx; - double * x_align; - double * y_align; - double * z_align; - double * fx_align; - double * fy_align; - double * fz_align; - double tmpsum[2]; - - __m128d jmask_SSE0, jmask_SSE1; - __m128d ix_SSE0, iy_SSE0, iz_SSE0; - __m128d ix_SSE1, iy_SSE1, iz_SSE1; - __m128d fix_SSE0, fiy_SSE0, fiz_SSE0; - __m128d fix_SSE1, fiy_SSE1, fiz_SSE1; - __m128d rbai_SSE0, rbai_SSE1; - __m128d imask_SSE0, imask_SSE1; - __m128d jx_SSE, jy_SSE, jz_SSE, rbaj_SSE; - __m128d dx_SSE0, dy_SSE0, dz_SSE0; - __m128d dx_SSE1, dy_SSE1, dz_SSE1; - __m128d fgb_SSE0, fgb_ai_SSE0; - __m128d fgb_SSE1, fgb_ai_SSE1; 
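In scalar form, the force contribution this chain-rule kernel accumulates per atom pair is sketched below. The snippet is reconstructed from the loops that follow; rb_i, rb_j, dadx_ij, dadx_ji and dxyz are illustrative names introduced here, not identifiers from this file.

/* Scalar sketch of one pair contribution in the GB chain-rule force.
 * rb_i, rb_j - Born-radius derivative terms: 2*b^2*dvda/ONE_4PI_EPS0 (Still),
 *              b^2*dvda (HCT), or b^2*drobc*dvda (OBC), as set up in the rb[] loop below
 * dadx_ij    - stored derivative of atom i's Born sum w.r.t. the i-j pair
 * dadx_ji    - the corresponding j->i derivative
 * dxyz[3]    - coordinate difference x_i - x_j
 * Names are illustrative; this is not code from the removed file. */
static void gb_chainrule_pair(double rb_i, double rb_j,
                              double dadx_ij, double dadx_ji,
                              const double dxyz[3],
                              double f_i[3], double f_j[3])
{
    double fgb = rb_i*dadx_ij + rb_j*dadx_ji; /* total i<->j force scalar */
    int    d;

    for (d = 0; d < 3; d++)
    {
        f_i[d] += fgb*dxyz[d]; /* increment i atom force */
        f_j[d] -= fgb*dxyz[d]; /* decrement j atom force */
    }
}
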
- __m128d tx_SSE0, ty_SSE0, tz_SSE0; - __m128d tx_SSE1, ty_SSE1, tz_SSE1; - __m128d t1, t2, tmpSSE; - - natoms = mdatoms->nr; - ni0 = 0; - ni1 = mdatoms->homenr; - - aadata = (gmx_allvsallgb2_data_t *)paadata; - - x_align = aadata->x_align; - y_align = aadata->y_align; - z_align = aadata->z_align; - fx_align = aadata->fx_align; - fy_align = aadata->fy_align; - fz_align = aadata->fz_align; - - jindex = aadata->jindex_gb; - dadx = fr->dadx; - - n = 0; - rb = aadata->work; - - /* Loop to get the proper form for the Born radius term */ - if (gb_algorithm == egbSTILL) - { - for (i = 0; i < natoms; i++) - { - rbi = born->bRad[i]; - rb[i] = (2 * rbi * rbi * fr->dvda[i])/ONE_4PI_EPS0; - } - } - else if (gb_algorithm == egbHCT) - { - for (i = 0; i < natoms; i++) - { - rbi = born->bRad[i]; - rb[i] = rbi * rbi * fr->dvda[i]; - } - } - else if (gb_algorithm == egbOBC) - { - for (idx = 0; idx < natoms; idx++) - { - rbi = born->bRad[idx]; - rb[idx] = rbi * rbi * born->drobc[idx] * fr->dvda[idx]; - } - } - - for (i = 0; i < 2*natoms; i++) - { - fx_align[i] = 0; - fy_align[i] = 0; - fz_align[i] = 0; - } - - - for (i = 0; i < natoms; i++) - { - rb[i+natoms] = rb[i]; - } - - for (i = ni0; i < ni1; i += UNROLLI) - { - /* We assume shifts are NOT used for all-vs-all interactions */ - - /* Load i atom data */ - ix_SSE0 = _mm_load1_pd(x_align+i); - iy_SSE0 = _mm_load1_pd(y_align+i); - iz_SSE0 = _mm_load1_pd(z_align+i); - ix_SSE1 = _mm_load1_pd(x_align+i+1); - iy_SSE1 = _mm_load1_pd(y_align+i+1); - iz_SSE1 = _mm_load1_pd(z_align+i+1); - - fix_SSE0 = _mm_setzero_pd(); - fiy_SSE0 = _mm_setzero_pd(); - fiz_SSE0 = _mm_setzero_pd(); - fix_SSE1 = _mm_setzero_pd(); - fiy_SSE1 = _mm_setzero_pd(); - fiz_SSE1 = _mm_setzero_pd(); - - rbai_SSE0 = _mm_load1_pd(rb+i); - rbai_SSE1 = _mm_load1_pd(rb+i+1); - - /* Load limits for loop over neighbors */ - nj0 = jindex[4*i]; - nj3 = jindex[4*i+3]; - - /* No masks necessary, since the stored chain rule derivatives will be zero in those cases! 
*/ - for (j = nj0; j < nj3; j += UNROLLJ) - { - /* load j atom coordinates */ - jx_SSE = _mm_load_pd(x_align+j); - jy_SSE = _mm_load_pd(y_align+j); - jz_SSE = _mm_load_pd(z_align+j); - - /* Calculate distance */ - dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE); - dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE); - dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE); - dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE); - dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE); - dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE); - - rbaj_SSE = _mm_load_pd(rb+j); - - fgb_SSE0 = _mm_mul_pd(rbai_SSE0, _mm_load_pd(dadx)); - dadx += 2; - fgb_SSE1 = _mm_mul_pd(rbai_SSE1, _mm_load_pd(dadx)); - dadx += 2; - - fgb_ai_SSE0 = _mm_mul_pd(rbaj_SSE, _mm_load_pd(dadx)); - dadx += 2; - fgb_ai_SSE1 = _mm_mul_pd(rbaj_SSE, _mm_load_pd(dadx)); - dadx += 2; - - /* Total force between ai and aj is the sum of ai->aj and aj->ai */ - fgb_SSE0 = _mm_add_pd(fgb_SSE0, fgb_ai_SSE0); - fgb_SSE1 = _mm_add_pd(fgb_SSE1, fgb_ai_SSE1); - - /* Calculate temporary vectorial force */ - tx_SSE0 = _mm_mul_pd(fgb_SSE0, dx_SSE0); - ty_SSE0 = _mm_mul_pd(fgb_SSE0, dy_SSE0); - tz_SSE0 = _mm_mul_pd(fgb_SSE0, dz_SSE0); - tx_SSE1 = _mm_mul_pd(fgb_SSE1, dx_SSE1); - ty_SSE1 = _mm_mul_pd(fgb_SSE1, dy_SSE1); - tz_SSE1 = _mm_mul_pd(fgb_SSE1, dz_SSE1); - - /* Increment i atom force */ - fix_SSE0 = _mm_add_pd(fix_SSE0, tx_SSE0); - fiy_SSE0 = _mm_add_pd(fiy_SSE0, ty_SSE0); - fiz_SSE0 = _mm_add_pd(fiz_SSE0, tz_SSE0); - fix_SSE1 = _mm_add_pd(fix_SSE1, tx_SSE1); - fiy_SSE1 = _mm_add_pd(fiy_SSE1, ty_SSE1); - fiz_SSE1 = _mm_add_pd(fiz_SSE1, tz_SSE1); - - /* Decrement j atom force */ - _mm_store_pd(fx_align+j, - _mm_sub_pd( _mm_load_pd(fx_align+j), _mm_add_pd(tx_SSE0, tx_SSE1) )); - _mm_store_pd(fy_align+j, - _mm_sub_pd( _mm_load_pd(fy_align+j), _mm_add_pd(ty_SSE0, ty_SSE1) )); - _mm_store_pd(fz_align+j, - _mm_sub_pd( _mm_load_pd(fz_align+j), _mm_add_pd(tz_SSE0, tz_SSE1) )); - } - - /* Add i forces to mem */ - GMX_MM_TRANSPOSE2_PD(fix_SSE0, fix_SSE1); - fix_SSE0 = _mm_add_pd(fix_SSE0, fix_SSE1); - _mm_store_pd(fx_align+i, _mm_add_pd(fix_SSE0, _mm_load_pd(fx_align+i))); - - GMX_MM_TRANSPOSE2_PD(fiy_SSE0, fiy_SSE1); - fiy_SSE0 = _mm_add_pd(fiy_SSE0, fiy_SSE1); - _mm_store_pd(fy_align+i, _mm_add_pd(fiy_SSE0, _mm_load_pd(fy_align+i))); - - GMX_MM_TRANSPOSE2_PD(fiz_SSE0, fiz_SSE1); - fiz_SSE0 = _mm_add_pd(fiz_SSE0, fiz_SSE1); - _mm_store_pd(fz_align+i, _mm_add_pd(fiz_SSE0, _mm_load_pd(fz_align+i))); - } - - for (i = 0; i < natoms; i++) - { - f[3*i] += fx_align[i] + fx_align[natoms+i]; - f[3*i+1] += fy_align[i] + fy_align[natoms+i]; - f[3*i+2] += fz_align[i] + fz_align[natoms+i]; - } - - return 0; -} - -#else -/* dummy variable when not using SSE */ -int genborn_allvsall_sse2_double_dummy; - - -#endif diff --git a/src/gromacs/mdlib/genborn_allvsall_sse2_double.h b/src/gromacs/mdlib/genborn_allvsall_sse2_double.h deleted file mode 100644 index 3629475dc3..0000000000 --- a/src/gromacs/mdlib/genborn_allvsall_sse2_double.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * This file is part of the GROMACS molecular simulation package. - * - * Copyright (c) 1991-2000, University of Groningen, The Netherlands. - * Copyright (c) 2001-2009, The GROMACS Development Team. - * Copyright (c) 2010,2014, by the GROMACS development team, led by - * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, - * and including many others, as listed in the AUTHORS file in the - * top-level source directory and at http://www.gromacs.org. 
- * - * GROMACS is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public License - * as published by the Free Software Foundation; either version 2.1 - * of the License, or (at your option) any later version. - * - * GROMACS is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with GROMACS; if not, see - * http://www.gnu.org/licenses, or write to the Free Software Foundation, - * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - * - * If you want to redistribute modifications to GROMACS, please - * consider that scientific software is very special. Version - * control is crucial - bugs must be traceable. We will be happy to - * consider code for inclusion in the official distribution, but - * derived work must not be called official GROMACS. Details are found - * in the README & COPYING files - if they are missing, get the - * official version at http://www.gromacs.org. - * - * To help us fund GROMACS development, we humbly ask that you cite - * the research papers on the package. Check out http://www.gromacs.org. - */ -#ifndef _GENBORN_ALLVSALL_SSE2_DOUBLE_H -#define _GENBORN_ALLVSALL_SSE2_DOUBLE_H - -#include "gromacs/legacyheaders/typedefs.h" -#include "gromacs/legacyheaders/types/simple.h" - -int -genborn_allvsall_calc_still_radii_sse2_double(t_forcerec * fr, - t_mdatoms * mdatoms, - gmx_genborn_t * born, - gmx_localtop_t * top, - double * x, - t_commrec * cr, - void * work); - -int -genborn_allvsall_calc_hct_obc_radii_sse2_double(t_forcerec * fr, - t_mdatoms * mdatoms, - gmx_genborn_t * born, - int gb_algorithm, - gmx_localtop_t * top, - double * x, - t_commrec * cr, - void * work); - -int -genborn_allvsall_calc_chainrule_sse2_double(t_forcerec * fr, - t_mdatoms * mdatoms, - gmx_genborn_t * born, - double * x, - double * f, - int gb_algorithm, - void * work); - -#endif diff --git a/src/gromacs/mdlib/genborn_allvsall_sse2_single.c b/src/gromacs/mdlib/genborn_allvsall_sse2_single.c deleted file mode 100644 index 8c3ce47c99..0000000000 --- a/src/gromacs/mdlib/genborn_allvsall_sse2_single.c +++ /dev/null @@ -1,3500 +0,0 @@ -/* - * This file is part of the GROMACS molecular simulation package. - * - * Copyright (c) 1991-2000, University of Groningen, The Netherlands. - * Copyright (c) 2001-2009, The GROMACS Development Team. - * Copyright (c) 2012,2014, by the GROMACS development team, led by - * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, - * and including many others, as listed in the AUTHORS file in the - * top-level source directory and at http://www.gromacs.org. - * - * GROMACS is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public License - * as published by the Free Software Foundation; either version 2.1 - * of the License, or (at your option) any later version. - * - * GROMACS is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with GROMACS; if not, see - * http://www.gnu.org/licenses, or write to the Free Software Foundation, - * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - * - * If you want to redistribute modifications to GROMACS, please - * consider that scientific software is very special. Version - * control is crucial - bugs must be traceable. We will be happy to - * consider code for inclusion in the official distribution, but - * derived work must not be called official GROMACS. Details are found - * in the README & COPYING files - if they are missing, get the - * official version at http://www.gromacs.org. - * - * To help us fund GROMACS development, we humbly ask that you cite - * the research papers on the package. Check out http://www.gromacs.org. - */ -#include "gmxpre.h" - -#include - -#include "gromacs/legacyheaders/genborn.h" -#include "gromacs/legacyheaders/network.h" -#include "gromacs/legacyheaders/types/simple.h" -#include "gromacs/math/units.h" -#include "gromacs/math/vec.h" -#include "gromacs/mdlib/genborn_allvsall.h" -#include "gromacs/utility/smalloc.h" - -#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER) - -#include - - -#define SIMD_WIDTH 4 -#define UNROLLI 4 -#define UNROLLJ 4 - - - - - - - - - -typedef struct -{ - int * jindex_gb; - int ** prologue_mask_gb; - int ** epilogue_mask; - int * imask; - real * gb_radius; - real * workparam; - real * work; - real * x_align; - real * y_align; - real * z_align; - real * fx_align; - real * fy_align; - real * fz_align; -} -gmx_allvsallgb2_data_t; - - -static int -calc_maxoffset(int i, int natoms) -{ - int maxoffset; - - if ((natoms % 2) == 1) - { - /* Odd number of atoms, easy */ - maxoffset = natoms/2; - } - else if ((natoms % 4) == 0) - { - /* Multiple of four is hard */ - if (i < natoms/2) - { - if ((i % 2) == 0) - { - maxoffset = natoms/2; - } - else - { - maxoffset = natoms/2-1; - } - } - else - { - if ((i % 2) == 1) - { - maxoffset = natoms/2; - } - else - { - maxoffset = natoms/2-1; - } - } - } - else - { - /* natoms/2 = odd */ - if ((i % 2) == 0) - { - maxoffset = natoms/2; - } - else - { - maxoffset = natoms/2-1; - } - } - - return maxoffset; -} - -static void -setup_gb_exclusions_and_indices(gmx_allvsallgb2_data_t * aadata, - t_ilist * ilist, - int start, - int end, - int natoms, - gmx_bool bInclude12, - gmx_bool bInclude13, - gmx_bool bInclude14) -{ - int i, j, k, tp; - int a1, a2; - int ni0, ni1, nj0, nj1, nj; - int imin, imax, iexcl; - int max_offset; - int max_excl_offset; - int firstinteraction; - int ibase; - int *pi; - - /* This routine can appear to be a bit complex, but it is mostly book-keeping. - * To enable the fast all-vs-all kernel we need to be able to stream through all coordinates - * whether they should interact or not. - * - * To avoid looping over the exclusions, we create a simple mask that is 1 if the interaction - * should be present, otherwise 0. Since exclusions typically only occur when i & j are close, - * we create a jindex array with three elements per i atom: the starting point, the point to - * which we need to check exclusions, and the end point. - * This way we only have to allocate a short exclusion mask per i atom. 
- */ - - ni0 = (start/UNROLLI)*UNROLLI; - ni1 = ((end+UNROLLI-1)/UNROLLI)*UNROLLI; - - /* Set the interaction mask to only enable the i atoms we want to include */ - snew(pi, natoms+UNROLLI+2*SIMD_WIDTH); - aadata->imask = (int *) (((size_t) pi + 16) & (~((size_t) 15))); - for (i = 0; i < natoms+UNROLLI; i++) - { - aadata->imask[i] = (i >= start && i < end) ? 0xFFFFFFFF : 0; - } - - /* Allocate memory for our modified jindex array */ - snew(aadata->jindex_gb, 4*(natoms+UNROLLI)); - for (i = 0; i < 4*(natoms+UNROLLI); i++) - { - aadata->jindex_gb[i] = 0; - } - - /* Create the exclusion masks for the prologue part */ - snew(aadata->prologue_mask_gb, natoms+UNROLLI); /* list of pointers */ - - /* First zero everything to avoid uninitialized data */ - for (i = 0; i < natoms+UNROLLI; i++) - { - aadata->prologue_mask_gb[i] = NULL; - } - - /* Calculate the largest exclusion range we need for each UNROLLI-tuplet of i atoms. */ - for (ibase = ni0; ibase < ni1; ibase += UNROLLI) - { - max_excl_offset = -1; - - /* First find maxoffset for the next 4 atoms (or fewer if we are close to end) */ - imax = ((ibase+UNROLLI) < end) ? (ibase+UNROLLI) : end; - - /* Which atom is the first we (might) interact with? */ - imin = natoms; /* Guaranteed to be overwritten by one of 'firstinteraction' */ - for (i = ibase; i < imax; i++) - { - /* Before exclusions, which atom is the first we (might) interact with? */ - firstinteraction = i+1; - max_offset = calc_maxoffset(i, natoms); - - if (!bInclude12) - { - for (j = 0; j < ilist[F_GB12].nr; j += 3) - { - a1 = ilist[F_GB12].iatoms[j+1]; - a2 = ilist[F_GB12].iatoms[j+2]; - - if (a1 == i) - { - k = a2; - } - else if (a2 == i) - { - k = a1; - } - else - { - continue; - } - - if (k == firstinteraction) - { - firstinteraction++; - } - } - } - if (!bInclude13) - { - for (j = 0; j < ilist[F_GB13].nr; j += 3) - { - a1 = ilist[F_GB13].iatoms[j+1]; - a2 = ilist[F_GB13].iatoms[j+2]; - - if (a1 == i) - { - k = a2; - } - else if (a2 == i) - { - k = a1; - } - else - { - continue; - } - - if (k == firstinteraction) - { - firstinteraction++; - } - } - } - if (!bInclude14) - { - for (j = 0; j < ilist[F_GB14].nr; j += 3) - { - a1 = ilist[F_GB14].iatoms[j+1]; - a2 = ilist[F_GB14].iatoms[j+2]; - if (a1 == i) - { - k = a2; - } - else if (a2 == i) - { - k = a1; - } - else - { - continue; - } - - if (k == firstinteraction) - { - firstinteraction++; - } - } - } - imin = (firstinteraction < imin) ? firstinteraction : imin; - } - /* round down to j unrolling factor */ - imin = (imin/UNROLLJ)*UNROLLJ; - - for (i = ibase; i < imax; i++) - { - max_offset = calc_maxoffset(i, natoms); - - if (!bInclude12) - { - for (j = 0; j < ilist[F_GB12].nr; j += 3) - { - a1 = ilist[F_GB12].iatoms[j+1]; - a2 = ilist[F_GB12].iatoms[j+2]; - - if (a1 == i) - { - k = a2; - } - else if (a2 == i) - { - k = a1; - } - else - { - continue; - } - - if (k < imin) - { - k += natoms; - } - - if (k > i+max_offset) - { - continue; - } - - k = k - imin; - - if (k+natoms <= max_offset) - { - k += natoms; - } - max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset; - } - } - if (!bInclude13) - { - for (j = 0; j < ilist[F_GB13].nr; j += 3) - { - a1 = ilist[F_GB13].iatoms[j+1]; - a2 = ilist[F_GB13].iatoms[j+2]; - - if (a1 == i) - { - k = a2; - } - else if (a2 == i) - { - k = a1; - } - else - { - continue; - } - - if (k < imin) - { - k += natoms; - } - - if (k > i+max_offset) - { - continue; - } - - k = k - imin; - - if (k+natoms <= max_offset) - { - k += natoms; - } - max_excl_offset = (k > max_excl_offset) ? 
k : max_excl_offset; - } - } - if (!bInclude14) - { - for (j = 0; j < ilist[F_GB14].nr; j += 3) - { - a1 = ilist[F_GB14].iatoms[j+1]; - a2 = ilist[F_GB14].iatoms[j+2]; - - if (a1 == i) - { - k = a2; - } - else if (a2 == i) - { - k = a1; - } - else - { - continue; - } - - if (k < imin) - { - k += natoms; - } - - if (k > i+max_offset) - { - continue; - } - - k = k - imin; - - if (k+natoms <= max_offset) - { - k += natoms; - } - max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset; - } - } - } - - /* The offset specifies the last atom to be excluded, so add one unit to get an upper loop limit */ - max_excl_offset++; - /* round up to j unrolling factor */ - max_excl_offset = (max_excl_offset/UNROLLJ+1)*UNROLLJ; - - /* Set all the prologue masks length to this value (even for i>end) */ - for (i = ibase; i < ibase+UNROLLI; i++) - { - aadata->jindex_gb[4*i] = imin; - aadata->jindex_gb[4*i+1] = imin+max_excl_offset; - } - } - - /* Now the hard part, loop over it all again to calculate the actual contents of the prologue masks */ - for (ibase = ni0; ibase < ni1; ibase += UNROLLI) - { - for (i = ibase; i < ibase+UNROLLI; i++) - { - nj = aadata->jindex_gb[4*i+1] - aadata->jindex_gb[4*i]; - imin = aadata->jindex_gb[4*i]; - - /* Allocate aligned memory */ - snew(pi, nj+2*SIMD_WIDTH); - aadata->prologue_mask_gb[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15))); - - max_offset = calc_maxoffset(i, natoms); - - /* Include interactions i+1 <= j < i+maxoffset */ - for (k = 0; k < nj; k++) - { - j = imin + k; - - if ( (j > i) && (j <= i+max_offset) ) - { - aadata->prologue_mask_gb[i][k] = 0xFFFFFFFF; - } - else - { - aadata->prologue_mask_gb[i][k] = 0; - } - } - - /* Clear out the explicit exclusions */ - if (i < end) - { - if (!bInclude12) - { - for (j = 0; j < ilist[F_GB12].nr; j += 3) - { - a1 = ilist[F_GB12].iatoms[j+1]; - a2 = ilist[F_GB12].iatoms[j+2]; - - if (a1 == i) - { - k = a2; - } - else if (a2 == i) - { - k = a1; - } - else - { - continue; - } - - if (k > i+max_offset) - { - continue; - } - k = k-i; - - if (k+natoms <= max_offset) - { - k += natoms; - } - - k = k+i-imin; - if (k >= 0) - { - aadata->prologue_mask_gb[i][k] = 0; - } - } - } - if (!bInclude13) - { - for (j = 0; j < ilist[F_GB13].nr; j += 3) - { - a1 = ilist[F_GB13].iatoms[j+1]; - a2 = ilist[F_GB13].iatoms[j+2]; - - if (a1 == i) - { - k = a2; - } - else if (a2 == i) - { - k = a1; - } - else - { - continue; - } - - if (k > i+max_offset) - { - continue; - } - k = k-i; - - if (k+natoms <= max_offset) - { - k += natoms; - } - - k = k+i-imin; - if (k >= 0) - { - aadata->prologue_mask_gb[i][k] = 0; - } - } - } - if (!bInclude14) - { - for (j = 0; j < ilist[F_GB14].nr; j += 3) - { - a1 = ilist[F_GB14].iatoms[j+1]; - a2 = ilist[F_GB14].iatoms[j+2]; - - if (a1 == i) - { - k = a2; - } - else if (a2 == i) - { - k = a1; - } - else - { - continue; - } - - if (k > i+max_offset) - { - continue; - } - k = k-i; - - if (k+natoms <= max_offset) - { - k += natoms; - } - - k = k+i-imin; - if (k >= 0) - { - aadata->prologue_mask_gb[i][k] = 0; - } - } - } - } - } - } - - /* Construct the epilogue mask - this just contains the check for maxoffset */ - snew(aadata->epilogue_mask, natoms+UNROLLI); - - /* First zero everything to avoid uninitialized data */ - for (i = 0; i < natoms+UNROLLI; i++) - { - aadata->jindex_gb[4*i+2] = aadata->jindex_gb[4*i+1]; - aadata->jindex_gb[4*i+3] = aadata->jindex_gb[4*i+1]; - aadata->epilogue_mask[i] = NULL; - } - - for (ibase = ni0; ibase < ni1; ibase += UNROLLI) - { - /* Find the lowest index for which we need to 
use the epilogue */ - imin = ibase; - max_offset = calc_maxoffset(imin, natoms); - - imin = imin + 1 + max_offset; - - /* Find largest index for which we need to use the epilogue */ - imax = ibase + UNROLLI-1; - imax = (imax < end) ? imax : end; - - max_offset = calc_maxoffset(imax, natoms); - imax = imax + 1 + max_offset + UNROLLJ - 1; - - for (i = ibase; i < ibase+UNROLLI; i++) - { - /* Start of epilogue - round down to j tile limit */ - aadata->jindex_gb[4*i+2] = (imin/UNROLLJ)*UNROLLJ; - /* Make sure we dont overlap - for small systems everything is done in the prologue */ - aadata->jindex_gb[4*i+2] = (aadata->jindex_gb[4*i+1] > aadata->jindex_gb[4*i+2]) ? aadata->jindex_gb[4*i+1] : aadata->jindex_gb[4*i+2]; - /* Round upwards to j tile limit */ - aadata->jindex_gb[4*i+3] = (imax/UNROLLJ)*UNROLLJ; - /* Make sure we dont have a negative range for the epilogue */ - aadata->jindex_gb[4*i+3] = (aadata->jindex_gb[4*i+2] > aadata->jindex_gb[4*i+3]) ? aadata->jindex_gb[4*i+2] : aadata->jindex_gb[4*i+3]; - } - } - - /* And fill it with data... */ - - for (ibase = ni0; ibase < ni1; ibase += UNROLLI) - { - for (i = ibase; i < ibase+UNROLLI; i++) - { - - nj = aadata->jindex_gb[4*i+3] - aadata->jindex_gb[4*i+2]; - - /* Allocate aligned memory */ - snew(pi, nj+2*SIMD_WIDTH); - aadata->epilogue_mask[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15))); - - max_offset = calc_maxoffset(i, natoms); - - for (k = 0; k < nj; k++) - { - j = aadata->jindex_gb[4*i+2] + k; - aadata->epilogue_mask[i][k] = (j <= i+max_offset) ? 0xFFFFFFFF : 0; - } - } - } -} - - -static void -genborn_allvsall_setup(gmx_allvsallgb2_data_t ** p_aadata, - gmx_localtop_t * top, - gmx_genborn_t * born, - t_mdatoms * mdatoms, - real radius_offset, - int gb_algorithm, - gmx_bool bInclude12, - gmx_bool bInclude13, - gmx_bool bInclude14) -{ - int i, j, idx; - int natoms; - gmx_allvsallgb2_data_t *aadata; - real *p; - - natoms = mdatoms->nr; - - snew(aadata, 1); - *p_aadata = aadata; - - snew(p, 2*natoms+2*SIMD_WIDTH); - aadata->x_align = (real *) (((size_t) p + 16) & (~((size_t) 15))); - snew(p, 2*natoms+2*SIMD_WIDTH); - aadata->y_align = (real *) (((size_t) p + 16) & (~((size_t) 15))); - snew(p, 2*natoms+2*SIMD_WIDTH); - aadata->z_align = (real *) (((size_t) p + 16) & (~((size_t) 15))); - snew(p, 2*natoms+2*SIMD_WIDTH); - aadata->fx_align = (real *) (((size_t) p + 16) & (~((size_t) 15))); - snew(p, 2*natoms+2*SIMD_WIDTH); - aadata->fy_align = (real *) (((size_t) p + 16) & (~((size_t) 15))); - snew(p, 2*natoms+2*SIMD_WIDTH); - aadata->fz_align = (real *) (((size_t) p + 16) & (~((size_t) 15))); - - snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH); - aadata->gb_radius = (real *) (((size_t) p + 16) & (~((size_t) 15))); - - snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH); - aadata->workparam = (real *) (((size_t) p + 16) & (~((size_t) 15))); - - snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH); - aadata->work = (real *) (((size_t) p + 16) & (~((size_t) 15))); - - for (i = 0; i < mdatoms->nr; i++) - { - aadata->gb_radius[i] = top->atomtypes.gb_radius[mdatoms->typeA[i]] - radius_offset; - if (gb_algorithm == egbSTILL) - { - aadata->workparam[i] = born->vsolv[i]; - } - else if (gb_algorithm == egbOBC) - { - aadata->workparam[i] = born->param[i]; - } - aadata->work[i] = 0.0; - } - for (i = 0; i < mdatoms->nr; i++) - { - aadata->gb_radius[natoms+i] = aadata->gb_radius[i]; - aadata->workparam[natoms+i] = aadata->workparam[i]; - aadata->work[natoms+i] = aadata->work[i]; - } - - for (i = 0; i < 2*natoms+SIMD_WIDTH; i++) - { - aadata->x_align[i] = 0.0; - aadata->y_align[i] = 0.0; - 
aadata->z_align[i] = 0.0; - aadata->fx_align[i] = 0.0; - aadata->fy_align[i] = 0.0; - aadata->fz_align[i] = 0.0; - } - - setup_gb_exclusions_and_indices(aadata, top->idef.il, 0, mdatoms->homenr, mdatoms->nr, - bInclude12, bInclude13, bInclude14); -} - - -int -genborn_allvsall_calc_still_radii_sse2_single(t_forcerec * fr, - t_mdatoms * mdatoms, - gmx_genborn_t * born, - gmx_localtop_t * top, - real * x, - t_commrec * cr, - void * paadata) -{ - gmx_allvsallgb2_data_t *aadata; - int natoms; - int ni0, ni1; - int nj0, nj1, nj2, nj3; - int i, j, k, n; - int * mask; - int * pmask0; - int * pmask1; - int * pmask2; - int * pmask3; - int * emask0; - int * emask1; - int * emask2; - int * emask3; - real ix, iy, iz; - real jx, jy, jz; - real dx, dy, dz; - real rsq, rinv; - real gpi, rai, vai; - real prod_ai; - real irsq, idr4, idr6; - real raj, rvdw, ratio; - real vaj, ccf, dccf, theta, cosq; - real term, prod, icf4, icf6, gpi2, factor, sinq; - real * gb_radius; - real * vsolv; - real * work; - real tmpsum[4]; - real * x_align; - real * y_align; - real * z_align; - int * jindex; - real * dadx; - - __m128 ix_SSE0, iy_SSE0, iz_SSE0; - __m128 ix_SSE1, iy_SSE1, iz_SSE1; - __m128 ix_SSE2, iy_SSE2, iz_SSE2; - __m128 ix_SSE3, iy_SSE3, iz_SSE3; - __m128 gpi_SSE0, rai_SSE0, prod_ai_SSE0; - __m128 gpi_SSE1, rai_SSE1, prod_ai_SSE1; - __m128 gpi_SSE2, rai_SSE2, prod_ai_SSE2; - __m128 gpi_SSE3, rai_SSE3, prod_ai_SSE3; - __m128 imask_SSE0, jmask_SSE0; - __m128 imask_SSE1, jmask_SSE1; - __m128 imask_SSE2, jmask_SSE2; - __m128 imask_SSE3, jmask_SSE3; - __m128 jx_SSE, jy_SSE, jz_SSE; - __m128 dx_SSE0, dy_SSE0, dz_SSE0; - __m128 dx_SSE1, dy_SSE1, dz_SSE1; - __m128 dx_SSE2, dy_SSE2, dz_SSE2; - __m128 dx_SSE3, dy_SSE3, dz_SSE3; - __m128 rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0; - __m128 rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1; - __m128 rsq_SSE2, rinv_SSE2, irsq_SSE2, idr4_SSE2, idr6_SSE2; - __m128 rsq_SSE3, rinv_SSE3, irsq_SSE3, idr4_SSE3, idr6_SSE3; - __m128 raj_SSE, vaj_SSE, prod_SSE; - __m128 rvdw_SSE0, ratio_SSE0; - __m128 rvdw_SSE1, ratio_SSE1; - __m128 rvdw_SSE2, ratio_SSE2; - __m128 rvdw_SSE3, ratio_SSE3; - __m128 theta_SSE0, sinq_SSE0, cosq_SSE0, term_SSE0; - __m128 theta_SSE1, sinq_SSE1, cosq_SSE1, term_SSE1; - __m128 theta_SSE2, sinq_SSE2, cosq_SSE2, term_SSE2; - __m128 theta_SSE3, sinq_SSE3, cosq_SSE3, term_SSE3; - __m128 ccf_SSE0, dccf_SSE0; - __m128 ccf_SSE1, dccf_SSE1; - __m128 ccf_SSE2, dccf_SSE2; - __m128 ccf_SSE3, dccf_SSE3; - __m128 icf4_SSE0, icf6_SSE0; - __m128 icf4_SSE1, icf6_SSE1; - __m128 icf4_SSE2, icf6_SSE2; - __m128 icf4_SSE3, icf6_SSE3; - __m128 half_SSE, one_SSE, two_SSE, four_SSE; - __m128 still_p4_SSE, still_p5inv_SSE, still_pip5_SSE; - - natoms = mdatoms->nr; - ni0 = 0; - ni1 = mdatoms->homenr; - - n = 0; - - aadata = *((gmx_allvsallgb2_data_t **)paadata); - - - if (aadata == NULL) - { - genborn_allvsall_setup(&aadata, top, born, mdatoms, 0.0, - egbSTILL, FALSE, FALSE, TRUE); - *((gmx_allvsallgb2_data_t **)paadata) = aadata; - } - - x_align = aadata->x_align; - y_align = aadata->y_align; - z_align = aadata->z_align; - - gb_radius = aadata->gb_radius; - vsolv = aadata->workparam; - work = aadata->work; - jindex = aadata->jindex_gb; - dadx = fr->dadx; - - still_p4_SSE = _mm_set1_ps(STILL_P4); - still_p5inv_SSE = _mm_set1_ps(STILL_P5INV); - still_pip5_SSE = _mm_set1_ps(STILL_PIP5); - half_SSE = _mm_set1_ps(0.5); - one_SSE = _mm_set1_ps(1.0); - two_SSE = _mm_set1_ps(2.0); - four_SSE = _mm_set1_ps(4.0); - - /* This will be summed, so it has to extend to natoms + buffer 
*/ - for (i = 0; i < natoms+1+natoms/2; i++) - { - work[i] = 0; - } - - for (i = ni0; i < ni1+1+natoms/2; i++) - { - k = i%natoms; - x_align[i] = x[3*k]; - y_align[i] = x[3*k+1]; - z_align[i] = x[3*k+2]; - work[i] = 0; - } - - - for (i = ni0; i < ni1; i += UNROLLI) - { - /* We assume shifts are NOT used for all-vs-all interactions */ - - /* Load i atom data */ - ix_SSE0 = _mm_load1_ps(x_align+i); - iy_SSE0 = _mm_load1_ps(y_align+i); - iz_SSE0 = _mm_load1_ps(z_align+i); - ix_SSE1 = _mm_load1_ps(x_align+i+1); - iy_SSE1 = _mm_load1_ps(y_align+i+1); - iz_SSE1 = _mm_load1_ps(z_align+i+1); - ix_SSE2 = _mm_load1_ps(x_align+i+2); - iy_SSE2 = _mm_load1_ps(y_align+i+2); - iz_SSE2 = _mm_load1_ps(z_align+i+2); - ix_SSE3 = _mm_load1_ps(x_align+i+3); - iy_SSE3 = _mm_load1_ps(y_align+i+3); - iz_SSE3 = _mm_load1_ps(z_align+i+3); - - gpi_SSE0 = _mm_setzero_ps(); - gpi_SSE1 = _mm_setzero_ps(); - gpi_SSE2 = _mm_setzero_ps(); - gpi_SSE3 = _mm_setzero_ps(); - - rai_SSE0 = _mm_load1_ps(gb_radius+i); - rai_SSE1 = _mm_load1_ps(gb_radius+i+1); - rai_SSE2 = _mm_load1_ps(gb_radius+i+2); - rai_SSE3 = _mm_load1_ps(gb_radius+i+3); - - prod_ai_SSE0 = _mm_set1_ps(STILL_P4*vsolv[i]); - prod_ai_SSE1 = _mm_set1_ps(STILL_P4*vsolv[i+1]); - prod_ai_SSE2 = _mm_set1_ps(STILL_P4*vsolv[i+2]); - prod_ai_SSE3 = _mm_set1_ps(STILL_P4*vsolv[i+3]); - - /* Load limits for loop over neighbors */ - nj0 = jindex[4*i]; - nj1 = jindex[4*i+1]; - nj2 = jindex[4*i+2]; - nj3 = jindex[4*i+3]; - - pmask0 = aadata->prologue_mask_gb[i]; - pmask1 = aadata->prologue_mask_gb[i+1]; - pmask2 = aadata->prologue_mask_gb[i+2]; - pmask3 = aadata->prologue_mask_gb[i+3]; - emask0 = aadata->epilogue_mask[i]; - emask1 = aadata->epilogue_mask[i+1]; - emask2 = aadata->epilogue_mask[i+2]; - emask3 = aadata->epilogue_mask[i+3]; - - imask_SSE0 = _mm_load1_ps((real *)(aadata->imask+i)); - imask_SSE1 = _mm_load1_ps((real *)(aadata->imask+i+1)); - imask_SSE2 = _mm_load1_ps((real *)(aadata->imask+i+2)); - imask_SSE3 = _mm_load1_ps((real *)(aadata->imask+i+3)); - - /* Prologue part, including exclusion mask */ - for (j = nj0; j < nj1; j += UNROLLJ) - { - jmask_SSE0 = _mm_load_ps((real *)pmask0); - jmask_SSE1 = _mm_load_ps((real *)pmask1); - jmask_SSE2 = _mm_load_ps((real *)pmask2); - jmask_SSE3 = _mm_load_ps((real *)pmask3); - pmask0 += UNROLLJ; - pmask1 += UNROLLJ; - pmask2 += UNROLLJ; - pmask3 += UNROLLJ; - - /* load j atom coordinates */ - jx_SSE = _mm_load_ps(x_align+j); - jy_SSE = _mm_load_ps(y_align+j); - jz_SSE = _mm_load_ps(z_align+j); - - /* Calculate distance */ - dx_SSE0 = _mm_sub_ps(ix_SSE0, jx_SSE); - dy_SSE0 = _mm_sub_ps(iy_SSE0, jy_SSE); - dz_SSE0 = _mm_sub_ps(iz_SSE0, jz_SSE); - dx_SSE1 = _mm_sub_ps(ix_SSE1, jx_SSE); - dy_SSE1 = _mm_sub_ps(iy_SSE1, jy_SSE); - dz_SSE1 = _mm_sub_ps(iz_SSE1, jz_SSE); - dx_SSE2 = _mm_sub_ps(ix_SSE2, jx_SSE); - dy_SSE2 = _mm_sub_ps(iy_SSE2, jy_SSE); - dz_SSE2 = _mm_sub_ps(iz_SSE2, jz_SSE); - dx_SSE3 = _mm_sub_ps(ix_SSE3, jx_SSE); - dy_SSE3 = _mm_sub_ps(iy_SSE3, jy_SSE); - dz_SSE3 = _mm_sub_ps(iz_SSE3, jz_SSE); - - /* rsq = dx*dx+dy*dy+dz*dz */ - rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0); - rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1); - rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2); - rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3); - - /* Combine masks */ - jmask_SSE0 = _mm_and_ps(jmask_SSE0, imask_SSE0); - jmask_SSE1 = _mm_and_ps(jmask_SSE1, imask_SSE1); - jmask_SSE2 = _mm_and_ps(jmask_SSE2, imask_SSE2); - jmask_SSE3 = _mm_and_ps(jmask_SSE3, imask_SSE3); - - /* Calculate 1/r and 
1/r2 */ - rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0); - rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1); - rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2); - rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3); - - /* Apply mask */ - rinv_SSE0 = _mm_and_ps(rinv_SSE0, jmask_SSE0); - rinv_SSE1 = _mm_and_ps(rinv_SSE1, jmask_SSE1); - rinv_SSE2 = _mm_and_ps(rinv_SSE2, jmask_SSE2); - rinv_SSE3 = _mm_and_ps(rinv_SSE3, jmask_SSE3); - - irsq_SSE0 = _mm_mul_ps(rinv_SSE0, rinv_SSE0); - irsq_SSE1 = _mm_mul_ps(rinv_SSE1, rinv_SSE1); - irsq_SSE2 = _mm_mul_ps(rinv_SSE2, rinv_SSE2); - irsq_SSE3 = _mm_mul_ps(rinv_SSE3, rinv_SSE3); - idr4_SSE0 = _mm_mul_ps(irsq_SSE0, irsq_SSE0); - idr4_SSE1 = _mm_mul_ps(irsq_SSE1, irsq_SSE1); - idr4_SSE2 = _mm_mul_ps(irsq_SSE2, irsq_SSE2); - idr4_SSE3 = _mm_mul_ps(irsq_SSE3, irsq_SSE3); - idr6_SSE0 = _mm_mul_ps(idr4_SSE0, irsq_SSE0); - idr6_SSE1 = _mm_mul_ps(idr4_SSE1, irsq_SSE1); - idr6_SSE2 = _mm_mul_ps(idr4_SSE2, irsq_SSE2); - idr6_SSE3 = _mm_mul_ps(idr4_SSE3, irsq_SSE3); - - raj_SSE = _mm_load_ps(gb_radius+j); - vaj_SSE = _mm_load_ps(vsolv+j); - - rvdw_SSE0 = _mm_add_ps(rai_SSE0, raj_SSE); - rvdw_SSE1 = _mm_add_ps(rai_SSE1, raj_SSE); - rvdw_SSE2 = _mm_add_ps(rai_SSE2, raj_SSE); - rvdw_SSE3 = _mm_add_ps(rai_SSE3, raj_SSE); - - ratio_SSE0 = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0, rvdw_SSE0))); - ratio_SSE1 = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1, rvdw_SSE1))); - ratio_SSE2 = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2, rvdw_SSE2))); - ratio_SSE3 = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3, rvdw_SSE3))); - - ratio_SSE0 = _mm_min_ps(ratio_SSE0, still_p5inv_SSE); - ratio_SSE1 = _mm_min_ps(ratio_SSE1, still_p5inv_SSE); - ratio_SSE2 = _mm_min_ps(ratio_SSE2, still_p5inv_SSE); - ratio_SSE3 = _mm_min_ps(ratio_SSE3, still_p5inv_SSE); - theta_SSE0 = _mm_mul_ps(ratio_SSE0, still_pip5_SSE); - theta_SSE1 = _mm_mul_ps(ratio_SSE1, still_pip5_SSE); - theta_SSE2 = _mm_mul_ps(ratio_SSE2, still_pip5_SSE); - theta_SSE3 = _mm_mul_ps(ratio_SSE3, still_pip5_SSE); - gmx_mm_sincos_ps(theta_SSE0, &sinq_SSE0, &cosq_SSE0); - gmx_mm_sincos_ps(theta_SSE1, &sinq_SSE1, &cosq_SSE1); - gmx_mm_sincos_ps(theta_SSE2, &sinq_SSE2, &cosq_SSE2); - gmx_mm_sincos_ps(theta_SSE3, &sinq_SSE3, &cosq_SSE3); - term_SSE0 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE0)); - term_SSE1 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE1)); - term_SSE2 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE2)); - term_SSE3 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE3)); - ccf_SSE0 = _mm_mul_ps(term_SSE0, term_SSE0); - ccf_SSE1 = _mm_mul_ps(term_SSE1, term_SSE1); - ccf_SSE2 = _mm_mul_ps(term_SSE2, term_SSE2); - ccf_SSE3 = _mm_mul_ps(term_SSE3, term_SSE3); - dccf_SSE0 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE0), - _mm_mul_ps(sinq_SSE0, theta_SSE0)); - dccf_SSE1 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE1), - _mm_mul_ps(sinq_SSE1, theta_SSE1)); - dccf_SSE2 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE2), - _mm_mul_ps(sinq_SSE2, theta_SSE2)); - dccf_SSE3 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE3), - _mm_mul_ps(sinq_SSE3, theta_SSE3)); - - prod_SSE = _mm_mul_ps(still_p4_SSE, vaj_SSE); - icf4_SSE0 = _mm_mul_ps(ccf_SSE0, idr4_SSE0); - icf4_SSE1 = _mm_mul_ps(ccf_SSE1, idr4_SSE1); - icf4_SSE2 = _mm_mul_ps(ccf_SSE2, idr4_SSE2); - icf4_SSE3 = _mm_mul_ps(ccf_SSE3, idr4_SSE3); - icf6_SSE0 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0); - icf6_SSE1 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1); - icf6_SSE2 = _mm_mul_ps( _mm_sub_ps( 
_mm_mul_ps(four_SSE, ccf_SSE2), dccf_SSE2), idr6_SSE2); - icf6_SSE3 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE3), dccf_SSE3), idr6_SSE3); - - _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j), - gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0, icf4_SSE0), - _mm_mul_ps(prod_ai_SSE1, icf4_SSE1), - _mm_mul_ps(prod_ai_SSE2, icf4_SSE2), - _mm_mul_ps(prod_ai_SSE3, icf4_SSE3)))); - - gpi_SSE0 = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE, icf4_SSE0)); - gpi_SSE1 = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE, icf4_SSE1)); - gpi_SSE2 = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE, icf4_SSE2)); - gpi_SSE3 = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE, icf4_SSE3)); - - /* Save ai->aj and aj->ai chain rule terms */ - _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE0)); - dadx += 4; - _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE1)); - dadx += 4; - _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE2)); - dadx += 4; - _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE3)); - dadx += 4; - - _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE0, icf6_SSE0)); - dadx += 4; - _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE1, icf6_SSE1)); - dadx += 4; - _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE2, icf6_SSE2)); - dadx += 4; - _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE3, icf6_SSE3)); - dadx += 4; - } - - /* Main part, no exclusions */ - for (j = nj1; j < nj2; j += UNROLLJ) - { - /* load j atom coordinates */ - jx_SSE = _mm_load_ps(x_align+j); - jy_SSE = _mm_load_ps(y_align+j); - jz_SSE = _mm_load_ps(z_align+j); - - /* Calculate distance */ - dx_SSE0 = _mm_sub_ps(ix_SSE0, jx_SSE); - dy_SSE0 = _mm_sub_ps(iy_SSE0, jy_SSE); - dz_SSE0 = _mm_sub_ps(iz_SSE0, jz_SSE); - dx_SSE1 = _mm_sub_ps(ix_SSE1, jx_SSE); - dy_SSE1 = _mm_sub_ps(iy_SSE1, jy_SSE); - dz_SSE1 = _mm_sub_ps(iz_SSE1, jz_SSE); - dx_SSE2 = _mm_sub_ps(ix_SSE2, jx_SSE); - dy_SSE2 = _mm_sub_ps(iy_SSE2, jy_SSE); - dz_SSE2 = _mm_sub_ps(iz_SSE2, jz_SSE); - dx_SSE3 = _mm_sub_ps(ix_SSE3, jx_SSE); - dy_SSE3 = _mm_sub_ps(iy_SSE3, jy_SSE); - dz_SSE3 = _mm_sub_ps(iz_SSE3, jz_SSE); - - /* rsq = dx*dx+dy*dy+dz*dz */ - rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0); - rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1); - rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2); - rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3); - - /* Calculate 1/r and 1/r2 */ - rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0); - rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1); - rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2); - rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3); - - /* Apply mask */ - rinv_SSE0 = _mm_and_ps(rinv_SSE0, imask_SSE0); - rinv_SSE1 = _mm_and_ps(rinv_SSE1, imask_SSE1); - rinv_SSE2 = _mm_and_ps(rinv_SSE2, imask_SSE2); - rinv_SSE3 = _mm_and_ps(rinv_SSE3, imask_SSE3); - - irsq_SSE0 = _mm_mul_ps(rinv_SSE0, rinv_SSE0); - irsq_SSE1 = _mm_mul_ps(rinv_SSE1, rinv_SSE1); - irsq_SSE2 = _mm_mul_ps(rinv_SSE2, rinv_SSE2); - irsq_SSE3 = _mm_mul_ps(rinv_SSE3, rinv_SSE3); - idr4_SSE0 = _mm_mul_ps(irsq_SSE0, irsq_SSE0); - idr4_SSE1 = _mm_mul_ps(irsq_SSE1, irsq_SSE1); - idr4_SSE2 = _mm_mul_ps(irsq_SSE2, irsq_SSE2); - idr4_SSE3 = _mm_mul_ps(irsq_SSE3, irsq_SSE3); - idr6_SSE0 = _mm_mul_ps(idr4_SSE0, irsq_SSE0); - idr6_SSE1 = _mm_mul_ps(idr4_SSE1, irsq_SSE1); - idr6_SSE2 = _mm_mul_ps(idr4_SSE2, irsq_SSE2); - idr6_SSE3 = _mm_mul_ps(idr4_SSE3, irsq_SSE3); - - raj_SSE = _mm_load_ps(gb_radius+j); - - rvdw_SSE0 = _mm_add_ps(rai_SSE0, raj_SSE); - rvdw_SSE1 = _mm_add_ps(rai_SSE1, raj_SSE); - rvdw_SSE2 = _mm_add_ps(rai_SSE2, raj_SSE); - rvdw_SSE3 = _mm_add_ps(rai_SSE3, raj_SSE); - vaj_SSE = _mm_load_ps(vsolv+j); - - ratio_SSE0 = 
_mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0, rvdw_SSE0))); - ratio_SSE1 = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1, rvdw_SSE1))); - ratio_SSE2 = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2, rvdw_SSE2))); - ratio_SSE3 = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3, rvdw_SSE3))); - - ratio_SSE0 = _mm_min_ps(ratio_SSE0, still_p5inv_SSE); - ratio_SSE1 = _mm_min_ps(ratio_SSE1, still_p5inv_SSE); - ratio_SSE2 = _mm_min_ps(ratio_SSE2, still_p5inv_SSE); - ratio_SSE3 = _mm_min_ps(ratio_SSE3, still_p5inv_SSE); - theta_SSE0 = _mm_mul_ps(ratio_SSE0, still_pip5_SSE); - theta_SSE1 = _mm_mul_ps(ratio_SSE1, still_pip5_SSE); - theta_SSE2 = _mm_mul_ps(ratio_SSE2, still_pip5_SSE); - theta_SSE3 = _mm_mul_ps(ratio_SSE3, still_pip5_SSE); - gmx_mm_sincos_ps(theta_SSE0, &sinq_SSE0, &cosq_SSE0); - gmx_mm_sincos_ps(theta_SSE1, &sinq_SSE1, &cosq_SSE1); - gmx_mm_sincos_ps(theta_SSE2, &sinq_SSE2, &cosq_SSE2); - gmx_mm_sincos_ps(theta_SSE3, &sinq_SSE3, &cosq_SSE3); - term_SSE0 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE0)); - term_SSE1 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE1)); - term_SSE2 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE2)); - term_SSE3 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE3)); - ccf_SSE0 = _mm_mul_ps(term_SSE0, term_SSE0); - ccf_SSE1 = _mm_mul_ps(term_SSE1, term_SSE1); - ccf_SSE2 = _mm_mul_ps(term_SSE2, term_SSE2); - ccf_SSE3 = _mm_mul_ps(term_SSE3, term_SSE3); - dccf_SSE0 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE0), - _mm_mul_ps(sinq_SSE0, theta_SSE0)); - dccf_SSE1 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE1), - _mm_mul_ps(sinq_SSE1, theta_SSE1)); - dccf_SSE2 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE2), - _mm_mul_ps(sinq_SSE2, theta_SSE2)); - dccf_SSE3 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE3), - _mm_mul_ps(sinq_SSE3, theta_SSE3)); - - prod_SSE = _mm_mul_ps(still_p4_SSE, vaj_SSE ); - icf4_SSE0 = _mm_mul_ps(ccf_SSE0, idr4_SSE0); - icf4_SSE1 = _mm_mul_ps(ccf_SSE1, idr4_SSE1); - icf4_SSE2 = _mm_mul_ps(ccf_SSE2, idr4_SSE2); - icf4_SSE3 = _mm_mul_ps(ccf_SSE3, idr4_SSE3); - icf6_SSE0 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0); - icf6_SSE1 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1); - icf6_SSE2 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE2), dccf_SSE2), idr6_SSE2); - icf6_SSE3 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE3), dccf_SSE3), idr6_SSE3); - - _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j), - gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0, icf4_SSE0), - _mm_mul_ps(prod_ai_SSE1, icf4_SSE1), - _mm_mul_ps(prod_ai_SSE2, icf4_SSE2), - _mm_mul_ps(prod_ai_SSE3, icf4_SSE3)))); - - gpi_SSE0 = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE, icf4_SSE0)); - gpi_SSE1 = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE, icf4_SSE1)); - gpi_SSE2 = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE, icf4_SSE2)); - gpi_SSE3 = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE, icf4_SSE3)); - - /* Save ai->aj and aj->ai chain rule terms */ - _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE0)); - dadx += 4; - _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE1)); - dadx += 4; - _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE2)); - dadx += 4; - _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE3)); - dadx += 4; - - _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE0, icf6_SSE0)); - dadx += 4; - _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE1, icf6_SSE1)); - dadx += 4; - _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE2, icf6_SSE2)); - dadx += 4; - _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE3, 
icf6_SSE3)); - dadx += 4; - } - /* Epilogue part, including exclusion mask */ - for (j = nj2; j < nj3; j += UNROLLJ) - { - jmask_SSE0 = _mm_load_ps((real *)emask0); - jmask_SSE1 = _mm_load_ps((real *)emask1); - jmask_SSE2 = _mm_load_ps((real *)emask2); - jmask_SSE3 = _mm_load_ps((real *)emask3); - emask0 += UNROLLJ; - emask1 += UNROLLJ; - emask2 += UNROLLJ; - emask3 += UNROLLJ; - - /* load j atom coordinates */ - jx_SSE = _mm_load_ps(x_align+j); - jy_SSE = _mm_load_ps(y_align+j); - jz_SSE = _mm_load_ps(z_align+j); - - /* Calculate distance */ - dx_SSE0 = _mm_sub_ps(ix_SSE0, jx_SSE); - dy_SSE0 = _mm_sub_ps(iy_SSE0, jy_SSE); - dz_SSE0 = _mm_sub_ps(iz_SSE0, jz_SSE); - dx_SSE1 = _mm_sub_ps(ix_SSE1, jx_SSE); - dy_SSE1 = _mm_sub_ps(iy_SSE1, jy_SSE); - dz_SSE1 = _mm_sub_ps(iz_SSE1, jz_SSE); - dx_SSE2 = _mm_sub_ps(ix_SSE2, jx_SSE); - dy_SSE2 = _mm_sub_ps(iy_SSE2, jy_SSE); - dz_SSE2 = _mm_sub_ps(iz_SSE2, jz_SSE); - dx_SSE3 = _mm_sub_ps(ix_SSE3, jx_SSE); - dy_SSE3 = _mm_sub_ps(iy_SSE3, jy_SSE); - dz_SSE3 = _mm_sub_ps(iz_SSE3, jz_SSE); - - /* rsq = dx*dx+dy*dy+dz*dz */ - rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0); - rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1); - rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2); - rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3); - - /* Combine masks */ - jmask_SSE0 = _mm_and_ps(jmask_SSE0, imask_SSE0); - jmask_SSE1 = _mm_and_ps(jmask_SSE1, imask_SSE1); - jmask_SSE2 = _mm_and_ps(jmask_SSE2, imask_SSE2); - jmask_SSE3 = _mm_and_ps(jmask_SSE3, imask_SSE3); - - /* Calculate 1/r and 1/r2 */ - rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0); - rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1); - rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2); - rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3); - - /* Apply mask */ - rinv_SSE0 = _mm_and_ps(rinv_SSE0, jmask_SSE0); - rinv_SSE1 = _mm_and_ps(rinv_SSE1, jmask_SSE1); - rinv_SSE2 = _mm_and_ps(rinv_SSE2, jmask_SSE2); - rinv_SSE3 = _mm_and_ps(rinv_SSE3, jmask_SSE3); - - irsq_SSE0 = _mm_mul_ps(rinv_SSE0, rinv_SSE0); - irsq_SSE1 = _mm_mul_ps(rinv_SSE1, rinv_SSE1); - irsq_SSE2 = _mm_mul_ps(rinv_SSE2, rinv_SSE2); - irsq_SSE3 = _mm_mul_ps(rinv_SSE3, rinv_SSE3); - idr4_SSE0 = _mm_mul_ps(irsq_SSE0, irsq_SSE0); - idr4_SSE1 = _mm_mul_ps(irsq_SSE1, irsq_SSE1); - idr4_SSE2 = _mm_mul_ps(irsq_SSE2, irsq_SSE2); - idr4_SSE3 = _mm_mul_ps(irsq_SSE3, irsq_SSE3); - idr6_SSE0 = _mm_mul_ps(idr4_SSE0, irsq_SSE0); - idr6_SSE1 = _mm_mul_ps(idr4_SSE1, irsq_SSE1); - idr6_SSE2 = _mm_mul_ps(idr4_SSE2, irsq_SSE2); - idr6_SSE3 = _mm_mul_ps(idr4_SSE3, irsq_SSE3); - - raj_SSE = _mm_load_ps(gb_radius+j); - vaj_SSE = _mm_load_ps(vsolv+j); - - rvdw_SSE0 = _mm_add_ps(rai_SSE0, raj_SSE); - rvdw_SSE1 = _mm_add_ps(rai_SSE1, raj_SSE); - rvdw_SSE2 = _mm_add_ps(rai_SSE2, raj_SSE); - rvdw_SSE3 = _mm_add_ps(rai_SSE3, raj_SSE); - - ratio_SSE0 = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0, rvdw_SSE0))); - ratio_SSE1 = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1, rvdw_SSE1))); - ratio_SSE2 = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2, rvdw_SSE2))); - ratio_SSE3 = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3, rvdw_SSE3))); - - ratio_SSE0 = _mm_min_ps(ratio_SSE0, still_p5inv_SSE); - ratio_SSE1 = _mm_min_ps(ratio_SSE1, still_p5inv_SSE); - ratio_SSE2 = _mm_min_ps(ratio_SSE2, still_p5inv_SSE); - ratio_SSE3 = _mm_min_ps(ratio_SSE3, still_p5inv_SSE); - theta_SSE0 = _mm_mul_ps(ratio_SSE0, still_pip5_SSE); - theta_SSE1 = _mm_mul_ps(ratio_SSE1, still_pip5_SSE); - theta_SSE2 = _mm_mul_ps(ratio_SSE2, still_pip5_SSE); - theta_SSE3 = 
_mm_mul_ps(ratio_SSE3, still_pip5_SSE); - gmx_mm_sincos_ps(theta_SSE0, &sinq_SSE0, &cosq_SSE0); - gmx_mm_sincos_ps(theta_SSE1, &sinq_SSE1, &cosq_SSE1); - gmx_mm_sincos_ps(theta_SSE2, &sinq_SSE2, &cosq_SSE2); - gmx_mm_sincos_ps(theta_SSE3, &sinq_SSE3, &cosq_SSE3); - term_SSE0 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE0)); - term_SSE1 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE1)); - term_SSE2 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE2)); - term_SSE3 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE3)); - ccf_SSE0 = _mm_mul_ps(term_SSE0, term_SSE0); - ccf_SSE1 = _mm_mul_ps(term_SSE1, term_SSE1); - ccf_SSE2 = _mm_mul_ps(term_SSE2, term_SSE2); - ccf_SSE3 = _mm_mul_ps(term_SSE3, term_SSE3); - dccf_SSE0 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE0), - _mm_mul_ps(sinq_SSE0, theta_SSE0)); - dccf_SSE1 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE1), - _mm_mul_ps(sinq_SSE1, theta_SSE1)); - dccf_SSE2 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE2), - _mm_mul_ps(sinq_SSE2, theta_SSE2)); - dccf_SSE3 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE3), - _mm_mul_ps(sinq_SSE3, theta_SSE3)); - - prod_SSE = _mm_mul_ps(still_p4_SSE, vaj_SSE); - icf4_SSE0 = _mm_mul_ps(ccf_SSE0, idr4_SSE0); - icf4_SSE1 = _mm_mul_ps(ccf_SSE1, idr4_SSE1); - icf4_SSE2 = _mm_mul_ps(ccf_SSE2, idr4_SSE2); - icf4_SSE3 = _mm_mul_ps(ccf_SSE3, idr4_SSE3); - icf6_SSE0 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0); - icf6_SSE1 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1); - icf6_SSE2 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE2), dccf_SSE2), idr6_SSE2); - icf6_SSE3 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE3), dccf_SSE3), idr6_SSE3); - - _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j), - gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0, icf4_SSE0), - _mm_mul_ps(prod_ai_SSE1, icf4_SSE1), - _mm_mul_ps(prod_ai_SSE2, icf4_SSE2), - _mm_mul_ps(prod_ai_SSE3, icf4_SSE3)))); - - gpi_SSE0 = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE, icf4_SSE0)); - gpi_SSE1 = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE, icf4_SSE1)); - gpi_SSE2 = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE, icf4_SSE2)); - gpi_SSE3 = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE, icf4_SSE3)); - - /* Save ai->aj and aj->ai chain rule terms */ - _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE0)); - dadx += 4; - _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE1)); - dadx += 4; - _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE2)); - dadx += 4; - _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE3)); - dadx += 4; - - _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE0, icf6_SSE0)); - dadx += 4; - _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE1, icf6_SSE1)); - dadx += 4; - _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE2, icf6_SSE2)); - dadx += 4; - _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE3, icf6_SSE3)); - dadx += 4; - } - _MM_TRANSPOSE4_PS(gpi_SSE0, gpi_SSE1, gpi_SSE2, gpi_SSE3); - gpi_SSE0 = _mm_add_ps(gpi_SSE0, gpi_SSE1); - gpi_SSE2 = _mm_add_ps(gpi_SSE2, gpi_SSE3); - gpi_SSE0 = _mm_add_ps(gpi_SSE0, gpi_SSE2); - _mm_store_ps(work+i, _mm_add_ps(gpi_SSE0, _mm_load_ps(work+i))); - } - - /* In case we have written anything beyond natoms, move it back. - * Never mind that we leave stuff above natoms; that will not - * be accessed later in the routine. - * In principle this should be a move rather than sum, but this - * way we dont have to worry about even/odd offsets... 
- */ - for (i = natoms; i < ni1+1+natoms/2; i++) - { - work[i-natoms] += work[i]; - } - - /* Parallel summations would go here if ever implemented with DD */ - - factor = 0.5 * ONE_4PI_EPS0; - /* Calculate the radii - should we do all atoms, or just our local ones? */ - for (i = 0; i < natoms; i++) - { - if (born->use[i] != 0) - { - gpi = born->gpol[i]+work[i]; - gpi2 = gpi * gpi; - born->bRad[i] = factor*gmx_invsqrt(gpi2); - fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]); - } - } - - return 0; -} - - - -int -genborn_allvsall_calc_hct_obc_radii_sse2_single(t_forcerec * fr, - t_mdatoms * mdatoms, - gmx_genborn_t * born, - int gb_algorithm, - gmx_localtop_t * top, - real * x, - t_commrec * cr, - void * paadata) -{ - gmx_allvsallgb2_data_t *aadata; - int natoms; - int ni0, ni1; - int nj0, nj1, nj2, nj3; - int i, j, k, n; - int * mask; - int * pmask0; - int * pmask1; - int * pmask2; - int * pmask3; - int * emask0; - int * emask1; - int * emask2; - int * emask3; - real * gb_radius; - real * vsolv; - real * work; - real tmpsum[4]; - real * x_align; - real * y_align; - real * z_align; - int * jindex; - real * dadx; - real * obc_param; - real rad, min_rad; - real rai, rai_inv, rai_inv2, sum_ai, sum_ai2, sum_ai3, tsum, tchain; - - __m128 ix_SSE0, iy_SSE0, iz_SSE0; - __m128 ix_SSE1, iy_SSE1, iz_SSE1; - __m128 ix_SSE2, iy_SSE2, iz_SSE2; - __m128 ix_SSE3, iy_SSE3, iz_SSE3; - __m128 gpi_SSE0, rai_SSE0, prod_ai_SSE0; - __m128 gpi_SSE1, rai_SSE1, prod_ai_SSE1; - __m128 gpi_SSE2, rai_SSE2, prod_ai_SSE2; - __m128 gpi_SSE3, rai_SSE3, prod_ai_SSE3; - __m128 imask_SSE0, jmask_SSE0; - __m128 imask_SSE1, jmask_SSE1; - __m128 imask_SSE2, jmask_SSE2; - __m128 imask_SSE3, jmask_SSE3; - __m128 jx_SSE, jy_SSE, jz_SSE; - __m128 dx_SSE0, dy_SSE0, dz_SSE0; - __m128 dx_SSE1, dy_SSE1, dz_SSE1; - __m128 dx_SSE2, dy_SSE2, dz_SSE2; - __m128 dx_SSE3, dy_SSE3, dz_SSE3; - __m128 rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0; - __m128 rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1; - __m128 rsq_SSE2, rinv_SSE2, irsq_SSE2, idr4_SSE2, idr6_SSE2; - __m128 rsq_SSE3, rinv_SSE3, irsq_SSE3, idr4_SSE3, idr6_SSE3; - __m128 raj_SSE, raj_inv_SSE, sk_aj_SSE, sk2_aj_SSE; - __m128 ccf_SSE0, dccf_SSE0, prod_SSE0; - __m128 ccf_SSE1, dccf_SSE1, prod_SSE1; - __m128 ccf_SSE2, dccf_SSE2, prod_SSE2; - __m128 ccf_SSE3, dccf_SSE3, prod_SSE3; - __m128 icf4_SSE0, icf6_SSE0; - __m128 icf4_SSE1, icf6_SSE1; - __m128 icf4_SSE2, icf6_SSE2; - __m128 icf4_SSE3, icf6_SSE3; - __m128 oneeighth_SSE, onefourth_SSE, half_SSE, one_SSE, two_SSE, four_SSE; - __m128 still_p4_SSE, still_p5inv_SSE, still_pip5_SSE; - __m128 rai_inv_SSE0; - __m128 rai_inv_SSE1; - __m128 rai_inv_SSE2; - __m128 rai_inv_SSE3; - __m128 sk_ai_SSE0, sk2_ai_SSE0, sum_ai_SSE0; - __m128 sk_ai_SSE1, sk2_ai_SSE1, sum_ai_SSE1; - __m128 sk_ai_SSE2, sk2_ai_SSE2, sum_ai_SSE2; - __m128 sk_ai_SSE3, sk2_ai_SSE3, sum_ai_SSE3; - __m128 lij_inv_SSE0, sk2_rinv_SSE0; - __m128 lij_inv_SSE1, sk2_rinv_SSE1; - __m128 lij_inv_SSE2, sk2_rinv_SSE2; - __m128 lij_inv_SSE3, sk2_rinv_SSE3; - __m128 dr_SSE0; - __m128 dr_SSE1; - __m128 dr_SSE2; - __m128 dr_SSE3; - __m128 t1_SSE0, t2_SSE0, t3_SSE0, t4_SSE0; - __m128 t1_SSE1, t2_SSE1, t3_SSE1, t4_SSE1; - __m128 t1_SSE2, t2_SSE2, t3_SSE2, t4_SSE2; - __m128 t1_SSE3, t2_SSE3, t3_SSE3, t4_SSE3; - __m128 obc_mask1_SSE0, obc_mask2_SSE0, obc_mask3_SSE0; - __m128 obc_mask1_SSE1, obc_mask2_SSE1, obc_mask3_SSE1; - __m128 obc_mask1_SSE2, obc_mask2_SSE2, obc_mask3_SSE2; - __m128 obc_mask1_SSE3, obc_mask2_SSE3, obc_mask3_SSE3; - __m128 uij_SSE0, uij2_SSE0, uij3_SSE0; - __m128 
uij_SSE1, uij2_SSE1, uij3_SSE1; - __m128 uij_SSE2, uij2_SSE2, uij3_SSE2; - __m128 uij_SSE3, uij2_SSE3, uij3_SSE3; - __m128 lij_SSE0, lij2_SSE0, lij3_SSE0; - __m128 lij_SSE1, lij2_SSE1, lij3_SSE1; - __m128 lij_SSE2, lij2_SSE2, lij3_SSE2; - __m128 lij_SSE3, lij2_SSE3, lij3_SSE3; - __m128 dlij_SSE0, diff2_SSE0, logterm_SSE0; - __m128 dlij_SSE1, diff2_SSE1, logterm_SSE1; - __m128 dlij_SSE2, diff2_SSE2, logterm_SSE2; - __m128 dlij_SSE3, diff2_SSE3, logterm_SSE3; - __m128 doffset_SSE; - - natoms = mdatoms->nr; - ni0 = 0; - ni1 = mdatoms->homenr; - - n = 0; - - aadata = *((gmx_allvsallgb2_data_t **)paadata); - - - if (aadata == NULL) - { - genborn_allvsall_setup(&aadata, top, born, mdatoms, born->gb_doffset, - egbOBC, TRUE, TRUE, TRUE); - *((gmx_allvsallgb2_data_t **)paadata) = aadata; - } - - x_align = aadata->x_align; - y_align = aadata->y_align; - z_align = aadata->z_align; - - gb_radius = aadata->gb_radius; - work = aadata->work; - jindex = aadata->jindex_gb; - dadx = fr->dadx; - obc_param = aadata->workparam; - - oneeighth_SSE = _mm_set1_ps(0.125); - onefourth_SSE = _mm_set1_ps(0.25); - half_SSE = _mm_set1_ps(0.5); - one_SSE = _mm_set1_ps(1.0); - two_SSE = _mm_set1_ps(2.0); - four_SSE = _mm_set1_ps(4.0); - doffset_SSE = _mm_set1_ps(born->gb_doffset); - - for (i = 0; i < natoms; i++) - { - x_align[i] = x[3*i]; - y_align[i] = x[3*i+1]; - z_align[i] = x[3*i+2]; - } - - /* Copy again */ - for (i = 0; i < natoms/2+1; i++) - { - x_align[natoms+i] = x_align[i]; - y_align[natoms+i] = y_align[i]; - z_align[natoms+i] = z_align[i]; - } - - for (i = 0; i < natoms+natoms/2+1; i++) - { - work[i] = 0; - } - - for (i = ni0; i < ni1; i += UNROLLI) - { - /* We assume shifts are NOT used for all-vs-all interactions */ - - /* Load i atom data */ - ix_SSE0 = _mm_load1_ps(x_align+i); - iy_SSE0 = _mm_load1_ps(y_align+i); - iz_SSE0 = _mm_load1_ps(z_align+i); - ix_SSE1 = _mm_load1_ps(x_align+i+1); - iy_SSE1 = _mm_load1_ps(y_align+i+1); - iz_SSE1 = _mm_load1_ps(z_align+i+1); - ix_SSE2 = _mm_load1_ps(x_align+i+2); - iy_SSE2 = _mm_load1_ps(y_align+i+2); - iz_SSE2 = _mm_load1_ps(z_align+i+2); - ix_SSE3 = _mm_load1_ps(x_align+i+3); - iy_SSE3 = _mm_load1_ps(y_align+i+3); - iz_SSE3 = _mm_load1_ps(z_align+i+3); - - rai_SSE0 = _mm_load1_ps(gb_radius+i); - rai_SSE1 = _mm_load1_ps(gb_radius+i+1); - rai_SSE2 = _mm_load1_ps(gb_radius+i+2); - rai_SSE3 = _mm_load1_ps(gb_radius+i+3); - rai_inv_SSE0 = gmx_mm_inv_ps(rai_SSE0); - rai_inv_SSE1 = gmx_mm_inv_ps(rai_SSE1); - rai_inv_SSE2 = gmx_mm_inv_ps(rai_SSE2); - rai_inv_SSE3 = gmx_mm_inv_ps(rai_SSE3); - - sk_ai_SSE0 = _mm_load1_ps(obc_param+i); - sk_ai_SSE1 = _mm_load1_ps(obc_param+i+1); - sk_ai_SSE2 = _mm_load1_ps(obc_param+i+2); - sk_ai_SSE3 = _mm_load1_ps(obc_param+i+3); - sk2_ai_SSE0 = _mm_mul_ps(sk_ai_SSE0, sk_ai_SSE0); - sk2_ai_SSE1 = _mm_mul_ps(sk_ai_SSE1, sk_ai_SSE1); - sk2_ai_SSE2 = _mm_mul_ps(sk_ai_SSE2, sk_ai_SSE2); - sk2_ai_SSE3 = _mm_mul_ps(sk_ai_SSE3, sk_ai_SSE3); - - sum_ai_SSE0 = _mm_setzero_ps(); - sum_ai_SSE1 = _mm_setzero_ps(); - sum_ai_SSE2 = _mm_setzero_ps(); - sum_ai_SSE3 = _mm_setzero_ps(); - - /* Load limits for loop over neighbors */ - nj0 = jindex[4*i]; - nj1 = jindex[4*i+1]; - nj2 = jindex[4*i+2]; - nj3 = jindex[4*i+3]; - - pmask0 = aadata->prologue_mask_gb[i]; - pmask1 = aadata->prologue_mask_gb[i+1]; - pmask2 = aadata->prologue_mask_gb[i+2]; - pmask3 = aadata->prologue_mask_gb[i+3]; - emask0 = aadata->epilogue_mask[i]; - emask1 = aadata->epilogue_mask[i+1]; - emask2 = aadata->epilogue_mask[i+2]; - emask3 = aadata->epilogue_mask[i+3]; - - imask_SSE0 = 
_mm_load1_ps((real *)(aadata->imask+i)); - imask_SSE1 = _mm_load1_ps((real *)(aadata->imask+i+1)); - imask_SSE2 = _mm_load1_ps((real *)(aadata->imask+i+2)); - imask_SSE3 = _mm_load1_ps((real *)(aadata->imask+i+3)); - - /* Prologue part, including exclusion mask */ - for (j = nj0; j < nj1; j += UNROLLJ) - { - jmask_SSE0 = _mm_load_ps((real *)pmask0); - jmask_SSE1 = _mm_load_ps((real *)pmask1); - jmask_SSE2 = _mm_load_ps((real *)pmask2); - jmask_SSE3 = _mm_load_ps((real *)pmask3); - pmask0 += UNROLLJ; - pmask1 += UNROLLJ; - pmask2 += UNROLLJ; - pmask3 += UNROLLJ; - - /* load j atom coordinates */ - jx_SSE = _mm_load_ps(x_align+j); - jy_SSE = _mm_load_ps(y_align+j); - jz_SSE = _mm_load_ps(z_align+j); - - /* Calculate distance */ - dx_SSE0 = _mm_sub_ps(ix_SSE0, jx_SSE); - dy_SSE0 = _mm_sub_ps(iy_SSE0, jy_SSE); - dz_SSE0 = _mm_sub_ps(iz_SSE0, jz_SSE); - dx_SSE1 = _mm_sub_ps(ix_SSE1, jx_SSE); - dy_SSE1 = _mm_sub_ps(iy_SSE1, jy_SSE); - dz_SSE1 = _mm_sub_ps(iz_SSE1, jz_SSE); - dx_SSE2 = _mm_sub_ps(ix_SSE2, jx_SSE); - dy_SSE2 = _mm_sub_ps(iy_SSE2, jy_SSE); - dz_SSE2 = _mm_sub_ps(iz_SSE2, jz_SSE); - dx_SSE3 = _mm_sub_ps(ix_SSE3, jx_SSE); - dy_SSE3 = _mm_sub_ps(iy_SSE3, jy_SSE); - dz_SSE3 = _mm_sub_ps(iz_SSE3, jz_SSE); - - /* rsq = dx*dx+dy*dy+dz*dz */ - rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0); - rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1); - rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2); - rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3); - - /* Combine masks */ - jmask_SSE0 = _mm_and_ps(jmask_SSE0, imask_SSE0); - jmask_SSE1 = _mm_and_ps(jmask_SSE1, imask_SSE1); - jmask_SSE2 = _mm_and_ps(jmask_SSE2, imask_SSE2); - jmask_SSE3 = _mm_and_ps(jmask_SSE3, imask_SSE3); - - /* Calculate 1/r and 1/r2 */ - rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0); - rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1); - rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2); - rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3); - - /* Apply mask */ - rinv_SSE0 = _mm_and_ps(rinv_SSE0, jmask_SSE0); - rinv_SSE1 = _mm_and_ps(rinv_SSE1, jmask_SSE1); - rinv_SSE2 = _mm_and_ps(rinv_SSE2, jmask_SSE2); - rinv_SSE3 = _mm_and_ps(rinv_SSE3, jmask_SSE3); - - dr_SSE0 = _mm_mul_ps(rsq_SSE0, rinv_SSE0); - dr_SSE1 = _mm_mul_ps(rsq_SSE1, rinv_SSE1); - dr_SSE2 = _mm_mul_ps(rsq_SSE2, rinv_SSE2); - dr_SSE3 = _mm_mul_ps(rsq_SSE3, rinv_SSE3); - - sk_aj_SSE = _mm_load_ps(obc_param+j); - raj_SSE = _mm_load_ps(gb_radius+j); - raj_inv_SSE = gmx_mm_inv_ps(raj_SSE); - - /* Evaluate influence of atom aj -> ai */ - t1_SSE0 = _mm_add_ps(dr_SSE0, sk_aj_SSE); - t1_SSE1 = _mm_add_ps(dr_SSE1, sk_aj_SSE); - t1_SSE2 = _mm_add_ps(dr_SSE2, sk_aj_SSE); - t1_SSE3 = _mm_add_ps(dr_SSE3, sk_aj_SSE); - t2_SSE0 = _mm_sub_ps(dr_SSE0, sk_aj_SSE); - t2_SSE1 = _mm_sub_ps(dr_SSE1, sk_aj_SSE); - t2_SSE2 = _mm_sub_ps(dr_SSE2, sk_aj_SSE); - t2_SSE3 = _mm_sub_ps(dr_SSE3, sk_aj_SSE); - t3_SSE0 = _mm_sub_ps(sk_aj_SSE, dr_SSE0); - t3_SSE1 = _mm_sub_ps(sk_aj_SSE, dr_SSE1); - t3_SSE2 = _mm_sub_ps(sk_aj_SSE, dr_SSE2); - t3_SSE3 = _mm_sub_ps(sk_aj_SSE, dr_SSE3); - - obc_mask1_SSE0 = _mm_cmplt_ps(rai_SSE0, t1_SSE0); - obc_mask1_SSE1 = _mm_cmplt_ps(rai_SSE1, t1_SSE1); - obc_mask1_SSE2 = _mm_cmplt_ps(rai_SSE2, t1_SSE2); - obc_mask1_SSE3 = _mm_cmplt_ps(rai_SSE3, t1_SSE3); - obc_mask2_SSE0 = _mm_cmplt_ps(rai_SSE0, t2_SSE0); - obc_mask2_SSE1 = _mm_cmplt_ps(rai_SSE1, t2_SSE1); - obc_mask2_SSE2 = _mm_cmplt_ps(rai_SSE2, t2_SSE2); - obc_mask2_SSE3 = _mm_cmplt_ps(rai_SSE3, t2_SSE3); - obc_mask3_SSE0 = _mm_cmplt_ps(rai_SSE0, t3_SSE0); - obc_mask3_SSE1 = _mm_cmplt_ps(rai_SSE1, t3_SSE1); - 
obc_mask3_SSE2 = _mm_cmplt_ps(rai_SSE2, t3_SSE2); - obc_mask3_SSE3 = _mm_cmplt_ps(rai_SSE3, t3_SSE3); - obc_mask1_SSE0 = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0); - obc_mask1_SSE1 = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1); - obc_mask1_SSE2 = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2); - obc_mask1_SSE3 = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3); - - uij_SSE0 = gmx_mm_inv_ps(t1_SSE0); - uij_SSE1 = gmx_mm_inv_ps(t1_SSE1); - uij_SSE2 = gmx_mm_inv_ps(t1_SSE2); - uij_SSE3 = gmx_mm_inv_ps(t1_SSE3); - lij_SSE0 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)), - _mm_andnot_ps(obc_mask2_SSE0, rai_inv_SSE0)); - lij_SSE1 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)), - _mm_andnot_ps(obc_mask2_SSE1, rai_inv_SSE1)); - lij_SSE2 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)), - _mm_andnot_ps(obc_mask2_SSE2, rai_inv_SSE2)); - lij_SSE3 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)), - _mm_andnot_ps(obc_mask2_SSE3, rai_inv_SSE3)); - dlij_SSE0 = _mm_and_ps(one_SSE, obc_mask2_SSE0); - dlij_SSE1 = _mm_and_ps(one_SSE, obc_mask2_SSE1); - dlij_SSE2 = _mm_and_ps(one_SSE, obc_mask2_SSE2); - dlij_SSE3 = _mm_and_ps(one_SSE, obc_mask2_SSE3); - - uij2_SSE0 = _mm_mul_ps(uij_SSE0, uij_SSE0); - uij2_SSE1 = _mm_mul_ps(uij_SSE1, uij_SSE1); - uij2_SSE2 = _mm_mul_ps(uij_SSE2, uij_SSE2); - uij2_SSE3 = _mm_mul_ps(uij_SSE3, uij_SSE3); - uij3_SSE0 = _mm_mul_ps(uij2_SSE0, uij_SSE0); - uij3_SSE1 = _mm_mul_ps(uij2_SSE1, uij_SSE1); - uij3_SSE2 = _mm_mul_ps(uij2_SSE2, uij_SSE2); - uij3_SSE3 = _mm_mul_ps(uij2_SSE3, uij_SSE3); - lij2_SSE0 = _mm_mul_ps(lij_SSE0, lij_SSE0); - lij2_SSE1 = _mm_mul_ps(lij_SSE1, lij_SSE1); - lij2_SSE2 = _mm_mul_ps(lij_SSE2, lij_SSE2); - lij2_SSE3 = _mm_mul_ps(lij_SSE3, lij_SSE3); - lij3_SSE0 = _mm_mul_ps(lij2_SSE0, lij_SSE0); - lij3_SSE1 = _mm_mul_ps(lij2_SSE1, lij_SSE1); - lij3_SSE2 = _mm_mul_ps(lij2_SSE2, lij_SSE2); - lij3_SSE3 = _mm_mul_ps(lij2_SSE3, lij_SSE3); - - diff2_SSE0 = _mm_sub_ps(uij2_SSE0, lij2_SSE0); - diff2_SSE1 = _mm_sub_ps(uij2_SSE1, lij2_SSE1); - diff2_SSE2 = _mm_sub_ps(uij2_SSE2, lij2_SSE2); - diff2_SSE3 = _mm_sub_ps(uij2_SSE3, lij2_SSE3); - lij_inv_SSE0 = gmx_mm_invsqrt_ps(lij2_SSE0); - lij_inv_SSE1 = gmx_mm_invsqrt_ps(lij2_SSE1); - lij_inv_SSE2 = gmx_mm_invsqrt_ps(lij2_SSE2); - lij_inv_SSE3 = gmx_mm_invsqrt_ps(lij2_SSE3); - sk2_aj_SSE = _mm_mul_ps(sk_aj_SSE, sk_aj_SSE); - sk2_rinv_SSE0 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE0); - sk2_rinv_SSE1 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE1); - sk2_rinv_SSE2 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE2); - sk2_rinv_SSE3 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE3); - prod_SSE0 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0); - prod_SSE1 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1); - prod_SSE2 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2); - prod_SSE3 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3); - - logterm_SSE0 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0)); - logterm_SSE1 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1)); - logterm_SSE2 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2)); - logterm_SSE3 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3)); - - t1_SSE0 = _mm_sub_ps(lij_SSE0, uij_SSE0); - t1_SSE1 = _mm_sub_ps(lij_SSE1, uij_SSE1); - t1_SSE2 = _mm_sub_ps(lij_SSE2, uij_SSE2); - t1_SSE3 = _mm_sub_ps(lij_SSE3, uij_SSE3); - t2_SSE0 = _mm_mul_ps(diff2_SSE0, - _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0), - prod_SSE0)); - t2_SSE1 = _mm_mul_ps(diff2_SSE1, - _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1), - prod_SSE1)); - t2_SSE2 = _mm_mul_ps(diff2_SSE2, - _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2), - prod_SSE2)); 
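            /*
             * For reference, reading the unrolled statements around this point as
             * scalars (same variable names as in this removed kernel): with
             * d = dr = |r_ij|, s = sk_aj (the scaled descreening radius loaded from
             * obc_param) and a_i = rai, the comparisons above encode
             *   obc_mask1 : a_i < d + s   (atom j descreens atom i at all)
             *   obc_mask2 : a_i < d - s   (atom i lies outside the descreening
             *                              sphere, so lij = 1/(d - s); otherwise
             *                              lij falls back to 1/a_i)
             *   obc_mask3 : a_i < s - d   (atom i is engulfed; t4 then adds the
             *                              2*(1/a_i - lij) correction)
             * and with uij = 1/(d + s) the t1..t4 terms assemble the usual
             * HCT/OBC pair integral
             *   I_ij = 0.5 * ( lij - uij
             *                  + (uij^2 - lij^2) * (d/4 - s^2/(4*d))
             *                  + ln(uij/lij) / (2*d)
             *                  + (engulfed ? 2*(1/a_i - lij) : 0) )
             * which is accumulated into sum_ai whenever obc_mask1 is set.
             */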
- t2_SSE3 = _mm_mul_ps(diff2_SSE3, - _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3), - prod_SSE3)); - - t3_SSE0 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0)); - t3_SSE1 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1)); - t3_SSE2 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2)); - t3_SSE3 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3)); - t1_SSE0 = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0)); - t1_SSE1 = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1)); - t1_SSE2 = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2)); - t1_SSE3 = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3)); - t4_SSE0 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE0, lij_SSE0)); - t4_SSE1 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE1, lij_SSE1)); - t4_SSE2 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE2, lij_SSE2)); - t4_SSE3 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE3, lij_SSE3)); - t4_SSE0 = _mm_and_ps(t4_SSE0, obc_mask3_SSE0); - t4_SSE1 = _mm_and_ps(t4_SSE1, obc_mask3_SSE1); - t4_SSE2 = _mm_and_ps(t4_SSE2, obc_mask3_SSE2); - t4_SSE3 = _mm_and_ps(t4_SSE3, obc_mask3_SSE3); - t1_SSE0 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0)); - t1_SSE1 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1)); - t1_SSE2 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2)); - t1_SSE3 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3)); - - sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0, _mm_and_ps(t1_SSE0, obc_mask1_SSE0)); - sum_ai_SSE1 = _mm_add_ps(sum_ai_SSE1, _mm_and_ps(t1_SSE1, obc_mask1_SSE1)); - sum_ai_SSE2 = _mm_add_ps(sum_ai_SSE2, _mm_and_ps(t1_SSE2, obc_mask1_SSE2)); - sum_ai_SSE3 = _mm_add_ps(sum_ai_SSE3, _mm_and_ps(t1_SSE3, obc_mask1_SSE3)); - - t1_SSE0 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0), - _mm_mul_ps(prod_SSE0, lij3_SSE0)); - t1_SSE1 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1), - _mm_mul_ps(prod_SSE1, lij3_SSE1)); - t1_SSE2 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2), - _mm_mul_ps(prod_SSE2, lij3_SSE2)); - t1_SSE3 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3), - _mm_mul_ps(prod_SSE3, lij3_SSE3)); - t1_SSE0 = _mm_sub_ps(t1_SSE0, - _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0), - _mm_mul_ps(lij3_SSE0, dr_SSE0)))); - t1_SSE1 = _mm_sub_ps(t1_SSE1, - _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1), - _mm_mul_ps(lij3_SSE1, dr_SSE1)))); - t1_SSE2 = _mm_sub_ps(t1_SSE2, - _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2), - _mm_mul_ps(lij3_SSE2, dr_SSE2)))); - t1_SSE3 = _mm_sub_ps(t1_SSE3, - _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3), - _mm_mul_ps(lij3_SSE3, dr_SSE3)))); - - t2_SSE0 = _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0), - _mm_mul_ps(uij3_SSE0, dr_SSE0))); - t2_SSE1 = _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1), - _mm_mul_ps(uij3_SSE1, dr_SSE1))); - t2_SSE2 = _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2), - _mm_mul_ps(uij3_SSE2, dr_SSE2))); - t2_SSE3 = _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3), - _mm_mul_ps(uij3_SSE3, dr_SSE3))); - t2_SSE0 = _mm_sub_ps(t2_SSE0, - _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0), - _mm_mul_ps(prod_SSE0, uij3_SSE0))); - t2_SSE1 = _mm_sub_ps(t2_SSE1, - _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1), - _mm_mul_ps(prod_SSE1, uij3_SSE1))); - t2_SSE2 = _mm_sub_ps(t2_SSE2, - _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2), - _mm_mul_ps(prod_SSE2, uij3_SSE2))); - t2_SSE3 = _mm_sub_ps(t2_SSE3, - _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3), - 
_mm_mul_ps(prod_SSE3, uij3_SSE3))); - t3_SSE0 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0), - _mm_mul_ps(rinv_SSE0, rinv_SSE0)); - t3_SSE1 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1), - _mm_mul_ps(rinv_SSE1, rinv_SSE1)); - t3_SSE2 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2), - _mm_mul_ps(rinv_SSE2, rinv_SSE2)); - t3_SSE3 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3), - _mm_mul_ps(rinv_SSE3, rinv_SSE3)); - t3_SSE0 = _mm_sub_ps(t3_SSE0, - _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE), - _mm_add_ps(one_SSE, - _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0)))); - t3_SSE1 = _mm_sub_ps(t3_SSE1, - _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE), - _mm_add_ps(one_SSE, - _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1)))); - t3_SSE2 = _mm_sub_ps(t3_SSE2, - _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE), - _mm_add_ps(one_SSE, - _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2)))); - t3_SSE3 = _mm_sub_ps(t3_SSE3, - _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE), - _mm_add_ps(one_SSE, - _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3)))); - - t1_SSE0 = _mm_mul_ps(rinv_SSE0, - _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0), - _mm_add_ps(t2_SSE0, t3_SSE0))); - t1_SSE1 = _mm_mul_ps(rinv_SSE1, - _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1), - _mm_add_ps(t2_SSE1, t3_SSE1))); - t1_SSE2 = _mm_mul_ps(rinv_SSE2, - _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2), - _mm_add_ps(t2_SSE2, t3_SSE2))); - t1_SSE3 = _mm_mul_ps(rinv_SSE3, - _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3), - _mm_add_ps(t2_SSE3, t3_SSE3))); - - _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0)); - dadx += 4; - _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1)); - dadx += 4; - _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2)); - dadx += 4; - _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3)); - dadx += 4; - - /* Evaluate influence of atom ai -> aj */ - t1_SSE0 = _mm_add_ps(dr_SSE0, sk_ai_SSE0); - t1_SSE1 = _mm_add_ps(dr_SSE1, sk_ai_SSE1); - t1_SSE2 = _mm_add_ps(dr_SSE2, sk_ai_SSE2); - t1_SSE3 = _mm_add_ps(dr_SSE3, sk_ai_SSE3); - t2_SSE0 = _mm_sub_ps(dr_SSE0, sk_ai_SSE0); - t2_SSE1 = _mm_sub_ps(dr_SSE1, sk_ai_SSE1); - t2_SSE2 = _mm_sub_ps(dr_SSE2, sk_ai_SSE2); - t2_SSE3 = _mm_sub_ps(dr_SSE3, sk_ai_SSE3); - t3_SSE0 = _mm_sub_ps(sk_ai_SSE0, dr_SSE0); - t3_SSE1 = _mm_sub_ps(sk_ai_SSE1, dr_SSE1); - t3_SSE2 = _mm_sub_ps(sk_ai_SSE2, dr_SSE2); - t3_SSE3 = _mm_sub_ps(sk_ai_SSE3, dr_SSE3); - - obc_mask1_SSE0 = _mm_cmplt_ps(raj_SSE, t1_SSE0); - obc_mask1_SSE1 = _mm_cmplt_ps(raj_SSE, t1_SSE1); - obc_mask1_SSE2 = _mm_cmplt_ps(raj_SSE, t1_SSE2); - obc_mask1_SSE3 = _mm_cmplt_ps(raj_SSE, t1_SSE3); - obc_mask2_SSE0 = _mm_cmplt_ps(raj_SSE, t2_SSE0); - obc_mask2_SSE1 = _mm_cmplt_ps(raj_SSE, t2_SSE1); - obc_mask2_SSE2 = _mm_cmplt_ps(raj_SSE, t2_SSE2); - obc_mask2_SSE3 = _mm_cmplt_ps(raj_SSE, t2_SSE3); - obc_mask3_SSE0 = _mm_cmplt_ps(raj_SSE, t3_SSE0); - obc_mask3_SSE1 = _mm_cmplt_ps(raj_SSE, t3_SSE1); - obc_mask3_SSE2 = _mm_cmplt_ps(raj_SSE, t3_SSE2); - obc_mask3_SSE3 = _mm_cmplt_ps(raj_SSE, t3_SSE3); - obc_mask1_SSE0 = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0); - obc_mask1_SSE1 = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1); - obc_mask1_SSE2 = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2); - obc_mask1_SSE3 = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3); - - uij_SSE0 = gmx_mm_inv_ps(t1_SSE0); - uij_SSE1 = gmx_mm_inv_ps(t1_SSE1); - uij_SSE2 = gmx_mm_inv_ps(t1_SSE2); - uij_SSE3 = gmx_mm_inv_ps(t1_SSE3); - lij_SSE0 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)), - _mm_andnot_ps(obc_mask2_SSE0, raj_inv_SSE)); - lij_SSE1 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE1, 
gmx_mm_inv_ps(t2_SSE1)), - _mm_andnot_ps(obc_mask2_SSE1, raj_inv_SSE)); - lij_SSE2 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)), - _mm_andnot_ps(obc_mask2_SSE2, raj_inv_SSE)); - lij_SSE3 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)), - _mm_andnot_ps(obc_mask2_SSE3, raj_inv_SSE)); - dlij_SSE0 = _mm_and_ps(one_SSE, obc_mask2_SSE0); - dlij_SSE1 = _mm_and_ps(one_SSE, obc_mask2_SSE1); - dlij_SSE2 = _mm_and_ps(one_SSE, obc_mask2_SSE2); - dlij_SSE3 = _mm_and_ps(one_SSE, obc_mask2_SSE3); - - uij2_SSE0 = _mm_mul_ps(uij_SSE0, uij_SSE0); - uij2_SSE1 = _mm_mul_ps(uij_SSE1, uij_SSE1); - uij2_SSE2 = _mm_mul_ps(uij_SSE2, uij_SSE2); - uij2_SSE3 = _mm_mul_ps(uij_SSE3, uij_SSE3); - uij3_SSE0 = _mm_mul_ps(uij2_SSE0, uij_SSE0); - uij3_SSE1 = _mm_mul_ps(uij2_SSE1, uij_SSE1); - uij3_SSE2 = _mm_mul_ps(uij2_SSE2, uij_SSE2); - uij3_SSE3 = _mm_mul_ps(uij2_SSE3, uij_SSE3); - lij2_SSE0 = _mm_mul_ps(lij_SSE0, lij_SSE0); - lij2_SSE1 = _mm_mul_ps(lij_SSE1, lij_SSE1); - lij2_SSE2 = _mm_mul_ps(lij_SSE2, lij_SSE2); - lij2_SSE3 = _mm_mul_ps(lij_SSE3, lij_SSE3); - lij3_SSE0 = _mm_mul_ps(lij2_SSE0, lij_SSE0); - lij3_SSE1 = _mm_mul_ps(lij2_SSE1, lij_SSE1); - lij3_SSE2 = _mm_mul_ps(lij2_SSE2, lij_SSE2); - lij3_SSE3 = _mm_mul_ps(lij2_SSE3, lij_SSE3); - - diff2_SSE0 = _mm_sub_ps(uij2_SSE0, lij2_SSE0); - diff2_SSE1 = _mm_sub_ps(uij2_SSE1, lij2_SSE1); - diff2_SSE2 = _mm_sub_ps(uij2_SSE2, lij2_SSE2); - diff2_SSE3 = _mm_sub_ps(uij2_SSE3, lij2_SSE3); - lij_inv_SSE0 = gmx_mm_invsqrt_ps(lij2_SSE0); - lij_inv_SSE1 = gmx_mm_invsqrt_ps(lij2_SSE1); - lij_inv_SSE2 = gmx_mm_invsqrt_ps(lij2_SSE2); - lij_inv_SSE3 = gmx_mm_invsqrt_ps(lij2_SSE3); - sk2_rinv_SSE0 = _mm_mul_ps(sk2_ai_SSE0, rinv_SSE0); - sk2_rinv_SSE1 = _mm_mul_ps(sk2_ai_SSE1, rinv_SSE1); - sk2_rinv_SSE2 = _mm_mul_ps(sk2_ai_SSE2, rinv_SSE2); - sk2_rinv_SSE3 = _mm_mul_ps(sk2_ai_SSE3, rinv_SSE3); - prod_SSE0 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0); - prod_SSE1 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1); - prod_SSE2 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2); - prod_SSE3 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3); - - logterm_SSE0 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0)); - logterm_SSE1 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1)); - logterm_SSE2 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2)); - logterm_SSE3 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3)); - t1_SSE0 = _mm_sub_ps(lij_SSE0, uij_SSE0); - t1_SSE1 = _mm_sub_ps(lij_SSE1, uij_SSE1); - t1_SSE2 = _mm_sub_ps(lij_SSE2, uij_SSE2); - t1_SSE3 = _mm_sub_ps(lij_SSE3, uij_SSE3); - t2_SSE0 = _mm_mul_ps(diff2_SSE0, - _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0), - prod_SSE0)); - t2_SSE1 = _mm_mul_ps(diff2_SSE1, - _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1), - prod_SSE1)); - t2_SSE2 = _mm_mul_ps(diff2_SSE2, - _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2), - prod_SSE2)); - t2_SSE3 = _mm_mul_ps(diff2_SSE3, - _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3), - prod_SSE3)); - t3_SSE0 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0)); - t3_SSE1 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1)); - t3_SSE2 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2)); - t3_SSE3 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3)); - t1_SSE0 = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0)); - t1_SSE1 = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1)); - t1_SSE2 = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2)); - t1_SSE3 = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3)); - t4_SSE0 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE0)); - t4_SSE1 
= _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE1)); - t4_SSE2 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE2)); - t4_SSE3 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE3)); - t4_SSE0 = _mm_and_ps(t4_SSE0, obc_mask3_SSE0); - t4_SSE1 = _mm_and_ps(t4_SSE1, obc_mask3_SSE1); - t4_SSE2 = _mm_and_ps(t4_SSE2, obc_mask3_SSE2); - t4_SSE3 = _mm_and_ps(t4_SSE3, obc_mask3_SSE3); - t1_SSE0 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0)); - t1_SSE1 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1)); - t1_SSE2 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2)); - t1_SSE3 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3)); - - _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j), - gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0, obc_mask1_SSE0), - _mm_and_ps(t1_SSE1, obc_mask1_SSE1), - _mm_and_ps(t1_SSE2, obc_mask1_SSE2), - _mm_and_ps(t1_SSE3, obc_mask1_SSE3)))); - - t1_SSE0 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0), - _mm_mul_ps(prod_SSE0, lij3_SSE0)); - t1_SSE1 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1), - _mm_mul_ps(prod_SSE1, lij3_SSE1)); - t1_SSE2 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2), - _mm_mul_ps(prod_SSE2, lij3_SSE2)); - t1_SSE3 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3), - _mm_mul_ps(prod_SSE3, lij3_SSE3)); - t1_SSE0 = _mm_sub_ps(t1_SSE0, - _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0), - _mm_mul_ps(lij3_SSE0, dr_SSE0)))); - t1_SSE1 = _mm_sub_ps(t1_SSE1, - _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1), - _mm_mul_ps(lij3_SSE1, dr_SSE1)))); - t1_SSE2 = _mm_sub_ps(t1_SSE2, - _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2), - _mm_mul_ps(lij3_SSE2, dr_SSE2)))); - t1_SSE3 = _mm_sub_ps(t1_SSE3, - _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3), - _mm_mul_ps(lij3_SSE3, dr_SSE3)))); - t2_SSE0 = _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0), - _mm_mul_ps(uij3_SSE0, dr_SSE0))); - t2_SSE1 = _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1), - _mm_mul_ps(uij3_SSE1, dr_SSE1))); - t2_SSE2 = _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2), - _mm_mul_ps(uij3_SSE2, dr_SSE2))); - t2_SSE3 = _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3), - _mm_mul_ps(uij3_SSE3, dr_SSE3))); - t2_SSE0 = _mm_sub_ps(t2_SSE0, - _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0), - _mm_mul_ps(prod_SSE0, uij3_SSE0))); - t2_SSE1 = _mm_sub_ps(t2_SSE1, - _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1), - _mm_mul_ps(prod_SSE1, uij3_SSE1))); - t2_SSE2 = _mm_sub_ps(t2_SSE2, - _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2), - _mm_mul_ps(prod_SSE2, uij3_SSE2))); - t2_SSE3 = _mm_sub_ps(t2_SSE3, - _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3), - _mm_mul_ps(prod_SSE3, uij3_SSE3))); - - t3_SSE0 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0), - _mm_mul_ps(rinv_SSE0, rinv_SSE0)); - t3_SSE1 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1), - _mm_mul_ps(rinv_SSE1, rinv_SSE1)); - t3_SSE2 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2), - _mm_mul_ps(rinv_SSE2, rinv_SSE2)); - t3_SSE3 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3), - _mm_mul_ps(rinv_SSE3, rinv_SSE3)); - - t3_SSE0 = _mm_sub_ps(t3_SSE0, - _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE), - _mm_add_ps(one_SSE, - _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0)))); - t3_SSE1 = _mm_sub_ps(t3_SSE1, - _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE), - _mm_add_ps(one_SSE, - _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1)))); - t3_SSE2 = _mm_sub_ps(t3_SSE2, - _mm_mul_ps(_mm_mul_ps(diff2_SSE2, 
oneeighth_SSE), - _mm_add_ps(one_SSE, - _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2)))); - t3_SSE3 = _mm_sub_ps(t3_SSE3, - _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE), - _mm_add_ps(one_SSE, - _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3)))); - - - t1_SSE0 = _mm_mul_ps(rinv_SSE0, - _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0), - _mm_add_ps(t2_SSE0, t3_SSE0))); - t1_SSE1 = _mm_mul_ps(rinv_SSE1, - _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1), - _mm_add_ps(t2_SSE1, t3_SSE1))); - t1_SSE2 = _mm_mul_ps(rinv_SSE2, - _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2), - _mm_add_ps(t2_SSE2, t3_SSE2))); - t1_SSE3 = _mm_mul_ps(rinv_SSE3, - _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3), - _mm_add_ps(t2_SSE3, t3_SSE3))); - - _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0)); - dadx += 4; - _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1)); - dadx += 4; - _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2)); - dadx += 4; - _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3)); - dadx += 4; - - } - - /* Main part, no exclusions */ - for (j = nj1; j < nj2; j += UNROLLJ) - { - /* load j atom coordinates */ - jx_SSE = _mm_load_ps(x_align+j); - jy_SSE = _mm_load_ps(y_align+j); - jz_SSE = _mm_load_ps(z_align+j); - - /* Calculate distance */ - dx_SSE0 = _mm_sub_ps(ix_SSE0, jx_SSE); - dy_SSE0 = _mm_sub_ps(iy_SSE0, jy_SSE); - dz_SSE0 = _mm_sub_ps(iz_SSE0, jz_SSE); - dx_SSE1 = _mm_sub_ps(ix_SSE1, jx_SSE); - dy_SSE1 = _mm_sub_ps(iy_SSE1, jy_SSE); - dz_SSE1 = _mm_sub_ps(iz_SSE1, jz_SSE); - dx_SSE2 = _mm_sub_ps(ix_SSE2, jx_SSE); - dy_SSE2 = _mm_sub_ps(iy_SSE2, jy_SSE); - dz_SSE2 = _mm_sub_ps(iz_SSE2, jz_SSE); - dx_SSE3 = _mm_sub_ps(ix_SSE3, jx_SSE); - dy_SSE3 = _mm_sub_ps(iy_SSE3, jy_SSE); - dz_SSE3 = _mm_sub_ps(iz_SSE3, jz_SSE); - - /* rsq = dx*dx+dy*dy+dz*dz */ - rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0); - rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1); - rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2); - rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3); - - /* Calculate 1/r and 1/r2 */ - rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0); - rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1); - rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2); - rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3); - - /* Apply mask */ - rinv_SSE0 = _mm_and_ps(rinv_SSE0, imask_SSE0); - rinv_SSE1 = _mm_and_ps(rinv_SSE1, imask_SSE1); - rinv_SSE2 = _mm_and_ps(rinv_SSE2, imask_SSE2); - rinv_SSE3 = _mm_and_ps(rinv_SSE3, imask_SSE3); - - dr_SSE0 = _mm_mul_ps(rsq_SSE0, rinv_SSE0); - dr_SSE1 = _mm_mul_ps(rsq_SSE1, rinv_SSE1); - dr_SSE2 = _mm_mul_ps(rsq_SSE2, rinv_SSE2); - dr_SSE3 = _mm_mul_ps(rsq_SSE3, rinv_SSE3); - - sk_aj_SSE = _mm_load_ps(obc_param+j); - raj_SSE = _mm_load_ps(gb_radius+j); - - raj_inv_SSE = gmx_mm_inv_ps(raj_SSE); - - /* Evaluate influence of atom aj -> ai */ - t1_SSE0 = _mm_add_ps(dr_SSE0, sk_aj_SSE); - t1_SSE1 = _mm_add_ps(dr_SSE1, sk_aj_SSE); - t1_SSE2 = _mm_add_ps(dr_SSE2, sk_aj_SSE); - t1_SSE3 = _mm_add_ps(dr_SSE3, sk_aj_SSE); - t2_SSE0 = _mm_sub_ps(dr_SSE0, sk_aj_SSE); - t2_SSE1 = _mm_sub_ps(dr_SSE1, sk_aj_SSE); - t2_SSE2 = _mm_sub_ps(dr_SSE2, sk_aj_SSE); - t2_SSE3 = _mm_sub_ps(dr_SSE3, sk_aj_SSE); - t3_SSE0 = _mm_sub_ps(sk_aj_SSE, dr_SSE0); - t3_SSE1 = _mm_sub_ps(sk_aj_SSE, dr_SSE1); - t3_SSE2 = _mm_sub_ps(sk_aj_SSE, dr_SSE2); - t3_SSE3 = _mm_sub_ps(sk_aj_SSE, dr_SSE3); - - obc_mask1_SSE0 = _mm_cmplt_ps(rai_SSE0, t1_SSE0); - obc_mask1_SSE1 = _mm_cmplt_ps(rai_SSE1, t1_SSE1); - obc_mask1_SSE2 = _mm_cmplt_ps(rai_SSE2, t1_SSE2); - obc_mask1_SSE3 = _mm_cmplt_ps(rai_SSE3, t1_SSE3); - obc_mask2_SSE0 = _mm_cmplt_ps(rai_SSE0, t2_SSE0); 
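            /*
             * Main-loop version of the same aj -> ai evaluation: the j range
             * nj1..nj2 contains no excluded pairs, so only the per-i-atom imask
             * is applied here; the t1/t2/t3 distances and the obc_mask1/2/3
             * comparisons otherwise follow the prologue loop above.
             */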
- obc_mask2_SSE1 = _mm_cmplt_ps(rai_SSE1, t2_SSE1); - obc_mask2_SSE2 = _mm_cmplt_ps(rai_SSE2, t2_SSE2); - obc_mask2_SSE3 = _mm_cmplt_ps(rai_SSE3, t2_SSE3); - obc_mask3_SSE0 = _mm_cmplt_ps(rai_SSE0, t3_SSE0); - obc_mask3_SSE1 = _mm_cmplt_ps(rai_SSE1, t3_SSE1); - obc_mask3_SSE2 = _mm_cmplt_ps(rai_SSE2, t3_SSE2); - obc_mask3_SSE3 = _mm_cmplt_ps(rai_SSE3, t3_SSE3); - obc_mask1_SSE0 = _mm_and_ps(obc_mask1_SSE0, imask_SSE0); - obc_mask1_SSE1 = _mm_and_ps(obc_mask1_SSE1, imask_SSE1); - obc_mask1_SSE2 = _mm_and_ps(obc_mask1_SSE2, imask_SSE2); - obc_mask1_SSE3 = _mm_and_ps(obc_mask1_SSE3, imask_SSE3); - - uij_SSE0 = gmx_mm_inv_ps(t1_SSE0); - uij_SSE1 = gmx_mm_inv_ps(t1_SSE1); - uij_SSE2 = gmx_mm_inv_ps(t1_SSE2); - uij_SSE3 = gmx_mm_inv_ps(t1_SSE3); - lij_SSE0 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)), - _mm_andnot_ps(obc_mask2_SSE0, rai_inv_SSE0)); - lij_SSE1 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)), - _mm_andnot_ps(obc_mask2_SSE1, rai_inv_SSE1)); - lij_SSE2 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)), - _mm_andnot_ps(obc_mask2_SSE2, rai_inv_SSE2)); - lij_SSE3 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)), - _mm_andnot_ps(obc_mask2_SSE3, rai_inv_SSE3)); - dlij_SSE0 = _mm_and_ps(one_SSE, obc_mask2_SSE0); - dlij_SSE1 = _mm_and_ps(one_SSE, obc_mask2_SSE1); - dlij_SSE2 = _mm_and_ps(one_SSE, obc_mask2_SSE2); - dlij_SSE3 = _mm_and_ps(one_SSE, obc_mask2_SSE3); - - uij2_SSE0 = _mm_mul_ps(uij_SSE0, uij_SSE0); - uij2_SSE1 = _mm_mul_ps(uij_SSE1, uij_SSE1); - uij2_SSE2 = _mm_mul_ps(uij_SSE2, uij_SSE2); - uij2_SSE3 = _mm_mul_ps(uij_SSE3, uij_SSE3); - uij3_SSE0 = _mm_mul_ps(uij2_SSE0, uij_SSE0); - uij3_SSE1 = _mm_mul_ps(uij2_SSE1, uij_SSE1); - uij3_SSE2 = _mm_mul_ps(uij2_SSE2, uij_SSE2); - uij3_SSE3 = _mm_mul_ps(uij2_SSE3, uij_SSE3); - lij2_SSE0 = _mm_mul_ps(lij_SSE0, lij_SSE0); - lij2_SSE1 = _mm_mul_ps(lij_SSE1, lij_SSE1); - lij2_SSE2 = _mm_mul_ps(lij_SSE2, lij_SSE2); - lij2_SSE3 = _mm_mul_ps(lij_SSE3, lij_SSE3); - lij3_SSE0 = _mm_mul_ps(lij2_SSE0, lij_SSE0); - lij3_SSE1 = _mm_mul_ps(lij2_SSE1, lij_SSE1); - lij3_SSE2 = _mm_mul_ps(lij2_SSE2, lij_SSE2); - lij3_SSE3 = _mm_mul_ps(lij2_SSE3, lij_SSE3); - - diff2_SSE0 = _mm_sub_ps(uij2_SSE0, lij2_SSE0); - diff2_SSE1 = _mm_sub_ps(uij2_SSE1, lij2_SSE1); - diff2_SSE2 = _mm_sub_ps(uij2_SSE2, lij2_SSE2); - diff2_SSE3 = _mm_sub_ps(uij2_SSE3, lij2_SSE3); - lij_inv_SSE0 = gmx_mm_invsqrt_ps(lij2_SSE0); - lij_inv_SSE1 = gmx_mm_invsqrt_ps(lij2_SSE1); - lij_inv_SSE2 = gmx_mm_invsqrt_ps(lij2_SSE2); - lij_inv_SSE3 = gmx_mm_invsqrt_ps(lij2_SSE3); - sk2_aj_SSE = _mm_mul_ps(sk_aj_SSE, sk_aj_SSE); - sk2_rinv_SSE0 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE0); - sk2_rinv_SSE1 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE1); - sk2_rinv_SSE2 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE2); - sk2_rinv_SSE3 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE3); - prod_SSE0 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0); - prod_SSE1 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1); - prod_SSE2 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2); - prod_SSE3 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3); - - logterm_SSE0 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0)); - logterm_SSE1 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1)); - logterm_SSE2 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2)); - logterm_SSE3 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3)); - - t1_SSE0 = _mm_sub_ps(lij_SSE0, uij_SSE0); - t1_SSE1 = _mm_sub_ps(lij_SSE1, uij_SSE1); - t1_SSE2 = _mm_sub_ps(lij_SSE2, uij_SSE2); - t1_SSE3 = _mm_sub_ps(lij_SSE3, uij_SSE3); - t2_SSE0 = _mm_mul_ps(diff2_SSE0, 
- _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0), - prod_SSE0)); - t2_SSE1 = _mm_mul_ps(diff2_SSE1, - _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1), - prod_SSE1)); - t2_SSE2 = _mm_mul_ps(diff2_SSE2, - _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2), - prod_SSE2)); - t2_SSE3 = _mm_mul_ps(diff2_SSE3, - _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3), - prod_SSE3)); - - t3_SSE0 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0)); - t3_SSE1 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1)); - t3_SSE2 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2)); - t3_SSE3 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3)); - t1_SSE0 = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0)); - t1_SSE1 = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1)); - t1_SSE2 = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2)); - t1_SSE3 = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3)); - t4_SSE0 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE0, lij_SSE0)); - t4_SSE1 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE1, lij_SSE1)); - t4_SSE2 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE2, lij_SSE2)); - t4_SSE3 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE3, lij_SSE3)); - t4_SSE0 = _mm_and_ps(t4_SSE0, obc_mask3_SSE0); - t4_SSE1 = _mm_and_ps(t4_SSE1, obc_mask3_SSE1); - t4_SSE2 = _mm_and_ps(t4_SSE2, obc_mask3_SSE2); - t4_SSE3 = _mm_and_ps(t4_SSE3, obc_mask3_SSE3); - t1_SSE0 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0)); - t1_SSE1 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1)); - t1_SSE2 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2)); - t1_SSE3 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3)); - - sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0, _mm_and_ps(t1_SSE0, obc_mask1_SSE0)); - sum_ai_SSE1 = _mm_add_ps(sum_ai_SSE1, _mm_and_ps(t1_SSE1, obc_mask1_SSE1)); - sum_ai_SSE2 = _mm_add_ps(sum_ai_SSE2, _mm_and_ps(t1_SSE2, obc_mask1_SSE2)); - sum_ai_SSE3 = _mm_add_ps(sum_ai_SSE3, _mm_and_ps(t1_SSE3, obc_mask1_SSE3)); - - t1_SSE0 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0), - _mm_mul_ps(prod_SSE0, lij3_SSE0)); - t1_SSE1 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1), - _mm_mul_ps(prod_SSE1, lij3_SSE1)); - t1_SSE2 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2), - _mm_mul_ps(prod_SSE2, lij3_SSE2)); - t1_SSE3 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3), - _mm_mul_ps(prod_SSE3, lij3_SSE3)); - t1_SSE0 = _mm_sub_ps(t1_SSE0, - _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0), - _mm_mul_ps(lij3_SSE0, dr_SSE0)))); - t1_SSE1 = _mm_sub_ps(t1_SSE1, - _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1), - _mm_mul_ps(lij3_SSE1, dr_SSE1)))); - t1_SSE2 = _mm_sub_ps(t1_SSE2, - _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2), - _mm_mul_ps(lij3_SSE2, dr_SSE2)))); - t1_SSE3 = _mm_sub_ps(t1_SSE3, - _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3), - _mm_mul_ps(lij3_SSE3, dr_SSE3)))); - - t2_SSE0 = _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0), - _mm_mul_ps(uij3_SSE0, dr_SSE0))); - t2_SSE1 = _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1), - _mm_mul_ps(uij3_SSE1, dr_SSE1))); - t2_SSE2 = _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2), - _mm_mul_ps(uij3_SSE2, dr_SSE2))); - t2_SSE3 = _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3), - _mm_mul_ps(uij3_SSE3, dr_SSE3))); - t2_SSE0 = _mm_sub_ps(t2_SSE0, - _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0), - _mm_mul_ps(prod_SSE0, uij3_SSE0))); - t2_SSE1 = _mm_sub_ps(t2_SSE1, - _mm_add_ps(_mm_mul_ps(half_SSE, 
uij2_SSE1), - _mm_mul_ps(prod_SSE1, uij3_SSE1))); - t2_SSE2 = _mm_sub_ps(t2_SSE2, - _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2), - _mm_mul_ps(prod_SSE2, uij3_SSE2))); - t2_SSE3 = _mm_sub_ps(t2_SSE3, - _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3), - _mm_mul_ps(prod_SSE3, uij3_SSE3))); - t3_SSE0 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0), - _mm_mul_ps(rinv_SSE0, rinv_SSE0)); - t3_SSE1 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1), - _mm_mul_ps(rinv_SSE1, rinv_SSE1)); - t3_SSE2 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2), - _mm_mul_ps(rinv_SSE2, rinv_SSE2)); - t3_SSE3 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3), - _mm_mul_ps(rinv_SSE3, rinv_SSE3)); - t3_SSE0 = _mm_sub_ps(t3_SSE0, - _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE), - _mm_add_ps(one_SSE, - _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0)))); - t3_SSE1 = _mm_sub_ps(t3_SSE1, - _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE), - _mm_add_ps(one_SSE, - _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1)))); - t3_SSE2 = _mm_sub_ps(t3_SSE2, - _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE), - _mm_add_ps(one_SSE, - _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2)))); - t3_SSE3 = _mm_sub_ps(t3_SSE3, - _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE), - _mm_add_ps(one_SSE, - _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3)))); - - t1_SSE0 = _mm_mul_ps(rinv_SSE0, - _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0), - _mm_add_ps(t2_SSE0, t3_SSE0))); - t1_SSE1 = _mm_mul_ps(rinv_SSE1, - _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1), - _mm_add_ps(t2_SSE1, t3_SSE1))); - t1_SSE2 = _mm_mul_ps(rinv_SSE2, - _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2), - _mm_add_ps(t2_SSE2, t3_SSE2))); - t1_SSE3 = _mm_mul_ps(rinv_SSE3, - _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3), - _mm_add_ps(t2_SSE3, t3_SSE3))); - - _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0)); - dadx += 4; - _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1)); - dadx += 4; - _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2)); - dadx += 4; - _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3)); - dadx += 4; - - /* Evaluate influence of atom ai -> aj */ - t1_SSE0 = _mm_add_ps(dr_SSE0, sk_ai_SSE0); - t1_SSE1 = _mm_add_ps(dr_SSE1, sk_ai_SSE1); - t1_SSE2 = _mm_add_ps(dr_SSE2, sk_ai_SSE2); - t1_SSE3 = _mm_add_ps(dr_SSE3, sk_ai_SSE3); - t2_SSE0 = _mm_sub_ps(dr_SSE0, sk_ai_SSE0); - t2_SSE1 = _mm_sub_ps(dr_SSE1, sk_ai_SSE1); - t2_SSE2 = _mm_sub_ps(dr_SSE2, sk_ai_SSE2); - t2_SSE3 = _mm_sub_ps(dr_SSE3, sk_ai_SSE3); - t3_SSE0 = _mm_sub_ps(sk_ai_SSE0, dr_SSE0); - t3_SSE1 = _mm_sub_ps(sk_ai_SSE1, dr_SSE1); - t3_SSE2 = _mm_sub_ps(sk_ai_SSE2, dr_SSE2); - t3_SSE3 = _mm_sub_ps(sk_ai_SSE3, dr_SSE3); - - obc_mask1_SSE0 = _mm_cmplt_ps(raj_SSE, t1_SSE0); - obc_mask1_SSE1 = _mm_cmplt_ps(raj_SSE, t1_SSE1); - obc_mask1_SSE2 = _mm_cmplt_ps(raj_SSE, t1_SSE2); - obc_mask1_SSE3 = _mm_cmplt_ps(raj_SSE, t1_SSE3); - obc_mask2_SSE0 = _mm_cmplt_ps(raj_SSE, t2_SSE0); - obc_mask2_SSE1 = _mm_cmplt_ps(raj_SSE, t2_SSE1); - obc_mask2_SSE2 = _mm_cmplt_ps(raj_SSE, t2_SSE2); - obc_mask2_SSE3 = _mm_cmplt_ps(raj_SSE, t2_SSE3); - obc_mask3_SSE0 = _mm_cmplt_ps(raj_SSE, t3_SSE0); - obc_mask3_SSE1 = _mm_cmplt_ps(raj_SSE, t3_SSE1); - obc_mask3_SSE2 = _mm_cmplt_ps(raj_SSE, t3_SSE2); - obc_mask3_SSE3 = _mm_cmplt_ps(raj_SSE, t3_SSE3); - obc_mask1_SSE0 = _mm_and_ps(obc_mask1_SSE0, imask_SSE0); - obc_mask1_SSE1 = _mm_and_ps(obc_mask1_SSE1, imask_SSE1); - obc_mask1_SSE2 = _mm_and_ps(obc_mask1_SSE2, imask_SSE2); - obc_mask1_SSE3 = _mm_and_ps(obc_mask1_SSE3, imask_SSE3); - - uij_SSE0 = gmx_mm_inv_ps(t1_SSE0); - uij_SSE1 = gmx_mm_inv_ps(t1_SSE1); - uij_SSE2 = 
gmx_mm_inv_ps(t1_SSE2); - uij_SSE3 = gmx_mm_inv_ps(t1_SSE3); - lij_SSE0 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)), - _mm_andnot_ps(obc_mask2_SSE0, raj_inv_SSE)); - lij_SSE1 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)), - _mm_andnot_ps(obc_mask2_SSE1, raj_inv_SSE)); - lij_SSE2 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)), - _mm_andnot_ps(obc_mask2_SSE2, raj_inv_SSE)); - lij_SSE3 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)), - _mm_andnot_ps(obc_mask2_SSE3, raj_inv_SSE)); - dlij_SSE0 = _mm_and_ps(one_SSE, obc_mask2_SSE0); - dlij_SSE1 = _mm_and_ps(one_SSE, obc_mask2_SSE1); - dlij_SSE2 = _mm_and_ps(one_SSE, obc_mask2_SSE2); - dlij_SSE3 = _mm_and_ps(one_SSE, obc_mask2_SSE3); - - uij2_SSE0 = _mm_mul_ps(uij_SSE0, uij_SSE0); - uij2_SSE1 = _mm_mul_ps(uij_SSE1, uij_SSE1); - uij2_SSE2 = _mm_mul_ps(uij_SSE2, uij_SSE2); - uij2_SSE3 = _mm_mul_ps(uij_SSE3, uij_SSE3); - uij3_SSE0 = _mm_mul_ps(uij2_SSE0, uij_SSE0); - uij3_SSE1 = _mm_mul_ps(uij2_SSE1, uij_SSE1); - uij3_SSE2 = _mm_mul_ps(uij2_SSE2, uij_SSE2); - uij3_SSE3 = _mm_mul_ps(uij2_SSE3, uij_SSE3); - lij2_SSE0 = _mm_mul_ps(lij_SSE0, lij_SSE0); - lij2_SSE1 = _mm_mul_ps(lij_SSE1, lij_SSE1); - lij2_SSE2 = _mm_mul_ps(lij_SSE2, lij_SSE2); - lij2_SSE3 = _mm_mul_ps(lij_SSE3, lij_SSE3); - lij3_SSE0 = _mm_mul_ps(lij2_SSE0, lij_SSE0); - lij3_SSE1 = _mm_mul_ps(lij2_SSE1, lij_SSE1); - lij3_SSE2 = _mm_mul_ps(lij2_SSE2, lij_SSE2); - lij3_SSE3 = _mm_mul_ps(lij2_SSE3, lij_SSE3); - - diff2_SSE0 = _mm_sub_ps(uij2_SSE0, lij2_SSE0); - diff2_SSE1 = _mm_sub_ps(uij2_SSE1, lij2_SSE1); - diff2_SSE2 = _mm_sub_ps(uij2_SSE2, lij2_SSE2); - diff2_SSE3 = _mm_sub_ps(uij2_SSE3, lij2_SSE3); - lij_inv_SSE0 = gmx_mm_invsqrt_ps(lij2_SSE0); - lij_inv_SSE1 = gmx_mm_invsqrt_ps(lij2_SSE1); - lij_inv_SSE2 = gmx_mm_invsqrt_ps(lij2_SSE2); - lij_inv_SSE3 = gmx_mm_invsqrt_ps(lij2_SSE3); - sk2_rinv_SSE0 = _mm_mul_ps(sk2_ai_SSE0, rinv_SSE0); - sk2_rinv_SSE1 = _mm_mul_ps(sk2_ai_SSE1, rinv_SSE1); - sk2_rinv_SSE2 = _mm_mul_ps(sk2_ai_SSE2, rinv_SSE2); - sk2_rinv_SSE3 = _mm_mul_ps(sk2_ai_SSE3, rinv_SSE3); - prod_SSE0 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0); - prod_SSE1 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1); - prod_SSE2 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2); - prod_SSE3 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3); - - logterm_SSE0 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0)); - logterm_SSE1 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1)); - logterm_SSE2 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2)); - logterm_SSE3 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3)); - t1_SSE0 = _mm_sub_ps(lij_SSE0, uij_SSE0); - t1_SSE1 = _mm_sub_ps(lij_SSE1, uij_SSE1); - t1_SSE2 = _mm_sub_ps(lij_SSE2, uij_SSE2); - t1_SSE3 = _mm_sub_ps(lij_SSE3, uij_SSE3); - t2_SSE0 = _mm_mul_ps(diff2_SSE0, - _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0), - prod_SSE0)); - t2_SSE1 = _mm_mul_ps(diff2_SSE1, - _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1), - prod_SSE1)); - t2_SSE2 = _mm_mul_ps(diff2_SSE2, - _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2), - prod_SSE2)); - t2_SSE3 = _mm_mul_ps(diff2_SSE3, - _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3), - prod_SSE3)); - t3_SSE0 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0)); - t3_SSE1 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1)); - t3_SSE2 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2)); - t3_SSE3 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3)); - t1_SSE0 = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0)); - t1_SSE1 = _mm_add_ps(t1_SSE1, 
_mm_add_ps(t2_SSE1, t3_SSE1)); - t1_SSE2 = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2)); - t1_SSE3 = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3)); - t4_SSE0 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE0)); - t4_SSE1 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE1)); - t4_SSE2 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE2)); - t4_SSE3 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE3)); - t4_SSE0 = _mm_and_ps(t4_SSE0, obc_mask3_SSE0); - t4_SSE1 = _mm_and_ps(t4_SSE1, obc_mask3_SSE1); - t4_SSE2 = _mm_and_ps(t4_SSE2, obc_mask3_SSE2); - t4_SSE3 = _mm_and_ps(t4_SSE3, obc_mask3_SSE3); - t1_SSE0 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0)); - t1_SSE1 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1)); - t1_SSE2 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2)); - t1_SSE3 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3)); - - _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j), - gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0, obc_mask1_SSE0), - _mm_and_ps(t1_SSE1, obc_mask1_SSE1), - _mm_and_ps(t1_SSE2, obc_mask1_SSE2), - _mm_and_ps(t1_SSE3, obc_mask1_SSE3)))); - - t1_SSE0 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0), - _mm_mul_ps(prod_SSE0, lij3_SSE0)); - t1_SSE1 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1), - _mm_mul_ps(prod_SSE1, lij3_SSE1)); - t1_SSE2 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2), - _mm_mul_ps(prod_SSE2, lij3_SSE2)); - t1_SSE3 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3), - _mm_mul_ps(prod_SSE3, lij3_SSE3)); - t1_SSE0 = _mm_sub_ps(t1_SSE0, - _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0), - _mm_mul_ps(lij3_SSE0, dr_SSE0)))); - t1_SSE1 = _mm_sub_ps(t1_SSE1, - _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1), - _mm_mul_ps(lij3_SSE1, dr_SSE1)))); - t1_SSE2 = _mm_sub_ps(t1_SSE2, - _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2), - _mm_mul_ps(lij3_SSE2, dr_SSE2)))); - t1_SSE3 = _mm_sub_ps(t1_SSE3, - _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3), - _mm_mul_ps(lij3_SSE3, dr_SSE3)))); - t2_SSE0 = _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0), - _mm_mul_ps(uij3_SSE0, dr_SSE0))); - t2_SSE1 = _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1), - _mm_mul_ps(uij3_SSE1, dr_SSE1))); - t2_SSE2 = _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2), - _mm_mul_ps(uij3_SSE2, dr_SSE2))); - t2_SSE3 = _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3), - _mm_mul_ps(uij3_SSE3, dr_SSE3))); - t2_SSE0 = _mm_sub_ps(t2_SSE0, - _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0), - _mm_mul_ps(prod_SSE0, uij3_SSE0))); - t2_SSE1 = _mm_sub_ps(t2_SSE1, - _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1), - _mm_mul_ps(prod_SSE1, uij3_SSE1))); - t2_SSE2 = _mm_sub_ps(t2_SSE2, - _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2), - _mm_mul_ps(prod_SSE2, uij3_SSE2))); - t2_SSE3 = _mm_sub_ps(t2_SSE3, - _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3), - _mm_mul_ps(prod_SSE3, uij3_SSE3))); - - t3_SSE0 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0), - _mm_mul_ps(rinv_SSE0, rinv_SSE0)); - t3_SSE1 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1), - _mm_mul_ps(rinv_SSE1, rinv_SSE1)); - t3_SSE2 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2), - _mm_mul_ps(rinv_SSE2, rinv_SSE2)); - t3_SSE3 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3), - _mm_mul_ps(rinv_SSE3, rinv_SSE3)); - - t3_SSE0 = _mm_sub_ps(t3_SSE0, - _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE), - _mm_add_ps(one_SSE, - _mm_mul_ps(sk2_rinv_SSE0, 
rinv_SSE0)))); - t3_SSE1 = _mm_sub_ps(t3_SSE1, - _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE), - _mm_add_ps(one_SSE, - _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1)))); - t3_SSE2 = _mm_sub_ps(t3_SSE2, - _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE), - _mm_add_ps(one_SSE, - _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2)))); - t3_SSE3 = _mm_sub_ps(t3_SSE3, - _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE), - _mm_add_ps(one_SSE, - _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3)))); - - t1_SSE0 = _mm_mul_ps(rinv_SSE0, - _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0), - _mm_add_ps(t2_SSE0, t3_SSE0))); - t1_SSE1 = _mm_mul_ps(rinv_SSE1, - _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1), - _mm_add_ps(t2_SSE1, t3_SSE1))); - t1_SSE2 = _mm_mul_ps(rinv_SSE2, - _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2), - _mm_add_ps(t2_SSE2, t3_SSE2))); - t1_SSE3 = _mm_mul_ps(rinv_SSE3, - _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3), - _mm_add_ps(t2_SSE3, t3_SSE3))); - - _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0)); - dadx += 4; - _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1)); - dadx += 4; - _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2)); - dadx += 4; - _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3)); - dadx += 4; - } - - /* Epilogue part, including exclusion mask */ - for (j = nj2; j < nj3; j += UNROLLJ) - { - jmask_SSE0 = _mm_load_ps((real *)emask0); - jmask_SSE1 = _mm_load_ps((real *)emask1); - jmask_SSE2 = _mm_load_ps((real *)emask2); - jmask_SSE3 = _mm_load_ps((real *)emask3); - emask0 += UNROLLJ; - emask1 += UNROLLJ; - emask2 += UNROLLJ; - emask3 += UNROLLJ; - - /* load j atom coordinates */ - jx_SSE = _mm_load_ps(x_align+j); - jy_SSE = _mm_load_ps(y_align+j); - jz_SSE = _mm_load_ps(z_align+j); - - /* Calculate distance */ - dx_SSE0 = _mm_sub_ps(ix_SSE0, jx_SSE); - dy_SSE0 = _mm_sub_ps(iy_SSE0, jy_SSE); - dz_SSE0 = _mm_sub_ps(iz_SSE0, jz_SSE); - dx_SSE1 = _mm_sub_ps(ix_SSE1, jx_SSE); - dy_SSE1 = _mm_sub_ps(iy_SSE1, jy_SSE); - dz_SSE1 = _mm_sub_ps(iz_SSE1, jz_SSE); - dx_SSE2 = _mm_sub_ps(ix_SSE2, jx_SSE); - dy_SSE2 = _mm_sub_ps(iy_SSE2, jy_SSE); - dz_SSE2 = _mm_sub_ps(iz_SSE2, jz_SSE); - dx_SSE3 = _mm_sub_ps(ix_SSE3, jx_SSE); - dy_SSE3 = _mm_sub_ps(iy_SSE3, jy_SSE); - dz_SSE3 = _mm_sub_ps(iz_SSE3, jz_SSE); - - /* rsq = dx*dx+dy*dy+dz*dz */ - rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0); - rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1); - rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2); - rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3); - - /* Combine masks */ - jmask_SSE0 = _mm_and_ps(jmask_SSE0, imask_SSE0); - jmask_SSE1 = _mm_and_ps(jmask_SSE1, imask_SSE1); - jmask_SSE2 = _mm_and_ps(jmask_SSE2, imask_SSE2); - jmask_SSE3 = _mm_and_ps(jmask_SSE3, imask_SSE3); - - /* Calculate 1/r and 1/r2 */ - rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0); - rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1); - rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2); - rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3); - - /* Apply mask */ - rinv_SSE0 = _mm_and_ps(rinv_SSE0, jmask_SSE0); - rinv_SSE1 = _mm_and_ps(rinv_SSE1, jmask_SSE1); - rinv_SSE2 = _mm_and_ps(rinv_SSE2, jmask_SSE2); - rinv_SSE3 = _mm_and_ps(rinv_SSE3, jmask_SSE3); - - dr_SSE0 = _mm_mul_ps(rsq_SSE0, rinv_SSE0); - dr_SSE1 = _mm_mul_ps(rsq_SSE1, rinv_SSE1); - dr_SSE2 = _mm_mul_ps(rsq_SSE2, rinv_SSE2); - dr_SSE3 = _mm_mul_ps(rsq_SSE3, rinv_SSE3); - - sk_aj_SSE = _mm_load_ps(obc_param+j); - raj_SSE = _mm_load_ps(gb_radius+j); - - raj_inv_SSE = gmx_mm_inv_ps(raj_SSE); - - /* Evaluate influence of atom aj -> ai */ - t1_SSE0 = _mm_add_ps(dr_SSE0, sk_aj_SSE); - t1_SSE1 = 
_mm_add_ps(dr_SSE1, sk_aj_SSE); - t1_SSE2 = _mm_add_ps(dr_SSE2, sk_aj_SSE); - t1_SSE3 = _mm_add_ps(dr_SSE3, sk_aj_SSE); - t2_SSE0 = _mm_sub_ps(dr_SSE0, sk_aj_SSE); - t2_SSE1 = _mm_sub_ps(dr_SSE1, sk_aj_SSE); - t2_SSE2 = _mm_sub_ps(dr_SSE2, sk_aj_SSE); - t2_SSE3 = _mm_sub_ps(dr_SSE3, sk_aj_SSE); - t3_SSE0 = _mm_sub_ps(sk_aj_SSE, dr_SSE0); - t3_SSE1 = _mm_sub_ps(sk_aj_SSE, dr_SSE1); - t3_SSE2 = _mm_sub_ps(sk_aj_SSE, dr_SSE2); - t3_SSE3 = _mm_sub_ps(sk_aj_SSE, dr_SSE3); - - obc_mask1_SSE0 = _mm_cmplt_ps(rai_SSE0, t1_SSE0); - obc_mask1_SSE1 = _mm_cmplt_ps(rai_SSE1, t1_SSE1); - obc_mask1_SSE2 = _mm_cmplt_ps(rai_SSE2, t1_SSE2); - obc_mask1_SSE3 = _mm_cmplt_ps(rai_SSE3, t1_SSE3); - obc_mask2_SSE0 = _mm_cmplt_ps(rai_SSE0, t2_SSE0); - obc_mask2_SSE1 = _mm_cmplt_ps(rai_SSE1, t2_SSE1); - obc_mask2_SSE2 = _mm_cmplt_ps(rai_SSE2, t2_SSE2); - obc_mask2_SSE3 = _mm_cmplt_ps(rai_SSE3, t2_SSE3); - obc_mask3_SSE0 = _mm_cmplt_ps(rai_SSE0, t3_SSE0); - obc_mask3_SSE1 = _mm_cmplt_ps(rai_SSE1, t3_SSE1); - obc_mask3_SSE2 = _mm_cmplt_ps(rai_SSE2, t3_SSE2); - obc_mask3_SSE3 = _mm_cmplt_ps(rai_SSE3, t3_SSE3); - obc_mask1_SSE0 = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0); - obc_mask1_SSE1 = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1); - obc_mask1_SSE2 = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2); - obc_mask1_SSE3 = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3); - - uij_SSE0 = gmx_mm_inv_ps(t1_SSE0); - uij_SSE1 = gmx_mm_inv_ps(t1_SSE1); - uij_SSE2 = gmx_mm_inv_ps(t1_SSE2); - uij_SSE3 = gmx_mm_inv_ps(t1_SSE3); - lij_SSE0 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)), - _mm_andnot_ps(obc_mask2_SSE0, rai_inv_SSE0)); - lij_SSE1 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)), - _mm_andnot_ps(obc_mask2_SSE1, rai_inv_SSE1)); - lij_SSE2 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)), - _mm_andnot_ps(obc_mask2_SSE2, rai_inv_SSE2)); - lij_SSE3 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)), - _mm_andnot_ps(obc_mask2_SSE3, rai_inv_SSE3)); - dlij_SSE0 = _mm_and_ps(one_SSE, obc_mask2_SSE0); - dlij_SSE1 = _mm_and_ps(one_SSE, obc_mask2_SSE1); - dlij_SSE2 = _mm_and_ps(one_SSE, obc_mask2_SSE2); - dlij_SSE3 = _mm_and_ps(one_SSE, obc_mask2_SSE3); - - uij2_SSE0 = _mm_mul_ps(uij_SSE0, uij_SSE0); - uij2_SSE1 = _mm_mul_ps(uij_SSE1, uij_SSE1); - uij2_SSE2 = _mm_mul_ps(uij_SSE2, uij_SSE2); - uij2_SSE3 = _mm_mul_ps(uij_SSE3, uij_SSE3); - uij3_SSE0 = _mm_mul_ps(uij2_SSE0, uij_SSE0); - uij3_SSE1 = _mm_mul_ps(uij2_SSE1, uij_SSE1); - uij3_SSE2 = _mm_mul_ps(uij2_SSE2, uij_SSE2); - uij3_SSE3 = _mm_mul_ps(uij2_SSE3, uij_SSE3); - lij2_SSE0 = _mm_mul_ps(lij_SSE0, lij_SSE0); - lij2_SSE1 = _mm_mul_ps(lij_SSE1, lij_SSE1); - lij2_SSE2 = _mm_mul_ps(lij_SSE2, lij_SSE2); - lij2_SSE3 = _mm_mul_ps(lij_SSE3, lij_SSE3); - lij3_SSE0 = _mm_mul_ps(lij2_SSE0, lij_SSE0); - lij3_SSE1 = _mm_mul_ps(lij2_SSE1, lij_SSE1); - lij3_SSE2 = _mm_mul_ps(lij2_SSE2, lij_SSE2); - lij3_SSE3 = _mm_mul_ps(lij2_SSE3, lij_SSE3); - - diff2_SSE0 = _mm_sub_ps(uij2_SSE0, lij2_SSE0); - diff2_SSE1 = _mm_sub_ps(uij2_SSE1, lij2_SSE1); - diff2_SSE2 = _mm_sub_ps(uij2_SSE2, lij2_SSE2); - diff2_SSE3 = _mm_sub_ps(uij2_SSE3, lij2_SSE3); - lij_inv_SSE0 = gmx_mm_invsqrt_ps(lij2_SSE0); - lij_inv_SSE1 = gmx_mm_invsqrt_ps(lij2_SSE1); - lij_inv_SSE2 = gmx_mm_invsqrt_ps(lij2_SSE2); - lij_inv_SSE3 = gmx_mm_invsqrt_ps(lij2_SSE3); - sk2_aj_SSE = _mm_mul_ps(sk_aj_SSE, sk_aj_SSE); - sk2_rinv_SSE0 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE0); - sk2_rinv_SSE1 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE1); - sk2_rinv_SSE2 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE2); - sk2_rinv_SSE3 = 
_mm_mul_ps(sk2_aj_SSE, rinv_SSE3); - prod_SSE0 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0); - prod_SSE1 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1); - prod_SSE2 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2); - prod_SSE3 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3); - - logterm_SSE0 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0)); - logterm_SSE1 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1)); - logterm_SSE2 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2)); - logterm_SSE3 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3)); - - t1_SSE0 = _mm_sub_ps(lij_SSE0, uij_SSE0); - t1_SSE1 = _mm_sub_ps(lij_SSE1, uij_SSE1); - t1_SSE2 = _mm_sub_ps(lij_SSE2, uij_SSE2); - t1_SSE3 = _mm_sub_ps(lij_SSE3, uij_SSE3); - t2_SSE0 = _mm_mul_ps(diff2_SSE0, - _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0), - prod_SSE0)); - t2_SSE1 = _mm_mul_ps(diff2_SSE1, - _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1), - prod_SSE1)); - t2_SSE2 = _mm_mul_ps(diff2_SSE2, - _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2), - prod_SSE2)); - t2_SSE3 = _mm_mul_ps(diff2_SSE3, - _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3), - prod_SSE3)); - - t3_SSE0 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0)); - t3_SSE1 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1)); - t3_SSE2 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2)); - t3_SSE3 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3)); - t1_SSE0 = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0)); - t1_SSE1 = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1)); - t1_SSE2 = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2)); - t1_SSE3 = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3)); - t4_SSE0 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE0, lij_SSE0)); - t4_SSE1 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE1, lij_SSE1)); - t4_SSE2 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE2, lij_SSE2)); - t4_SSE3 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE3, lij_SSE3)); - t4_SSE0 = _mm_and_ps(t4_SSE0, obc_mask3_SSE0); - t4_SSE1 = _mm_and_ps(t4_SSE1, obc_mask3_SSE1); - t4_SSE2 = _mm_and_ps(t4_SSE2, obc_mask3_SSE2); - t4_SSE3 = _mm_and_ps(t4_SSE3, obc_mask3_SSE3); - t1_SSE0 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0)); - t1_SSE1 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1)); - t1_SSE2 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2)); - t1_SSE3 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3)); - - sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0, _mm_and_ps(t1_SSE0, obc_mask1_SSE0)); - sum_ai_SSE1 = _mm_add_ps(sum_ai_SSE1, _mm_and_ps(t1_SSE1, obc_mask1_SSE1)); - sum_ai_SSE2 = _mm_add_ps(sum_ai_SSE2, _mm_and_ps(t1_SSE2, obc_mask1_SSE2)); - sum_ai_SSE3 = _mm_add_ps(sum_ai_SSE3, _mm_and_ps(t1_SSE3, obc_mask1_SSE3)); - - t1_SSE0 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0), - _mm_mul_ps(prod_SSE0, lij3_SSE0)); - t1_SSE1 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1), - _mm_mul_ps(prod_SSE1, lij3_SSE1)); - t1_SSE2 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2), - _mm_mul_ps(prod_SSE2, lij3_SSE2)); - t1_SSE3 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3), - _mm_mul_ps(prod_SSE3, lij3_SSE3)); - t1_SSE0 = _mm_sub_ps(t1_SSE0, - _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0), - _mm_mul_ps(lij3_SSE0, dr_SSE0)))); - t1_SSE1 = _mm_sub_ps(t1_SSE1, - _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1), - _mm_mul_ps(lij3_SSE1, dr_SSE1)))); - t1_SSE2 = _mm_sub_ps(t1_SSE2, - _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2), - _mm_mul_ps(lij3_SSE2, dr_SSE2)))); - t1_SSE3 = _mm_sub_ps(t1_SSE3, - _mm_mul_ps(onefourth_SSE, - 
_mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3), - _mm_mul_ps(lij3_SSE3, dr_SSE3)))); - - t2_SSE0 = _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0), - _mm_mul_ps(uij3_SSE0, dr_SSE0))); - t2_SSE1 = _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1), - _mm_mul_ps(uij3_SSE1, dr_SSE1))); - t2_SSE2 = _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2), - _mm_mul_ps(uij3_SSE2, dr_SSE2))); - t2_SSE3 = _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3), - _mm_mul_ps(uij3_SSE3, dr_SSE3))); - t2_SSE0 = _mm_sub_ps(t2_SSE0, - _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0), - _mm_mul_ps(prod_SSE0, uij3_SSE0))); - t2_SSE1 = _mm_sub_ps(t2_SSE1, - _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1), - _mm_mul_ps(prod_SSE1, uij3_SSE1))); - t2_SSE2 = _mm_sub_ps(t2_SSE2, - _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2), - _mm_mul_ps(prod_SSE2, uij3_SSE2))); - t2_SSE3 = _mm_sub_ps(t2_SSE3, - _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3), - _mm_mul_ps(prod_SSE3, uij3_SSE3))); - t3_SSE0 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0), - _mm_mul_ps(rinv_SSE0, rinv_SSE0)); - t3_SSE1 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1), - _mm_mul_ps(rinv_SSE1, rinv_SSE1)); - t3_SSE2 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2), - _mm_mul_ps(rinv_SSE2, rinv_SSE2)); - t3_SSE3 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3), - _mm_mul_ps(rinv_SSE3, rinv_SSE3)); - t3_SSE0 = _mm_sub_ps(t3_SSE0, - _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE), - _mm_add_ps(one_SSE, - _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0)))); - t3_SSE1 = _mm_sub_ps(t3_SSE1, - _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE), - _mm_add_ps(one_SSE, - _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1)))); - t3_SSE2 = _mm_sub_ps(t3_SSE2, - _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE), - _mm_add_ps(one_SSE, - _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2)))); - t3_SSE3 = _mm_sub_ps(t3_SSE3, - _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE), - _mm_add_ps(one_SSE, - _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3)))); - - t1_SSE0 = _mm_mul_ps(rinv_SSE0, - _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0), - _mm_add_ps(t2_SSE0, t3_SSE0))); - t1_SSE1 = _mm_mul_ps(rinv_SSE1, - _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1), - _mm_add_ps(t2_SSE1, t3_SSE1))); - t1_SSE2 = _mm_mul_ps(rinv_SSE2, - _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2), - _mm_add_ps(t2_SSE2, t3_SSE2))); - t1_SSE3 = _mm_mul_ps(rinv_SSE3, - _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3), - _mm_add_ps(t2_SSE3, t3_SSE3))); - - _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0)); - dadx += 4; - _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1)); - dadx += 4; - _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2)); - dadx += 4; - _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3)); - dadx += 4; - - /* Evaluate influence of atom ai -> aj */ - t1_SSE0 = _mm_add_ps(dr_SSE0, sk_ai_SSE0); - t1_SSE1 = _mm_add_ps(dr_SSE1, sk_ai_SSE1); - t1_SSE2 = _mm_add_ps(dr_SSE2, sk_ai_SSE2); - t1_SSE3 = _mm_add_ps(dr_SSE3, sk_ai_SSE3); - t2_SSE0 = _mm_sub_ps(dr_SSE0, sk_ai_SSE0); - t2_SSE1 = _mm_sub_ps(dr_SSE1, sk_ai_SSE1); - t2_SSE2 = _mm_sub_ps(dr_SSE2, sk_ai_SSE2); - t2_SSE3 = _mm_sub_ps(dr_SSE3, sk_ai_SSE3); - t3_SSE0 = _mm_sub_ps(sk_ai_SSE0, dr_SSE0); - t3_SSE1 = _mm_sub_ps(sk_ai_SSE1, dr_SSE1); - t3_SSE2 = _mm_sub_ps(sk_ai_SSE2, dr_SSE2); - t3_SSE3 = _mm_sub_ps(sk_ai_SSE3, dr_SSE3); - - obc_mask1_SSE0 = _mm_cmplt_ps(raj_SSE, t1_SSE0); - obc_mask1_SSE1 = _mm_cmplt_ps(raj_SSE, t1_SSE1); - obc_mask1_SSE2 = _mm_cmplt_ps(raj_SSE, t1_SSE2); - obc_mask1_SSE3 = _mm_cmplt_ps(raj_SSE, 
t1_SSE3); - obc_mask2_SSE0 = _mm_cmplt_ps(raj_SSE, t2_SSE0); - obc_mask2_SSE1 = _mm_cmplt_ps(raj_SSE, t2_SSE1); - obc_mask2_SSE2 = _mm_cmplt_ps(raj_SSE, t2_SSE2); - obc_mask2_SSE3 = _mm_cmplt_ps(raj_SSE, t2_SSE3); - obc_mask3_SSE0 = _mm_cmplt_ps(raj_SSE, t3_SSE0); - obc_mask3_SSE1 = _mm_cmplt_ps(raj_SSE, t3_SSE1); - obc_mask3_SSE2 = _mm_cmplt_ps(raj_SSE, t3_SSE2); - obc_mask3_SSE3 = _mm_cmplt_ps(raj_SSE, t3_SSE3); - obc_mask1_SSE0 = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0); - obc_mask1_SSE1 = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1); - obc_mask1_SSE2 = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2); - obc_mask1_SSE3 = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3); - - uij_SSE0 = gmx_mm_inv_ps(t1_SSE0); - uij_SSE1 = gmx_mm_inv_ps(t1_SSE1); - uij_SSE2 = gmx_mm_inv_ps(t1_SSE2); - uij_SSE3 = gmx_mm_inv_ps(t1_SSE3); - lij_SSE0 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)), - _mm_andnot_ps(obc_mask2_SSE0, raj_inv_SSE)); - lij_SSE1 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)), - _mm_andnot_ps(obc_mask2_SSE1, raj_inv_SSE)); - lij_SSE2 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)), - _mm_andnot_ps(obc_mask2_SSE2, raj_inv_SSE)); - lij_SSE3 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)), - _mm_andnot_ps(obc_mask2_SSE3, raj_inv_SSE)); - dlij_SSE0 = _mm_and_ps(one_SSE, obc_mask2_SSE0); - dlij_SSE1 = _mm_and_ps(one_SSE, obc_mask2_SSE1); - dlij_SSE2 = _mm_and_ps(one_SSE, obc_mask2_SSE2); - dlij_SSE3 = _mm_and_ps(one_SSE, obc_mask2_SSE3); - - uij2_SSE0 = _mm_mul_ps(uij_SSE0, uij_SSE0); - uij2_SSE1 = _mm_mul_ps(uij_SSE1, uij_SSE1); - uij2_SSE2 = _mm_mul_ps(uij_SSE2, uij_SSE2); - uij2_SSE3 = _mm_mul_ps(uij_SSE3, uij_SSE3); - uij3_SSE0 = _mm_mul_ps(uij2_SSE0, uij_SSE0); - uij3_SSE1 = _mm_mul_ps(uij2_SSE1, uij_SSE1); - uij3_SSE2 = _mm_mul_ps(uij2_SSE2, uij_SSE2); - uij3_SSE3 = _mm_mul_ps(uij2_SSE3, uij_SSE3); - lij2_SSE0 = _mm_mul_ps(lij_SSE0, lij_SSE0); - lij2_SSE1 = _mm_mul_ps(lij_SSE1, lij_SSE1); - lij2_SSE2 = _mm_mul_ps(lij_SSE2, lij_SSE2); - lij2_SSE3 = _mm_mul_ps(lij_SSE3, lij_SSE3); - lij3_SSE0 = _mm_mul_ps(lij2_SSE0, lij_SSE0); - lij3_SSE1 = _mm_mul_ps(lij2_SSE1, lij_SSE1); - lij3_SSE2 = _mm_mul_ps(lij2_SSE2, lij_SSE2); - lij3_SSE3 = _mm_mul_ps(lij2_SSE3, lij_SSE3); - - diff2_SSE0 = _mm_sub_ps(uij2_SSE0, lij2_SSE0); - diff2_SSE1 = _mm_sub_ps(uij2_SSE1, lij2_SSE1); - diff2_SSE2 = _mm_sub_ps(uij2_SSE2, lij2_SSE2); - diff2_SSE3 = _mm_sub_ps(uij2_SSE3, lij2_SSE3); - lij_inv_SSE0 = gmx_mm_invsqrt_ps(lij2_SSE0); - lij_inv_SSE1 = gmx_mm_invsqrt_ps(lij2_SSE1); - lij_inv_SSE2 = gmx_mm_invsqrt_ps(lij2_SSE2); - lij_inv_SSE3 = gmx_mm_invsqrt_ps(lij2_SSE3); - sk2_rinv_SSE0 = _mm_mul_ps(sk2_ai_SSE0, rinv_SSE0); - sk2_rinv_SSE1 = _mm_mul_ps(sk2_ai_SSE1, rinv_SSE1); - sk2_rinv_SSE2 = _mm_mul_ps(sk2_ai_SSE2, rinv_SSE2); - sk2_rinv_SSE3 = _mm_mul_ps(sk2_ai_SSE3, rinv_SSE3); - prod_SSE0 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0); - prod_SSE1 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1); - prod_SSE2 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2); - prod_SSE3 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3); - - logterm_SSE0 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0)); - logterm_SSE1 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1)); - logterm_SSE2 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2)); - logterm_SSE3 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3)); - t1_SSE0 = _mm_sub_ps(lij_SSE0, uij_SSE0); - t1_SSE1 = _mm_sub_ps(lij_SSE1, uij_SSE1); - t1_SSE2 = _mm_sub_ps(lij_SSE2, uij_SSE2); - t1_SSE3 = _mm_sub_ps(lij_SSE3, uij_SSE3); - t2_SSE0 = 
_mm_mul_ps(diff2_SSE0, - _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0), - prod_SSE0)); - t2_SSE1 = _mm_mul_ps(diff2_SSE1, - _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1), - prod_SSE1)); - t2_SSE2 = _mm_mul_ps(diff2_SSE2, - _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2), - prod_SSE2)); - t2_SSE3 = _mm_mul_ps(diff2_SSE3, - _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3), - prod_SSE3)); - t3_SSE0 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0)); - t3_SSE1 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1)); - t3_SSE2 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2)); - t3_SSE3 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3)); - t1_SSE0 = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0)); - t1_SSE1 = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1)); - t1_SSE2 = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2)); - t1_SSE3 = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3)); - t4_SSE0 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE0)); - t4_SSE1 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE1)); - t4_SSE2 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE2)); - t4_SSE3 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE3)); - t4_SSE0 = _mm_and_ps(t4_SSE0, obc_mask3_SSE0); - t4_SSE1 = _mm_and_ps(t4_SSE1, obc_mask3_SSE1); - t4_SSE2 = _mm_and_ps(t4_SSE2, obc_mask3_SSE2); - t4_SSE3 = _mm_and_ps(t4_SSE3, obc_mask3_SSE3); - t1_SSE0 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0)); - t1_SSE1 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1)); - t1_SSE2 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2)); - t1_SSE3 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3)); - - _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j), - gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0, obc_mask1_SSE0), - _mm_and_ps(t1_SSE1, obc_mask1_SSE1), - _mm_and_ps(t1_SSE2, obc_mask1_SSE2), - _mm_and_ps(t1_SSE3, obc_mask1_SSE3)))); - - t1_SSE0 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0), - _mm_mul_ps(prod_SSE0, lij3_SSE0)); - t1_SSE1 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1), - _mm_mul_ps(prod_SSE1, lij3_SSE1)); - t1_SSE2 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2), - _mm_mul_ps(prod_SSE2, lij3_SSE2)); - t1_SSE3 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3), - _mm_mul_ps(prod_SSE3, lij3_SSE3)); - t1_SSE0 = _mm_sub_ps(t1_SSE0, - _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0), - _mm_mul_ps(lij3_SSE0, dr_SSE0)))); - t1_SSE1 = _mm_sub_ps(t1_SSE1, - _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1), - _mm_mul_ps(lij3_SSE1, dr_SSE1)))); - t1_SSE2 = _mm_sub_ps(t1_SSE2, - _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2), - _mm_mul_ps(lij3_SSE2, dr_SSE2)))); - t1_SSE3 = _mm_sub_ps(t1_SSE3, - _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3), - _mm_mul_ps(lij3_SSE3, dr_SSE3)))); - t2_SSE0 = _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0), - _mm_mul_ps(uij3_SSE0, dr_SSE0))); - t2_SSE1 = _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1), - _mm_mul_ps(uij3_SSE1, dr_SSE1))); - t2_SSE2 = _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2), - _mm_mul_ps(uij3_SSE2, dr_SSE2))); - t2_SSE3 = _mm_mul_ps(onefourth_SSE, - _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3), - _mm_mul_ps(uij3_SSE3, dr_SSE3))); - t2_SSE0 = _mm_sub_ps(t2_SSE0, - _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0), - _mm_mul_ps(prod_SSE0, uij3_SSE0))); - t2_SSE1 = _mm_sub_ps(t2_SSE1, - _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1), - _mm_mul_ps(prod_SSE1, uij3_SSE1))); - t2_SSE2 = 
_mm_sub_ps(t2_SSE2, - _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2), - _mm_mul_ps(prod_SSE2, uij3_SSE2))); - t2_SSE3 = _mm_sub_ps(t2_SSE3, - _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3), - _mm_mul_ps(prod_SSE3, uij3_SSE3))); - - t3_SSE0 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0), - _mm_mul_ps(rinv_SSE0, rinv_SSE0)); - t3_SSE1 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1), - _mm_mul_ps(rinv_SSE1, rinv_SSE1)); - t3_SSE2 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2), - _mm_mul_ps(rinv_SSE2, rinv_SSE2)); - t3_SSE3 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3), - _mm_mul_ps(rinv_SSE3, rinv_SSE3)); - - t3_SSE0 = _mm_sub_ps(t3_SSE0, - _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE), - _mm_add_ps(one_SSE, - _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0)))); - t3_SSE1 = _mm_sub_ps(t3_SSE1, - _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE), - _mm_add_ps(one_SSE, - _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1)))); - t3_SSE2 = _mm_sub_ps(t3_SSE2, - _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE), - _mm_add_ps(one_SSE, - _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2)))); - t3_SSE3 = _mm_sub_ps(t3_SSE3, - _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE), - _mm_add_ps(one_SSE, - _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3)))); - - - t1_SSE0 = _mm_mul_ps(rinv_SSE0, - _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0), - _mm_add_ps(t2_SSE0, t3_SSE0))); - t1_SSE1 = _mm_mul_ps(rinv_SSE1, - _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1), - _mm_add_ps(t2_SSE1, t3_SSE1))); - t1_SSE2 = _mm_mul_ps(rinv_SSE2, - _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2), - _mm_add_ps(t2_SSE2, t3_SSE2))); - t1_SSE3 = _mm_mul_ps(rinv_SSE3, - _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3), - _mm_add_ps(t2_SSE3, t3_SSE3))); - - _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0)); - dadx += 4; - _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1)); - dadx += 4; - _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2)); - dadx += 4; - _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3)); - dadx += 4; - } - _MM_TRANSPOSE4_PS(sum_ai_SSE0, sum_ai_SSE1, sum_ai_SSE2, sum_ai_SSE3); - sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0, sum_ai_SSE1); - sum_ai_SSE2 = _mm_add_ps(sum_ai_SSE2, sum_ai_SSE3); - sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0, sum_ai_SSE2); - _mm_store_ps(work+i, _mm_add_ps(sum_ai_SSE0, _mm_load_ps(work+i))); - } - - - for (i = 0; i < natoms/2+1; i++) - { - work[i] += work[natoms+i]; - } - - /* Parallel summations would go here if ever implemented with DD */ - - if (gb_algorithm == egbHCT) - { - /* HCT */ - for (i = 0; i < natoms; i++) - { - if (born->use[i] != 0) - { - rai = top->atomtypes.gb_radius[mdatoms->typeA[i]]-born->gb_doffset; - sum_ai = 1.0/rai - work[i]; - min_rad = rai + born->gb_doffset; - rad = 1.0/sum_ai; - - born->bRad[i] = rad > min_rad ? 
rad : min_rad; - fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]); - } - } - - } - else - { - /* OBC */ - - /* Calculate the radii */ - for (i = 0; i < natoms; i++) - { - - if (born->use[i] != 0) - { - rai = top->atomtypes.gb_radius[mdatoms->typeA[i]]; - rai_inv2 = 1.0/rai; - rai = rai-born->gb_doffset; - rai_inv = 1.0/rai; - sum_ai = rai * work[i]; - sum_ai2 = sum_ai * sum_ai; - sum_ai3 = sum_ai2 * sum_ai; - - tsum = tanh(born->obc_alpha*sum_ai-born->obc_beta*sum_ai2+born->obc_gamma*sum_ai3); - born->bRad[i] = rai_inv - tsum*rai_inv2; - born->bRad[i] = 1.0 / born->bRad[i]; - - fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]); - - tchain = rai * (born->obc_alpha-2*born->obc_beta*sum_ai+3*born->obc_gamma*sum_ai2); - born->drobc[i] = (1.0-tsum*tsum)*tchain*rai_inv2; - } - } - } - - return 0; -} - - - - - - - - -int -genborn_allvsall_calc_chainrule_sse2_single(t_forcerec * fr, - t_mdatoms * mdatoms, - gmx_genborn_t * born, - real * x, - real * f, - int gb_algorithm, - void * paadata) -{ - gmx_allvsallgb2_data_t *aadata; - int natoms; - int ni0, ni1; - int nj0, nj1, nj2, nj3; - int i, j, k, n; - int idx; - int * mask; - int * pmask0; - int * emask0; - int * jindex; - - real ix, iy, iz; - real fix, fiy, fiz; - real jx, jy, jz; - real dx, dy, dz; - real tx, ty, tz; - real rbai, rbaj, fgb, fgb_ai, rbi; - real * rb; - real * dadx; - real * x_align; - real * y_align; - real * z_align; - real * fx_align; - real * fy_align; - real * fz_align; - real tmpsum[4]; - - __m128 jmask_SSE0, jmask_SSE1, jmask_SSE2, jmask_SSE3; - __m128 ix_SSE0, iy_SSE0, iz_SSE0; - __m128 ix_SSE1, iy_SSE1, iz_SSE1; - __m128 ix_SSE2, iy_SSE2, iz_SSE2; - __m128 ix_SSE3, iy_SSE3, iz_SSE3; - __m128 fix_SSE0, fiy_SSE0, fiz_SSE0; - __m128 fix_SSE1, fiy_SSE1, fiz_SSE1; - __m128 fix_SSE2, fiy_SSE2, fiz_SSE2; - __m128 fix_SSE3, fiy_SSE3, fiz_SSE3; - __m128 rbai_SSE0, rbai_SSE1, rbai_SSE2, rbai_SSE3; - __m128 imask_SSE0, imask_SSE1, imask_SSE2, imask_SSE3; - __m128 jx_SSE, jy_SSE, jz_SSE, rbaj_SSE; - __m128 dx_SSE0, dy_SSE0, dz_SSE0; - __m128 dx_SSE1, dy_SSE1, dz_SSE1; - __m128 dx_SSE2, dy_SSE2, dz_SSE2; - __m128 dx_SSE3, dy_SSE3, dz_SSE3; - __m128 fgb_SSE0, fgb_ai_SSE0; - __m128 fgb_SSE1, fgb_ai_SSE1; - __m128 fgb_SSE2, fgb_ai_SSE2; - __m128 fgb_SSE3, fgb_ai_SSE3; - __m128 tx_SSE0, ty_SSE0, tz_SSE0; - __m128 tx_SSE1, ty_SSE1, tz_SSE1; - __m128 tx_SSE2, ty_SSE2, tz_SSE2; - __m128 tx_SSE3, ty_SSE3, tz_SSE3; - __m128 t1, t2; - - natoms = mdatoms->nr; - ni0 = 0; - ni1 = mdatoms->homenr; - dadx = fr->dadx; - - aadata = (gmx_allvsallgb2_data_t *)paadata; - - x_align = aadata->x_align; - y_align = aadata->y_align; - z_align = aadata->z_align; - fx_align = aadata->fx_align; - fy_align = aadata->fy_align; - fz_align = aadata->fz_align; - - jindex = aadata->jindex_gb; - dadx = fr->dadx; - - n = 0; - rb = aadata->work; - - /* Loop to get the proper form for the Born radius term */ - if (gb_algorithm == egbSTILL) - { - for (i = 0; i < natoms; i++) - { - rbi = born->bRad[i]; - rb[i] = (2 * rbi * rbi * fr->dvda[i])/ONE_4PI_EPS0; - } - } - else if (gb_algorithm == egbHCT) - { - for (i = 0; i < natoms; i++) - { - rbi = born->bRad[i]; - rb[i] = rbi * rbi * fr->dvda[i]; - } - } - else if (gb_algorithm == egbOBC) - { - for (idx = 0; idx < natoms; idx++) - { - rbi = born->bRad[idx]; - rb[idx] = rbi * rbi * born->drobc[idx] * fr->dvda[idx]; - } - } - - for (i = 0; i < 2*natoms; i++) - { - fx_align[i] = 0; - fy_align[i] = 0; - fz_align[i] = 0; - } - - - for (i = 0; i < natoms; i++) - { - rb[i+natoms] = rb[i]; - } - - for (i = ni0; i < ni1; i += UNROLLI) - { - /* We 
assume shifts are NOT used for all-vs-all interactions */ - - /* Load i atom data */ - ix_SSE0 = _mm_load1_ps(x_align+i); - iy_SSE0 = _mm_load1_ps(y_align+i); - iz_SSE0 = _mm_load1_ps(z_align+i); - ix_SSE1 = _mm_load1_ps(x_align+i+1); - iy_SSE1 = _mm_load1_ps(y_align+i+1); - iz_SSE1 = _mm_load1_ps(z_align+i+1); - ix_SSE2 = _mm_load1_ps(x_align+i+2); - iy_SSE2 = _mm_load1_ps(y_align+i+2); - iz_SSE2 = _mm_load1_ps(z_align+i+2); - ix_SSE3 = _mm_load1_ps(x_align+i+3); - iy_SSE3 = _mm_load1_ps(y_align+i+3); - iz_SSE3 = _mm_load1_ps(z_align+i+3); - - fix_SSE0 = _mm_setzero_ps(); - fiy_SSE0 = _mm_setzero_ps(); - fiz_SSE0 = _mm_setzero_ps(); - fix_SSE1 = _mm_setzero_ps(); - fiy_SSE1 = _mm_setzero_ps(); - fiz_SSE1 = _mm_setzero_ps(); - fix_SSE2 = _mm_setzero_ps(); - fiy_SSE2 = _mm_setzero_ps(); - fiz_SSE2 = _mm_setzero_ps(); - fix_SSE3 = _mm_setzero_ps(); - fiy_SSE3 = _mm_setzero_ps(); - fiz_SSE3 = _mm_setzero_ps(); - - rbai_SSE0 = _mm_load1_ps(rb+i); - rbai_SSE1 = _mm_load1_ps(rb+i+1); - rbai_SSE2 = _mm_load1_ps(rb+i+2); - rbai_SSE3 = _mm_load1_ps(rb+i+3); - - /* Load limits for loop over neighbors */ - nj0 = jindex[4*i]; - nj3 = jindex[4*i+3]; - - /* No masks necessary, since the stored chain rule derivatives will be zero in those cases! */ - for (j = nj0; j < nj3; j += UNROLLJ) - { - /* load j atom coordinates */ - jx_SSE = _mm_load_ps(x_align+j); - jy_SSE = _mm_load_ps(y_align+j); - jz_SSE = _mm_load_ps(z_align+j); - - /* Calculate distance */ - dx_SSE0 = _mm_sub_ps(ix_SSE0, jx_SSE); - dy_SSE0 = _mm_sub_ps(iy_SSE0, jy_SSE); - dz_SSE0 = _mm_sub_ps(iz_SSE0, jz_SSE); - dx_SSE1 = _mm_sub_ps(ix_SSE1, jx_SSE); - dy_SSE1 = _mm_sub_ps(iy_SSE1, jy_SSE); - dz_SSE1 = _mm_sub_ps(iz_SSE1, jz_SSE); - dx_SSE2 = _mm_sub_ps(ix_SSE2, jx_SSE); - dy_SSE2 = _mm_sub_ps(iy_SSE2, jy_SSE); - dz_SSE2 = _mm_sub_ps(iz_SSE2, jz_SSE); - dx_SSE3 = _mm_sub_ps(ix_SSE3, jx_SSE); - dy_SSE3 = _mm_sub_ps(iy_SSE3, jy_SSE); - dz_SSE3 = _mm_sub_ps(iz_SSE3, jz_SSE); - - rbaj_SSE = _mm_load_ps(rb+j); - - fgb_SSE0 = _mm_mul_ps(rbai_SSE0, _mm_load_ps(dadx)); - dadx += 4; - fgb_SSE1 = _mm_mul_ps(rbai_SSE1, _mm_load_ps(dadx)); - dadx += 4; - fgb_SSE2 = _mm_mul_ps(rbai_SSE2, _mm_load_ps(dadx)); - dadx += 4; - fgb_SSE3 = _mm_mul_ps(rbai_SSE3, _mm_load_ps(dadx)); - dadx += 4; - - fgb_ai_SSE0 = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx)); - dadx += 4; - fgb_ai_SSE1 = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx)); - dadx += 4; - fgb_ai_SSE2 = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx)); - dadx += 4; - fgb_ai_SSE3 = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx)); - dadx += 4; - - /* Total force between ai and aj is the sum of ai->aj and aj->ai */ - fgb_SSE0 = _mm_add_ps(fgb_SSE0, fgb_ai_SSE0); - fgb_SSE1 = _mm_add_ps(fgb_SSE1, fgb_ai_SSE1); - fgb_SSE2 = _mm_add_ps(fgb_SSE2, fgb_ai_SSE2); - fgb_SSE3 = _mm_add_ps(fgb_SSE3, fgb_ai_SSE3); - - /* Calculate temporary vectorial force */ - tx_SSE0 = _mm_mul_ps(fgb_SSE0, dx_SSE0); - ty_SSE0 = _mm_mul_ps(fgb_SSE0, dy_SSE0); - tz_SSE0 = _mm_mul_ps(fgb_SSE0, dz_SSE0); - tx_SSE1 = _mm_mul_ps(fgb_SSE1, dx_SSE1); - ty_SSE1 = _mm_mul_ps(fgb_SSE1, dy_SSE1); - tz_SSE1 = _mm_mul_ps(fgb_SSE1, dz_SSE1); - tx_SSE2 = _mm_mul_ps(fgb_SSE2, dx_SSE2); - ty_SSE2 = _mm_mul_ps(fgb_SSE2, dy_SSE2); - tz_SSE2 = _mm_mul_ps(fgb_SSE2, dz_SSE2); - tx_SSE3 = _mm_mul_ps(fgb_SSE3, dx_SSE3); - ty_SSE3 = _mm_mul_ps(fgb_SSE3, dy_SSE3); - tz_SSE3 = _mm_mul_ps(fgb_SSE3, dz_SSE3); - - /* Increment i atom force */ - fix_SSE0 = _mm_add_ps(fix_SSE0, tx_SSE0); - fiy_SSE0 = _mm_add_ps(fiy_SSE0, ty_SSE0); - fiz_SSE0 = _mm_add_ps(fiz_SSE0, tz_SSE0); - fix_SSE1 = 
_mm_add_ps(fix_SSE1, tx_SSE1); - fiy_SSE1 = _mm_add_ps(fiy_SSE1, ty_SSE1); - fiz_SSE1 = _mm_add_ps(fiz_SSE1, tz_SSE1); - fix_SSE2 = _mm_add_ps(fix_SSE2, tx_SSE2); - fiy_SSE2 = _mm_add_ps(fiy_SSE2, ty_SSE2); - fiz_SSE2 = _mm_add_ps(fiz_SSE2, tz_SSE2); - fix_SSE3 = _mm_add_ps(fix_SSE3, tx_SSE3); - fiy_SSE3 = _mm_add_ps(fiy_SSE3, ty_SSE3); - fiz_SSE3 = _mm_add_ps(fiz_SSE3, tz_SSE3); - - /* Decrement j atom force */ - _mm_store_ps(fx_align+j, - _mm_sub_ps( _mm_load_ps(fx_align+j), gmx_mm_sum4_ps(tx_SSE0, tx_SSE1, tx_SSE2, tx_SSE3) )); - _mm_store_ps(fy_align+j, - _mm_sub_ps( _mm_load_ps(fy_align+j), gmx_mm_sum4_ps(ty_SSE0, ty_SSE1, ty_SSE2, ty_SSE3) )); - _mm_store_ps(fz_align+j, - _mm_sub_ps( _mm_load_ps(fz_align+j), gmx_mm_sum4_ps(tz_SSE0, tz_SSE1, tz_SSE2, tz_SSE3) )); - } - /* Add i forces to mem and shifted force list */ - _MM_TRANSPOSE4_PS(fix_SSE0, fix_SSE1, fix_SSE2, fix_SSE3); - fix_SSE0 = _mm_add_ps(fix_SSE0, fix_SSE1); - fix_SSE2 = _mm_add_ps(fix_SSE2, fix_SSE3); - fix_SSE0 = _mm_add_ps(fix_SSE0, fix_SSE2); - _mm_store_ps(fx_align+i, _mm_add_ps(fix_SSE0, _mm_load_ps(fx_align+i))); - - _MM_TRANSPOSE4_PS(fiy_SSE0, fiy_SSE1, fiy_SSE2, fiy_SSE3); - fiy_SSE0 = _mm_add_ps(fiy_SSE0, fiy_SSE1); - fiy_SSE2 = _mm_add_ps(fiy_SSE2, fiy_SSE3); - fiy_SSE0 = _mm_add_ps(fiy_SSE0, fiy_SSE2); - _mm_store_ps(fy_align+i, _mm_add_ps(fiy_SSE0, _mm_load_ps(fy_align+i))); - - _MM_TRANSPOSE4_PS(fiz_SSE0, fiz_SSE1, fiz_SSE2, fiz_SSE3); - fiz_SSE0 = _mm_add_ps(fiz_SSE0, fiz_SSE1); - fiz_SSE2 = _mm_add_ps(fiz_SSE2, fiz_SSE3); - fiz_SSE0 = _mm_add_ps(fiz_SSE0, fiz_SSE2); - _mm_store_ps(fz_align+i, _mm_add_ps(fiz_SSE0, _mm_load_ps(fz_align+i))); - } - - for (i = 0; i < natoms; i++) - { - f[3*i] += fx_align[i] + fx_align[natoms+i]; - f[3*i+1] += fy_align[i] + fy_align[natoms+i]; - f[3*i+2] += fz_align[i] + fz_align[natoms+i]; - } - - return 0; -} - -#else -/* dummy variable when not using SSE */ -int genborn_allvsall_sse2_single_dummy; - - -#endif diff --git a/src/gromacs/mdlib/genborn_allvsall_sse2_single.h b/src/gromacs/mdlib/genborn_allvsall_sse2_single.h deleted file mode 100644 index d1e908a985..0000000000 --- a/src/gromacs/mdlib/genborn_allvsall_sse2_single.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * This file is part of the GROMACS molecular simulation package. - * - * Copyright (c) 1991-2000, University of Groningen, The Netherlands. - * Copyright (c) 2001-2009, The GROMACS Development Team. - * Copyright (c) 2010,2014, by the GROMACS development team, led by - * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, - * and including many others, as listed in the AUTHORS file in the - * top-level source directory and at http://www.gromacs.org. - * - * GROMACS is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public License - * as published by the Free Software Foundation; either version 2.1 - * of the License, or (at your option) any later version. - * - * GROMACS is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with GROMACS; if not, see - * http://www.gnu.org/licenses, or write to the Free Software Foundation, - * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
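For reference while reviewing the removal above: the deleted single-precision all-vs-all radius kernel evaluates, per j atom, the HCT/OBC pairwise descreening term. A minimal scalar sketch of that term is given below; the helper name and its standalone form are illustrative only (not GROMACS API), with sk_aj standing for the scaled descreening parameter obc_param[j] and rai for the offset radius of atom i.

    #include <math.h>

    static double hct_obc_descreen_pair(double r, double rai, double sk_aj)
    {
        double uij, lij, diff2, prod, term;

        if (rai >= r + sk_aj)
        {
            return 0.0;                      /* atom j does not descreen atom i */
        }
        uij   = 1.0/(r + sk_aj);
        lij   = (rai < r - sk_aj) ? 1.0/(r - sk_aj) : 1.0/rai;
        diff2 = uij*uij - lij*lij;
        prod  = 0.25*sk_aj*sk_aj/r;
        term  = lij - uij
                + diff2*(0.25*r - prod)
                + 0.5*log(uij/lij)/r;
        if (rai < sk_aj - r)                 /* atom i lies inside the descreening sphere */
        {
            term += 2.0*(1.0/rai - lij);
        }
        return 0.5*term;                     /* what the kernel accumulates into sum_ai / work[] */
    }

The corresponding r-derivative is what the kernel streams into the dadx array for the later chain-rule pass; that part is omitted from the sketch.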
- * - * If you want to redistribute modifications to GROMACS, please - * consider that scientific software is very special. Version - * control is crucial - bugs must be traceable. We will be happy to - * consider code for inclusion in the official distribution, but - * derived work must not be called official GROMACS. Details are found - * in the README & COPYING files - if they are missing, get the - * official version at http://www.gromacs.org. - * - * To help us fund GROMACS development, we humbly ask that you cite - * the research papers on the package. Check out http://www.gromacs.org. - */ -#ifndef _GENBORN_ALLVSALL_SSE2_SINGLE_H -#define _GENBORN_ALLVSALL_SSE2_SINGLE_H - -#include "gromacs/legacyheaders/typedefs.h" -#include "gromacs/legacyheaders/types/simple.h" - -int -genborn_allvsall_calc_still_radii_sse2_single(t_forcerec * fr, - t_mdatoms * mdatoms, - gmx_genborn_t * born, - gmx_localtop_t * top, - real * x, - t_commrec * cr, - void * work); - -int -genborn_allvsall_calc_hct_obc_radii_sse2_single(t_forcerec * fr, - t_mdatoms * mdatoms, - gmx_genborn_t * born, - int gb_algorithm, - gmx_localtop_t * top, - real * x, - t_commrec * cr, - void * work); - -int -genborn_allvsall_calc_chainrule_sse2_single(t_forcerec * fr, - t_mdatoms * mdatoms, - gmx_genborn_t * born, - real * x, - real * f, - int gb_algorithm, - void * work); - -#endif diff --git a/src/gromacs/mdlib/genborn_sse2_double.c b/src/gromacs/mdlib/genborn_sse2_double.c deleted file mode 100644 index 62cab4b2f3..0000000000 --- a/src/gromacs/mdlib/genborn_sse2_double.c +++ /dev/null @@ -1,918 +0,0 @@ -/* - * This file is part of the GROMACS molecular simulation package. - * - * Copyright (c) 1991-2000, University of Groningen, The Netherlands. - * Copyright (c) 2001-2008, The GROMACS development team. - * Copyright (c) 2013,2014, by the GROMACS development team, led by - * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, - * and including many others, as listed in the AUTHORS file in the - * top-level source directory and at http://www.gromacs.org. - * - * GROMACS is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public License - * as published by the Free Software Foundation; either version 2.1 - * of the License, or (at your option) any later version. - * - * GROMACS is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with GROMACS; if not, see - * http://www.gnu.org/licenses, or write to the Free Software Foundation, - * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - * - * If you want to redistribute modifications to GROMACS, please - * consider that scientific software is very special. Version - * control is crucial - bugs must be traceable. We will be happy to - * consider code for inclusion in the official distribution, but - * derived work must not be called official GROMACS. Details are found - * in the README & COPYING files - if they are missing, get the - * official version at http://www.gromacs.org. - * - * To help us fund GROMACS development, we humbly ask that you cite - * the research papers on the package. Check out http://www.gromacs.org. 
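Ahead of the removed double-precision Still kernel below, a scalar sketch of the pair contribution it accumulates into gpi and work[]. The helper is illustrative only; still_p4, still_p5inv and still_pip5 stand for the STILL_P4, STILL_P5INV and STILL_PIP5 constants used in the deleted code, and vaj for the solvation volume of atom j.

    #include <math.h>

    static double still_pair_contribution(double rsq, double rai, double raj,
                                           double vaj, double still_p4,
                                           double still_p5inv, double still_pip5)
    {
        double rvdw  = rai + raj;
        double ratio = rsq/(rvdw*rvdw);
        double ccf;

        if (ratio > still_p5inv)
        {
            ccf = 1.0;                       /* beyond the close-contact range */
        }
        else
        {
            double theta = ratio*still_pip5;
            double term  = 0.5*(1.0 - cos(theta));
            ccf          = term*term;        /* close-contact damping function */
        }
        return still_p4*vaj*ccf/(rsq*rsq);   /* prod * icf4 in the SIMD code */
    }

The effective radius then follows as bRad[i] = 0.5*ONE_4PI_EPS0/(gpol[i] + sum over j of these contributions), as in the deleted loop over fr->natoms_force.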
- */ -#include "gmxpre.h" - -#include -#include - -#include "gromacs/domdec/domdec.h" -#include "gromacs/fileio/pdbio.h" -#include "gromacs/legacyheaders/genborn.h" -#include "gromacs/legacyheaders/names.h" -#include "gromacs/legacyheaders/network.h" -#include "gromacs/legacyheaders/typedefs.h" -#include "gromacs/math/units.h" -#include "gromacs/math/vec.h" -#include "gromacs/utility/fatalerror.h" -#include "gromacs/utility/gmxmpi.h" -#include "gromacs/utility/smalloc.h" - -/* Only compile this file if SSE2 intrinsics are available */ -#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER) -#include "genborn_sse2_double.h" - -#include -#include - -int -calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr, - int natoms, gmx_localtop_t *top, - double *x, t_nblist *nl, - gmx_genborn_t *born) -{ - int i, k, n, ii, is3, ii3, nj0, nj1, offset; - int jnrA, jnrB, j3A, j3B; - int *mdtype; - double shX, shY, shZ; - int *jjnr; - double *shiftvec; - - double gpi_ai, gpi2; - double factor; - double *gb_radius; - double *vsolv; - double *work; - double *dadx; - - __m128d ix, iy, iz; - __m128d jx, jy, jz; - __m128d dx, dy, dz; - __m128d tx, ty, tz; - __m128d rsq, rinv, rinv2, rinv4, rinv6; - __m128d ratio, gpi, rai, raj, vai, vaj, rvdw; - __m128d ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp; - __m128d mask, icf4, icf6, mask_cmp; - - const __m128d half = _mm_set1_pd(0.5); - const __m128d three = _mm_set1_pd(3.0); - const __m128d one = _mm_set1_pd(1.0); - const __m128d two = _mm_set1_pd(2.0); - const __m128d zero = _mm_set1_pd(0.0); - const __m128d four = _mm_set1_pd(4.0); - - const __m128d still_p5inv = _mm_set1_pd(STILL_P5INV); - const __m128d still_pip5 = _mm_set1_pd(STILL_PIP5); - const __m128d still_p4 = _mm_set1_pd(STILL_P4); - - factor = 0.5 * ONE_4PI_EPS0; - - gb_radius = born->gb_radius; - vsolv = born->vsolv; - work = born->gpol_still_work; - jjnr = nl->jjnr; - shiftvec = fr->shift_vec[0]; - dadx = fr->dadx; - - jnrA = jnrB = 0; - jx = _mm_setzero_pd(); - jy = _mm_setzero_pd(); - jz = _mm_setzero_pd(); - - n = 0; - - for (i = 0; i < natoms; i++) - { - work[i] = 0; - } - - for (i = 0; i < nl->nri; i++) - { - ii = nl->iinr[i]; - ii3 = ii*3; - is3 = 3*nl->shift[i]; - shX = shiftvec[is3]; - shY = shiftvec[is3+1]; - shZ = shiftvec[is3+2]; - nj0 = nl->jindex[i]; - nj1 = nl->jindex[i+1]; - - ix = _mm_set1_pd(shX+x[ii3+0]); - iy = _mm_set1_pd(shY+x[ii3+1]); - iz = _mm_set1_pd(shZ+x[ii3+2]); - - - /* Polarization energy for atom ai */ - gpi = _mm_setzero_pd(); - - rai = _mm_load1_pd(gb_radius+ii); - prod_ai = _mm_set1_pd(STILL_P4*vsolv[ii]); - - for (k = nj0; k < nj1-1; k += 2) - { - jnrA = jjnr[k]; - jnrB = jjnr[k+1]; - - j3A = 3*jnrA; - j3B = 3*jnrB; - - GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz); - - GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj); - GMX_MM_LOAD_2VALUES_PD(vsolv+jnrA, vsolv+jnrB, vaj); - - dx = _mm_sub_pd(ix, jx); - dy = _mm_sub_pd(iy, jy); - dz = _mm_sub_pd(iz, jz); - - rsq = gmx_mm_calc_rsq_pd(dx, dy, dz); - rinv = gmx_mm_invsqrt_pd(rsq); - rinv2 = _mm_mul_pd(rinv, rinv); - rinv4 = _mm_mul_pd(rinv2, rinv2); - rinv6 = _mm_mul_pd(rinv4, rinv2); - - rvdw = _mm_add_pd(rai, raj); - ratio = _mm_mul_pd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw))); - - mask_cmp = _mm_cmple_pd(ratio, still_p5inv); - - /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! 
*/ - if (0 == _mm_movemask_pd(mask_cmp) ) - { - /* if ratio>still_p5inv for ALL elements */ - ccf = one; - dccf = _mm_setzero_pd(); - } - else - { - ratio = _mm_min_pd(ratio, still_p5inv); - theta = _mm_mul_pd(ratio, still_pip5); - gmx_mm_sincos_pd(theta, &sinq, &cosq); - term = _mm_mul_pd(half, _mm_sub_pd(one, cosq)); - ccf = _mm_mul_pd(term, term); - dccf = _mm_mul_pd(_mm_mul_pd(two, term), - _mm_mul_pd(sinq, theta)); - } - - prod = _mm_mul_pd(still_p4, vaj); - icf4 = _mm_mul_pd(ccf, rinv4); - icf6 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four, ccf), dccf), rinv6); - - GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_mul_pd(prod_ai, icf4)); - - gpi = _mm_add_pd(gpi, _mm_mul_pd(prod, icf4) ); - - _mm_store_pd(dadx, _mm_mul_pd(prod, icf6)); - dadx += 2; - _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6)); - dadx += 2; - } - - if (k < nj1) - { - jnrA = jjnr[k]; - - j3A = 3*jnrA; - - GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz); - - GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj); - GMX_MM_LOAD_1VALUE_PD(vsolv+jnrA, vaj); - - dx = _mm_sub_sd(ix, jx); - dy = _mm_sub_sd(iy, jy); - dz = _mm_sub_sd(iz, jz); - - rsq = gmx_mm_calc_rsq_pd(dx, dy, dz); - rinv = gmx_mm_invsqrt_pd(rsq); - rinv2 = _mm_mul_sd(rinv, rinv); - rinv4 = _mm_mul_sd(rinv2, rinv2); - rinv6 = _mm_mul_sd(rinv4, rinv2); - - rvdw = _mm_add_sd(rai, raj); - ratio = _mm_mul_sd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw))); - - mask_cmp = _mm_cmple_sd(ratio, still_p5inv); - - /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */ - if (0 == _mm_movemask_pd(mask_cmp) ) - { - /* if ratio>still_p5inv for ALL elements */ - ccf = one; - dccf = _mm_setzero_pd(); - } - else - { - ratio = _mm_min_sd(ratio, still_p5inv); - theta = _mm_mul_sd(ratio, still_pip5); - gmx_mm_sincos_pd(theta, &sinq, &cosq); - term = _mm_mul_sd(half, _mm_sub_sd(one, cosq)); - ccf = _mm_mul_sd(term, term); - dccf = _mm_mul_sd(_mm_mul_sd(two, term), - _mm_mul_sd(sinq, theta)); - } - - prod = _mm_mul_sd(still_p4, vaj); - icf4 = _mm_mul_sd(ccf, rinv4); - icf6 = _mm_mul_sd( _mm_sub_sd( _mm_mul_sd(four, ccf), dccf), rinv6); - - GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_mul_sd(prod_ai, icf4)); - - gpi = _mm_add_sd(gpi, _mm_mul_sd(prod, icf4) ); - - _mm_store_pd(dadx, _mm_mul_pd(prod, icf6)); - dadx += 2; - _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6)); - dadx += 2; - } - gmx_mm_update_1pot_pd(gpi, work+ii); - } - - /* Sum up the polarization energy from other nodes */ - if (DOMAINDECOMP(cr)) - { - dd_atom_sum_real(cr->dd, work); - } - - /* Compute the radii */ - for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */ - { - if (born->use[i] != 0) - { - gpi_ai = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/ - gpi2 = gpi_ai * gpi_ai; - born->bRad[i] = factor*gmx_invsqrt(gpi2); - fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]); - } - } - - /* Extra (local) communication required for DD */ - if (DOMAINDECOMP(cr)) - { - dd_atom_spread_real(cr->dd, born->bRad); - dd_atom_spread_real(cr->dd, fr->invsqrta); - } - - return 0; -} - - -int -calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top, - double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm) -{ - int i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset; - int jnrA, jnrB; - int j3A, j3B; - double shX, shY, shZ; - double rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr; - double sum_ai2, sum_ai3, tsum, tchain, doffset; - double *obc_param; - double *gb_radius; - double *work; - int * jjnr; - double *dadx; - double 
*shiftvec; - double min_rad, rad; - - __m128d ix, iy, iz, jx, jy, jz; - __m128d dx, dy, dz, t1, t2, t3, t4; - __m128d rsq, rinv, r; - __m128d rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij; - __m128d uij, lij2, uij2, lij3, uij3, diff2; - __m128d lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum; - __m128d sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv; - __m128d dadx1, dadx2; - __m128d logterm; - __m128d mask; - __m128d obc_mask1, obc_mask2, obc_mask3; - - __m128d oneeighth = _mm_set1_pd(0.125); - __m128d onefourth = _mm_set1_pd(0.25); - - const __m128d half = _mm_set1_pd(0.5); - const __m128d three = _mm_set1_pd(3.0); - const __m128d one = _mm_set1_pd(1.0); - const __m128d two = _mm_set1_pd(2.0); - const __m128d zero = _mm_set1_pd(0.0); - const __m128d neg = _mm_set1_pd(-1.0); - - /* Set the dielectric offset */ - doffset = born->gb_doffset; - gb_radius = born->gb_radius; - obc_param = born->param; - work = born->gpol_hct_work; - jjnr = nl->jjnr; - dadx = fr->dadx; - shiftvec = fr->shift_vec[0]; - - jx = _mm_setzero_pd(); - jy = _mm_setzero_pd(); - jz = _mm_setzero_pd(); - - jnrA = jnrB = 0; - - for (i = 0; i < born->nr; i++) - { - work[i] = 0; - } - - for (i = 0; i < nl->nri; i++) - { - ii = nl->iinr[i]; - ii3 = ii*3; - is3 = 3*nl->shift[i]; - shX = shiftvec[is3]; - shY = shiftvec[is3+1]; - shZ = shiftvec[is3+2]; - nj0 = nl->jindex[i]; - nj1 = nl->jindex[i+1]; - - ix = _mm_set1_pd(shX+x[ii3+0]); - iy = _mm_set1_pd(shY+x[ii3+1]); - iz = _mm_set1_pd(shZ+x[ii3+2]); - - rai = _mm_load1_pd(gb_radius+ii); - rai_inv = gmx_mm_inv_pd(rai); - - sum_ai = _mm_setzero_pd(); - - sk_ai = _mm_load1_pd(born->param+ii); - sk2_ai = _mm_mul_pd(sk_ai, sk_ai); - - for (k = nj0; k < nj1-1; k += 2) - { - jnrA = jjnr[k]; - jnrB = jjnr[k+1]; - - j3A = 3*jnrA; - j3B = 3*jnrB; - - GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz); - GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj); - GMX_MM_LOAD_2VALUES_PD(obc_param+jnrA, obc_param+jnrB, sk_aj); - - dx = _mm_sub_pd(ix, jx); - dy = _mm_sub_pd(iy, jy); - dz = _mm_sub_pd(iz, jz); - - rsq = gmx_mm_calc_rsq_pd(dx, dy, dz); - - rinv = gmx_mm_invsqrt_pd(rsq); - r = _mm_mul_pd(rsq, rinv); - - /* Compute raj_inv aj1-4 */ - raj_inv = gmx_mm_inv_pd(raj); - - /* Evaluate influence of atom aj -> ai */ - t1 = _mm_add_pd(r, sk_aj); - t2 = _mm_sub_pd(r, sk_aj); - t3 = _mm_sub_pd(sk_aj, r); - obc_mask1 = _mm_cmplt_pd(rai, t1); - obc_mask2 = _mm_cmplt_pd(rai, t2); - obc_mask3 = _mm_cmplt_pd(rai, t3); - - uij = gmx_mm_inv_pd(t1); - lij = _mm_or_pd( _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)), - _mm_andnot_pd(obc_mask2, rai_inv)); - dlij = _mm_and_pd(one, obc_mask2); - uij2 = _mm_mul_pd(uij, uij); - uij3 = _mm_mul_pd(uij2, uij); - lij2 = _mm_mul_pd(lij, lij); - lij3 = _mm_mul_pd(lij2, lij); - - diff2 = _mm_sub_pd(uij2, lij2); - lij_inv = gmx_mm_invsqrt_pd(lij2); - sk2_aj = _mm_mul_pd(sk_aj, sk_aj); - sk2_rinv = _mm_mul_pd(sk2_aj, rinv); - prod = _mm_mul_pd(onefourth, sk2_rinv); - - logterm = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv)); - - t1 = _mm_sub_pd(lij, uij); - t2 = _mm_mul_pd(diff2, - _mm_sub_pd(_mm_mul_pd(onefourth, r), - prod)); - t3 = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm)); - t1 = _mm_add_pd(t1, _mm_add_pd(t2, t3)); - t4 = _mm_mul_pd(two, _mm_sub_pd(rai_inv, lij)); - t4 = _mm_and_pd(t4, obc_mask3); - t1 = _mm_mul_pd(half, _mm_add_pd(t1, t4)); - - sum_ai = _mm_add_pd(sum_ai, _mm_and_pd(t1, obc_mask1) ); - - t1 = _mm_add_pd(_mm_mul_pd(half, lij2), - _mm_mul_pd(prod, lij3)); - t1 = _mm_sub_pd(t1, - _mm_mul_pd(onefourth, - 
_mm_add_pd(_mm_mul_pd(lij, rinv), - _mm_mul_pd(lij3, r)))); - t2 = _mm_mul_pd(onefourth, - _mm_add_pd(_mm_mul_pd(uij, rinv), - _mm_mul_pd(uij3, r))); - t2 = _mm_sub_pd(t2, - _mm_add_pd(_mm_mul_pd(half, uij2), - _mm_mul_pd(prod, uij3))); - t3 = _mm_mul_pd(_mm_mul_pd(onefourth, logterm), - _mm_mul_pd(rinv, rinv)); - t3 = _mm_sub_pd(t3, - _mm_mul_pd(_mm_mul_pd(diff2, oneeighth), - _mm_add_pd(one, - _mm_mul_pd(sk2_rinv, rinv)))); - t1 = _mm_mul_pd(rinv, - _mm_add_pd(_mm_mul_pd(dlij, t1), - _mm_add_pd(t2, t3))); - - dadx1 = _mm_and_pd(t1, obc_mask1); - - /* Evaluate influence of atom ai -> aj */ - t1 = _mm_add_pd(r, sk_ai); - t2 = _mm_sub_pd(r, sk_ai); - t3 = _mm_sub_pd(sk_ai, r); - obc_mask1 = _mm_cmplt_pd(raj, t1); - obc_mask2 = _mm_cmplt_pd(raj, t2); - obc_mask3 = _mm_cmplt_pd(raj, t3); - - uij = gmx_mm_inv_pd(t1); - lij = _mm_or_pd( _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)), - _mm_andnot_pd(obc_mask2, raj_inv)); - dlij = _mm_and_pd(one, obc_mask2); - uij2 = _mm_mul_pd(uij, uij); - uij3 = _mm_mul_pd(uij2, uij); - lij2 = _mm_mul_pd(lij, lij); - lij3 = _mm_mul_pd(lij2, lij); - - diff2 = _mm_sub_pd(uij2, lij2); - lij_inv = gmx_mm_invsqrt_pd(lij2); - sk2_rinv = _mm_mul_pd(sk2_ai, rinv); - prod = _mm_mul_pd(onefourth, sk2_rinv); - - logterm = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv)); - - t1 = _mm_sub_pd(lij, uij); - t2 = _mm_mul_pd(diff2, - _mm_sub_pd(_mm_mul_pd(onefourth, r), - prod)); - t3 = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm)); - t1 = _mm_add_pd(t1, _mm_add_pd(t2, t3)); - t4 = _mm_mul_pd(two, _mm_sub_pd(raj_inv, lij)); - t4 = _mm_and_pd(t4, obc_mask3); - t1 = _mm_mul_pd(half, _mm_add_pd(t1, t4)); - - GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_and_pd(t1, obc_mask1)); - - t1 = _mm_add_pd(_mm_mul_pd(half, lij2), - _mm_mul_pd(prod, lij3)); - t1 = _mm_sub_pd(t1, - _mm_mul_pd(onefourth, - _mm_add_pd(_mm_mul_pd(lij, rinv), - _mm_mul_pd(lij3, r)))); - t2 = _mm_mul_pd(onefourth, - _mm_add_pd(_mm_mul_pd(uij, rinv), - _mm_mul_pd(uij3, r))); - t2 = _mm_sub_pd(t2, - _mm_add_pd(_mm_mul_pd(half, uij2), - _mm_mul_pd(prod, uij3))); - t3 = _mm_mul_pd(_mm_mul_pd(onefourth, logterm), - _mm_mul_pd(rinv, rinv)); - t3 = _mm_sub_pd(t3, - _mm_mul_pd(_mm_mul_pd(diff2, oneeighth), - _mm_add_pd(one, - _mm_mul_pd(sk2_rinv, rinv)))); - t1 = _mm_mul_pd(rinv, - _mm_add_pd(_mm_mul_pd(dlij, t1), - _mm_add_pd(t2, t3))); - - dadx2 = _mm_and_pd(t1, obc_mask1); - - _mm_store_pd(dadx, dadx1); - dadx += 2; - _mm_store_pd(dadx, dadx2); - dadx += 2; - } /* end normal inner loop */ - - if (k < nj1) - { - jnrA = jjnr[k]; - - j3A = 3*jnrA; - - GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz); - GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj); - GMX_MM_LOAD_1VALUE_PD(obc_param+jnrA, sk_aj); - - dx = _mm_sub_sd(ix, jx); - dy = _mm_sub_sd(iy, jy); - dz = _mm_sub_sd(iz, jz); - - rsq = gmx_mm_calc_rsq_pd(dx, dy, dz); - - rinv = gmx_mm_invsqrt_pd(rsq); - r = _mm_mul_sd(rsq, rinv); - - /* Compute raj_inv aj1-4 */ - raj_inv = gmx_mm_inv_pd(raj); - - /* Evaluate influence of atom aj -> ai */ - t1 = _mm_add_sd(r, sk_aj); - t2 = _mm_sub_sd(r, sk_aj); - t3 = _mm_sub_sd(sk_aj, r); - obc_mask1 = _mm_cmplt_sd(rai, t1); - obc_mask2 = _mm_cmplt_sd(rai, t2); - obc_mask3 = _mm_cmplt_sd(rai, t3); - - uij = gmx_mm_inv_pd(t1); - lij = _mm_or_pd(_mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)), - _mm_andnot_pd(obc_mask2, rai_inv)); - dlij = _mm_and_pd(one, obc_mask2); - uij2 = _mm_mul_sd(uij, uij); - uij3 = _mm_mul_sd(uij2, uij); - lij2 = _mm_mul_sd(lij, lij); - lij3 = _mm_mul_sd(lij2, lij); - - diff2 = _mm_sub_sd(uij2, lij2); - lij_inv = 
gmx_mm_invsqrt_pd(lij2); - sk2_aj = _mm_mul_sd(sk_aj, sk_aj); - sk2_rinv = _mm_mul_sd(sk2_aj, rinv); - prod = _mm_mul_sd(onefourth, sk2_rinv); - - logterm = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv)); - - t1 = _mm_sub_sd(lij, uij); - t2 = _mm_mul_sd(diff2, - _mm_sub_sd(_mm_mul_pd(onefourth, r), - prod)); - t3 = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm)); - t1 = _mm_add_sd(t1, _mm_add_sd(t2, t3)); - t4 = _mm_mul_sd(two, _mm_sub_sd(rai_inv, lij)); - t4 = _mm_and_pd(t4, obc_mask3); - t1 = _mm_mul_sd(half, _mm_add_sd(t1, t4)); - - sum_ai = _mm_add_sd(sum_ai, _mm_and_pd(t1, obc_mask1) ); - - t1 = _mm_add_sd(_mm_mul_sd(half, lij2), - _mm_mul_sd(prod, lij3)); - t1 = _mm_sub_sd(t1, - _mm_mul_sd(onefourth, - _mm_add_sd(_mm_mul_sd(lij, rinv), - _mm_mul_sd(lij3, r)))); - t2 = _mm_mul_sd(onefourth, - _mm_add_sd(_mm_mul_sd(uij, rinv), - _mm_mul_sd(uij3, r))); - t2 = _mm_sub_sd(t2, - _mm_add_sd(_mm_mul_sd(half, uij2), - _mm_mul_sd(prod, uij3))); - t3 = _mm_mul_sd(_mm_mul_sd(onefourth, logterm), - _mm_mul_sd(rinv, rinv)); - t3 = _mm_sub_sd(t3, - _mm_mul_sd(_mm_mul_sd(diff2, oneeighth), - _mm_add_sd(one, - _mm_mul_sd(sk2_rinv, rinv)))); - t1 = _mm_mul_sd(rinv, - _mm_add_sd(_mm_mul_sd(dlij, t1), - _mm_add_pd(t2, t3))); - - dadx1 = _mm_and_pd(t1, obc_mask1); - - /* Evaluate influence of atom ai -> aj */ - t1 = _mm_add_sd(r, sk_ai); - t2 = _mm_sub_sd(r, sk_ai); - t3 = _mm_sub_sd(sk_ai, r); - obc_mask1 = _mm_cmplt_sd(raj, t1); - obc_mask2 = _mm_cmplt_sd(raj, t2); - obc_mask3 = _mm_cmplt_sd(raj, t3); - - uij = gmx_mm_inv_pd(t1); - lij = _mm_or_pd( _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)), - _mm_andnot_pd(obc_mask2, raj_inv)); - dlij = _mm_and_pd(one, obc_mask2); - uij2 = _mm_mul_sd(uij, uij); - uij3 = _mm_mul_sd(uij2, uij); - lij2 = _mm_mul_sd(lij, lij); - lij3 = _mm_mul_sd(lij2, lij); - - diff2 = _mm_sub_sd(uij2, lij2); - lij_inv = gmx_mm_invsqrt_pd(lij2); - sk2_rinv = _mm_mul_sd(sk2_ai, rinv); - prod = _mm_mul_sd(onefourth, sk2_rinv); - - logterm = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv)); - - t1 = _mm_sub_sd(lij, uij); - t2 = _mm_mul_sd(diff2, - _mm_sub_sd(_mm_mul_sd(onefourth, r), - prod)); - t3 = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm)); - t1 = _mm_add_sd(t1, _mm_add_sd(t2, t3)); - t4 = _mm_mul_sd(two, _mm_sub_sd(raj_inv, lij)); - t4 = _mm_and_pd(t4, obc_mask3); - t1 = _mm_mul_sd(half, _mm_add_sd(t1, t4)); - - GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_and_pd(t1, obc_mask1)); - - t1 = _mm_add_sd(_mm_mul_sd(half, lij2), - _mm_mul_sd(prod, lij3)); - t1 = _mm_sub_sd(t1, - _mm_mul_sd(onefourth, - _mm_add_sd(_mm_mul_sd(lij, rinv), - _mm_mul_sd(lij3, r)))); - t2 = _mm_mul_sd(onefourth, - _mm_add_sd(_mm_mul_sd(uij, rinv), - _mm_mul_sd(uij3, r))); - t2 = _mm_sub_sd(t2, - _mm_add_sd(_mm_mul_sd(half, uij2), - _mm_mul_sd(prod, uij3))); - t3 = _mm_mul_sd(_mm_mul_sd(onefourth, logterm), - _mm_mul_sd(rinv, rinv)); - t3 = _mm_sub_sd(t3, - _mm_mul_sd(_mm_mul_sd(diff2, oneeighth), - _mm_add_sd(one, - _mm_mul_sd(sk2_rinv, rinv)))); - t1 = _mm_mul_sd(rinv, - _mm_add_sd(_mm_mul_sd(dlij, t1), - _mm_add_sd(t2, t3))); - - dadx2 = _mm_and_pd(t1, obc_mask1); - - _mm_store_pd(dadx, dadx1); - dadx += 2; - _mm_store_pd(dadx, dadx2); - dadx += 2; - } - gmx_mm_update_1pot_pd(sum_ai, work+ii); - - } - - /* Parallel summations */ - if (DOMAINDECOMP(cr)) - { - dd_atom_sum_real(cr->dd, work); - } - - if (gb_algorithm == egbHCT) - { - /* HCT */ - for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */ - { - if (born->use[i] != 0) - { - rr = top->atomtypes.gb_radius[md->typeA[i]]-doffset; - sum = 1.0/rr - work[i]; - min_rad = rr + 
doffset; - rad = 1.0/sum; - - born->bRad[i] = rad > min_rad ? rad : min_rad; - fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]); - } - } - - /* Extra communication required for DD */ - if (DOMAINDECOMP(cr)) - { - dd_atom_spread_real(cr->dd, born->bRad); - dd_atom_spread_real(cr->dd, fr->invsqrta); - } - } - else - { - /* OBC */ - for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */ - { - if (born->use[i] != 0) - { - rr = top->atomtypes.gb_radius[md->typeA[i]]; - rr_inv2 = 1.0/rr; - rr = rr-doffset; - rr_inv = 1.0/rr; - sum = rr * work[i]; - sum2 = sum * sum; - sum3 = sum2 * sum; - - tsum = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3); - born->bRad[i] = rr_inv - tsum*rr_inv2; - born->bRad[i] = 1.0 / born->bRad[i]; - - fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]); - - tchain = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2); - born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2; - } - } - /* Extra (local) communication required for DD */ - if (DOMAINDECOMP(cr)) - { - dd_atom_spread_real(cr->dd, born->bRad); - dd_atom_spread_real(cr->dd, fr->invsqrta); - dd_atom_spread_real(cr->dd, born->drobc); - } - } - - - - return 0; -} - - -int -calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda, - double *x, double *f, double *fshift, double *shiftvec, - int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md) -{ - int i, k, n, ii, jnr, ii3, is3, nj0, nj1, n0, n1; - int jnrA, jnrB; - int j3A, j3B; - int * jjnr; - - double rbi, shX, shY, shZ; - double *rb; - - __m128d ix, iy, iz; - __m128d jx, jy, jz; - __m128d fix, fiy, fiz; - __m128d dx, dy, dz; - __m128d tx, ty, tz; - - __m128d rbai, rbaj, f_gb, f_gb_ai; - __m128d xmm1, xmm2, xmm3; - - const __m128d two = _mm_set1_pd(2.0); - - rb = born->work; - - jjnr = nl->jjnr; - - /* Loop to get the proper form for the Born radius term, sse style */ - n0 = 0; - n1 = natoms; - - if (gb_algorithm == egbSTILL) - { - for (i = n0; i < n1; i++) - { - rbi = born->bRad[i]; - rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0; - } - } - else if (gb_algorithm == egbHCT) - { - for (i = n0; i < n1; i++) - { - rbi = born->bRad[i]; - rb[i] = rbi * rbi * dvda[i]; - } - } - else if (gb_algorithm == egbOBC) - { - for (i = n0; i < n1; i++) - { - rbi = born->bRad[i]; - rb[i] = rbi * rbi * born->drobc[i] * dvda[i]; - } - } - - jz = _mm_setzero_pd(); - - n = j3A = j3B = 0; - - for (i = 0; i < nl->nri; i++) - { - ii = nl->iinr[i]; - ii3 = ii*3; - is3 = 3*nl->shift[i]; - shX = shiftvec[is3]; - shY = shiftvec[is3+1]; - shZ = shiftvec[is3+2]; - nj0 = nl->jindex[i]; - nj1 = nl->jindex[i+1]; - - ix = _mm_set1_pd(shX+x[ii3+0]); - iy = _mm_set1_pd(shY+x[ii3+1]); - iz = _mm_set1_pd(shZ+x[ii3+2]); - - rbai = _mm_load1_pd(rb+ii); - fix = _mm_setzero_pd(); - fiy = _mm_setzero_pd(); - fiz = _mm_setzero_pd(); - - - for (k = nj0; k < nj1-1; k += 2) - { - jnrA = jjnr[k]; - jnrB = jjnr[k+1]; - - j3A = 3*jnrA; - j3B = 3*jnrB; - - GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz); - - dx = _mm_sub_pd(ix, jx); - dy = _mm_sub_pd(iy, jy); - dz = _mm_sub_pd(iz, jz); - - GMX_MM_LOAD_2VALUES_PD(rb+jnrA, rb+jnrB, rbaj); - - /* load chain rule terms for j1-4 */ - f_gb = _mm_load_pd(dadx); - dadx += 2; - f_gb_ai = _mm_load_pd(dadx); - dadx += 2; - - /* calculate scalar force */ - f_gb = _mm_mul_pd(f_gb, rbai); - f_gb_ai = _mm_mul_pd(f_gb_ai, rbaj); - f_gb = _mm_add_pd(f_gb, f_gb_ai); - - tx = _mm_mul_pd(f_gb, dx); - ty = _mm_mul_pd(f_gb, dy); - tz = _mm_mul_pd(f_gb, dz); - - fix = _mm_add_pd(fix, tx); - fiy = _mm_add_pd(fiy, ty); - fiz = 
_mm_add_pd(fiz, tz); - - GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A, f+j3B, tx, ty, tz); - } - - /*deal with odd elements */ - if (k < nj1) - { - jnrA = jjnr[k]; - j3A = 3*jnrA; - - GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz); - - dx = _mm_sub_sd(ix, jx); - dy = _mm_sub_sd(iy, jy); - dz = _mm_sub_sd(iz, jz); - - GMX_MM_LOAD_1VALUE_PD(rb+jnrA, rbaj); - - /* load chain rule terms */ - f_gb = _mm_load_pd(dadx); - dadx += 2; - f_gb_ai = _mm_load_pd(dadx); - dadx += 2; - - /* calculate scalar force */ - f_gb = _mm_mul_sd(f_gb, rbai); - f_gb_ai = _mm_mul_sd(f_gb_ai, rbaj); - f_gb = _mm_add_sd(f_gb, f_gb_ai); - - tx = _mm_mul_sd(f_gb, dx); - ty = _mm_mul_sd(f_gb, dy); - tz = _mm_mul_sd(f_gb, dz); - - fix = _mm_add_sd(fix, tx); - fiy = _mm_add_sd(fiy, ty); - fiz = _mm_add_sd(fiz, tz); - - GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A, tx, ty, tz); - } - - /* fix/fiy/fiz now contain four partial force terms, that all should be - * added to the i particle forces and shift forces. - */ - gmx_mm_update_iforce_1atom_pd(&fix, &fiy, &fiz, f+ii3, fshift+is3); - } - - return 0; -} - -#else -/* keep compiler happy */ -int genborn_sse2_dummy; - -#endif /* SSE2 intrinsics available */ diff --git a/src/gromacs/mdlib/genborn_sse2_double.h b/src/gromacs/mdlib/genborn_sse2_double.h deleted file mode 100644 index 0bf4ea9d69..0000000000 --- a/src/gromacs/mdlib/genborn_sse2_double.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * This file is part of the GROMACS molecular simulation package. - * - * Copyright (c) 1991-2000, University of Groningen, The Netherlands. - * Copyright (c) 2001-2008, The GROMACS development team. - * Copyright (c) 2013,2014, by the GROMACS development team, led by - * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, - * and including many others, as listed in the AUTHORS file in the - * top-level source directory and at http://www.gromacs.org. - * - * GROMACS is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public License - * as published by the Free Software Foundation; either version 2.1 - * of the License, or (at your option) any later version. - * - * GROMACS is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with GROMACS; if not, see - * http://www.gnu.org/licenses, or write to the Free Software Foundation, - * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - * - * If you want to redistribute modifications to GROMACS, please - * consider that scientific software is very special. Version - * control is crucial - bugs must be traceable. We will be happy to - * consider code for inclusion in the official distribution, but - * derived work must not be called official GROMACS. Details are found - * in the README & COPYING files - if they are missing, get the - * official version at http://www.gromacs.org. - * - * To help us fund GROMACS development, we humbly ask that you cite - * the research papers on the package. Check out http://www.gromacs.org. 
- */ -#ifndef _genborn_sse2_double_h -#define _genborn_sse2_double_h - -#include "gromacs/legacyheaders/typedefs.h" - -int -calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr, int natoms, gmx_localtop_t *top, - double *x, t_nblist *nl, gmx_genborn_t *born); - -int -calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda, double *xd, double *f, - double *fshift, double *shift_vec, int gb_algorithm, - gmx_genborn_t *born, t_mdatoms *md); - -int -calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec *fr, int natoms, gmx_localtop_t *top, - double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm); - -#endif /* _genborn_sse2_double_h */ diff --git a/src/gromacs/mdlib/genborn_sse2_single.c b/src/gromacs/mdlib/genborn_sse2_single.c deleted file mode 100644 index accbb6ef40..0000000000 --- a/src/gromacs/mdlib/genborn_sse2_single.c +++ /dev/null @@ -1,1510 +0,0 @@ -/* - * This file is part of the GROMACS molecular simulation package. - * - * Copyright (c) 1991-2000, University of Groningen, The Netherlands. - * Copyright (c) 2001-2008, The GROMACS development team. - * Copyright (c) 2013,2014, by the GROMACS development team, led by - * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, - * and including many others, as listed in the AUTHORS file in the - * top-level source directory and at http://www.gromacs.org. - * - * GROMACS is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public License - * as published by the Free Software Foundation; either version 2.1 - * of the License, or (at your option) any later version. - * - * GROMACS is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with GROMACS; if not, see - * http://www.gnu.org/licenses, or write to the Free Software Foundation, - * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - * - * If you want to redistribute modifications to GROMACS, please - * consider that scientific software is very special. Version - * control is crucial - bugs must be traceable. We will be happy to - * consider code for inclusion in the official distribution, but - * derived work must not be called official GROMACS. Details are found - * in the README & COPYING files - if they are missing, get the - * official version at http://www.gromacs.org. - * - * To help us fund GROMACS development, we humbly ask that you cite - * the research papers on the package. Check out http://www.gromacs.org. 
- */ -#include "gmxpre.h" - -#include -#include - -#include "gromacs/domdec/domdec.h" -#include "gromacs/fileio/pdbio.h" -#include "gromacs/legacyheaders/genborn.h" -#include "gromacs/legacyheaders/names.h" -#include "gromacs/legacyheaders/network.h" -#include "gromacs/legacyheaders/typedefs.h" -#include "gromacs/math/units.h" -#include "gromacs/math/vec.h" -#include "gromacs/utility/fatalerror.h" -#include "gromacs/utility/gmxmpi.h" -#include "gromacs/utility/smalloc.h" - - -/* Only compile this file if SSE intrinsics are available */ -#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER) - -#include "genborn_sse2_single.h" - -#include -#include - - -int -calc_gb_rad_still_sse2_single(t_commrec *cr, t_forcerec *fr, - int natoms, gmx_localtop_t *top, - float *x, t_nblist *nl, - gmx_genborn_t *born) -{ - int i, k, n, ii, is3, ii3, nj0, nj1, offset; - int jnrA, jnrB, jnrC, jnrD, j3A, j3B, j3C, j3D; - int jnrE, jnrF, jnrG, jnrH, j3E, j3F, j3G, j3H; - int shift; - int *mdtype; - real shX, shY, shZ; - int *jjnr; - real *shiftvec; - - float gpi_ai, gpi2; - float factor; - float *gb_radius; - float *vsolv; - float *work; - float *dadx; - - __m128 ix, iy, iz; - __m128 jx, jy, jz; - __m128 dx, dy, dz; - __m128 tx, ty, tz; - __m128 jxB, jyB, jzB; - __m128 dxB, dyB, dzB; - __m128 txB, tyB, tzB; - __m128 rsq, rinv, rinv2, rinv4, rinv6; - __m128 rsqB, rinvB, rinv2B, rinv4B, rinv6B; - __m128 ratio, gpi, rai, raj, vai, vaj, rvdw; - __m128 ratioB, rajB, vajB, rvdwB; - __m128 ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp; - __m128 ccfB, dccfB, thetaB, cosqB, termB, sinqB, resB, prodB; - __m128 mask, icf4, icf6, mask_cmp; - __m128 icf4B, icf6B, mask_cmpB; - - __m128 mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) ); - __m128 mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) ); - __m128 mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) ); - - const __m128 half = _mm_set1_ps(0.5f); - const __m128 three = _mm_set1_ps(3.0f); - const __m128 one = _mm_set1_ps(1.0f); - const __m128 two = _mm_set1_ps(2.0f); - const __m128 zero = _mm_set1_ps(0.0f); - const __m128 four = _mm_set1_ps(4.0f); - - const __m128 still_p5inv = _mm_set1_ps(STILL_P5INV); - const __m128 still_pip5 = _mm_set1_ps(STILL_PIP5); - const __m128 still_p4 = _mm_set1_ps(STILL_P4); - - factor = 0.5 * ONE_4PI_EPS0; - - gb_radius = born->gb_radius; - vsolv = born->vsolv; - work = born->gpol_still_work; - jjnr = nl->jjnr; - shiftvec = fr->shift_vec[0]; - dadx = fr->dadx; - - jnrA = jnrB = jnrC = jnrD = 0; - jx = _mm_setzero_ps(); - jy = _mm_setzero_ps(); - jz = _mm_setzero_ps(); - - n = 0; - - for (i = 0; i < natoms; i++) - { - work[i] = 0; - } - - for (i = 0; i < nl->nri; i++) - { - ii = nl->iinr[i]; - ii3 = ii*3; - is3 = 3*nl->shift[i]; - shX = shiftvec[is3]; - shY = shiftvec[is3+1]; - shZ = shiftvec[is3+2]; - nj0 = nl->jindex[i]; - nj1 = nl->jindex[i+1]; - - ix = _mm_set1_ps(shX+x[ii3+0]); - iy = _mm_set1_ps(shY+x[ii3+1]); - iz = _mm_set1_ps(shZ+x[ii3+2]); - - offset = (nj1-nj0)%4; - - /* Polarization energy for atom ai */ - gpi = _mm_setzero_ps(); - - rai = _mm_load1_ps(gb_radius+ii); - prod_ai = _mm_set1_ps(STILL_P4*vsolv[ii]); - - for (k = nj0; k < nj1-4-offset; k += 8) - { - jnrA = jjnr[k]; - jnrB = jjnr[k+1]; - jnrC = jjnr[k+2]; - jnrD = jjnr[k+3]; - jnrE = jjnr[k+4]; - jnrF = jjnr[k+5]; - jnrG = jjnr[k+6]; - jnrH = jjnr[k+7]; - - j3A = 3*jnrA; - j3B = 3*jnrB; - j3C = 3*jnrC; - j3D = 3*jnrD; - j3E = 3*jnrE; - j3F = 3*jnrF; - j3G = 3*jnrG; - j3H = 3*jnrH; - - 
GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz); - GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E, x+j3F, x+j3G, x+j3H, jxB, jyB, jzB); - - GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj); - GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE, gb_radius+jnrF, gb_radius+jnrG, gb_radius+jnrH, rajB); - GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vsolv+jnrD, vaj); - GMX_MM_LOAD_4VALUES_PS(vsolv+jnrE, vsolv+jnrF, vsolv+jnrG, vsolv+jnrH, vajB); - - dx = _mm_sub_ps(ix, jx); - dy = _mm_sub_ps(iy, jy); - dz = _mm_sub_ps(iz, jz); - dxB = _mm_sub_ps(ix, jxB); - dyB = _mm_sub_ps(iy, jyB); - dzB = _mm_sub_ps(iz, jzB); - - rsq = gmx_mm_calc_rsq_ps(dx, dy, dz); - rsqB = gmx_mm_calc_rsq_ps(dxB, dyB, dzB); - rinv = gmx_mm_invsqrt_ps(rsq); - rinvB = gmx_mm_invsqrt_ps(rsqB); - rinv2 = _mm_mul_ps(rinv, rinv); - rinv2B = _mm_mul_ps(rinvB, rinvB); - rinv4 = _mm_mul_ps(rinv2, rinv2); - rinv4B = _mm_mul_ps(rinv2B, rinv2B); - rinv6 = _mm_mul_ps(rinv4, rinv2); - rinv6B = _mm_mul_ps(rinv4B, rinv2B); - - rvdw = _mm_add_ps(rai, raj); - rvdwB = _mm_add_ps(rai, rajB); - ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw))); - ratioB = _mm_mul_ps(rsqB, gmx_mm_inv_ps( _mm_mul_ps(rvdwB, rvdwB))); - - mask_cmp = _mm_cmple_ps(ratio, still_p5inv); - mask_cmpB = _mm_cmple_ps(ratioB, still_p5inv); - - /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */ - if (0 == _mm_movemask_ps(mask_cmp) ) - { - /* if ratio>still_p5inv for ALL elements */ - ccf = one; - dccf = _mm_setzero_ps(); - } - else - { - ratio = _mm_min_ps(ratio, still_p5inv); - theta = _mm_mul_ps(ratio, still_pip5); - gmx_mm_sincos_ps(theta, &sinq, &cosq); - term = _mm_mul_ps(half, _mm_sub_ps(one, cosq)); - ccf = _mm_mul_ps(term, term); - dccf = _mm_mul_ps(_mm_mul_ps(two, term), - _mm_mul_ps(sinq, theta)); - } - if (0 == _mm_movemask_ps(mask_cmpB) ) - { - /* if ratio>still_p5inv for ALL elements */ - ccfB = one; - dccfB = _mm_setzero_ps(); - } - else - { - ratioB = _mm_min_ps(ratioB, still_p5inv); - thetaB = _mm_mul_ps(ratioB, still_pip5); - gmx_mm_sincos_ps(thetaB, &sinqB, &cosqB); - termB = _mm_mul_ps(half, _mm_sub_ps(one, cosqB)); - ccfB = _mm_mul_ps(termB, termB); - dccfB = _mm_mul_ps(_mm_mul_ps(two, termB), - _mm_mul_ps(sinqB, thetaB)); - } - - prod = _mm_mul_ps(still_p4, vaj); - prodB = _mm_mul_ps(still_p4, vajB); - icf4 = _mm_mul_ps(ccf, rinv4); - icf4B = _mm_mul_ps(ccfB, rinv4B); - icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6); - icf6B = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccfB), dccfB), rinv6B); - - GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_mul_ps(prod_ai, icf4)); - GMX_MM_INCREMENT_4VALUES_PS(work+jnrE, work+jnrF, work+jnrG, work+jnrH, _mm_mul_ps(prod_ai, icf4B)); - - gpi = _mm_add_ps(gpi, _mm_add_ps( _mm_mul_ps(prod, icf4), _mm_mul_ps(prodB, icf4B) ) ); - - _mm_store_ps(dadx, _mm_mul_ps(prod, icf6)); - dadx += 4; - _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6)); - dadx += 4; - _mm_store_ps(dadx, _mm_mul_ps(prodB, icf6B)); - dadx += 4; - _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6B)); - dadx += 4; - } - - for (; k < nj1-offset; k += 4) - { - jnrA = jjnr[k]; - jnrB = jjnr[k+1]; - jnrC = jjnr[k+2]; - jnrD = jjnr[k+3]; - - j3A = 3*jnrA; - j3B = 3*jnrB; - j3C = 3*jnrC; - j3D = 3*jnrD; - - GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz); - - GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj); - GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA, vsolv+jnrB, 
vsolv+jnrC, vsolv+jnrD, vaj); - - dx = _mm_sub_ps(ix, jx); - dy = _mm_sub_ps(iy, jy); - dz = _mm_sub_ps(iz, jz); - - rsq = gmx_mm_calc_rsq_ps(dx, dy, dz); - rinv = gmx_mm_invsqrt_ps(rsq); - rinv2 = _mm_mul_ps(rinv, rinv); - rinv4 = _mm_mul_ps(rinv2, rinv2); - rinv6 = _mm_mul_ps(rinv4, rinv2); - - rvdw = _mm_add_ps(rai, raj); - ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw))); - - mask_cmp = _mm_cmple_ps(ratio, still_p5inv); - - /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */ - if (0 == _mm_movemask_ps(mask_cmp)) - { - /* if ratio>still_p5inv for ALL elements */ - ccf = one; - dccf = _mm_setzero_ps(); - } - else - { - ratio = _mm_min_ps(ratio, still_p5inv); - theta = _mm_mul_ps(ratio, still_pip5); - gmx_mm_sincos_ps(theta, &sinq, &cosq); - term = _mm_mul_ps(half, _mm_sub_ps(one, cosq)); - ccf = _mm_mul_ps(term, term); - dccf = _mm_mul_ps(_mm_mul_ps(two, term), - _mm_mul_ps(sinq, theta)); - } - - prod = _mm_mul_ps(still_p4, vaj); - icf4 = _mm_mul_ps(ccf, rinv4); - icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6); - - GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_mul_ps(prod_ai, icf4)); - - gpi = _mm_add_ps(gpi, _mm_mul_ps(prod, icf4)); - - _mm_store_ps(dadx, _mm_mul_ps(prod, icf6)); - dadx += 4; - _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6)); - dadx += 4; - } - - if (offset != 0) - { - if (offset == 1) - { - jnrA = jjnr[k]; - j3A = 3*jnrA; - GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz); - GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA, raj); - GMX_MM_LOAD_1VALUE_PS(vsolv+jnrA, vaj); - mask = mask1; - } - else if (offset == 2) - { - jnrA = jjnr[k]; - jnrB = jjnr[k+1]; - j3A = 3*jnrA; - j3B = 3*jnrB; - GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz); - GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, raj); - GMX_MM_LOAD_2VALUES_PS(vsolv+jnrA, vsolv+jnrB, vaj); - mask = mask2; - } - else - { - /* offset must be 3 */ - jnrA = jjnr[k]; - jnrB = jjnr[k+1]; - jnrC = jjnr[k+2]; - j3A = 3*jnrA; - j3B = 3*jnrB; - j3C = 3*jnrC; - GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz); - GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, raj); - GMX_MM_LOAD_3VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vaj); - mask = mask3; - } - - dx = _mm_sub_ps(ix, jx); - dy = _mm_sub_ps(iy, jy); - dz = _mm_sub_ps(iz, jz); - - rsq = gmx_mm_calc_rsq_ps(dx, dy, dz); - rinv = gmx_mm_invsqrt_ps(rsq); - rinv2 = _mm_mul_ps(rinv, rinv); - rinv4 = _mm_mul_ps(rinv2, rinv2); - rinv6 = _mm_mul_ps(rinv4, rinv2); - - rvdw = _mm_add_ps(rai, raj); - ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw))); - - mask_cmp = _mm_cmple_ps(ratio, still_p5inv); - - if (0 == _mm_movemask_ps(mask_cmp)) - { - /* if ratio>still_p5inv for ALL elements */ - ccf = one; - dccf = _mm_setzero_ps(); - } - else - { - ratio = _mm_min_ps(ratio, still_p5inv); - theta = _mm_mul_ps(ratio, still_pip5); - gmx_mm_sincos_ps(theta, &sinq, &cosq); - term = _mm_mul_ps(half, _mm_sub_ps(one, cosq)); - ccf = _mm_mul_ps(term, term); - dccf = _mm_mul_ps(_mm_mul_ps(two, term), - _mm_mul_ps(sinq, theta)); - } - - prod = _mm_mul_ps(still_p4, vaj); - icf4 = _mm_mul_ps(ccf, rinv4); - icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6); - - gpi = _mm_add_ps(gpi, _mm_mul_ps(prod, icf4)); - - _mm_store_ps(dadx, _mm_mul_ps(prod, icf6)); - dadx += 4; - _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6)); - dadx += 4; - - tmp = _mm_mul_ps(prod_ai, icf4); - - if (offset == 1) - { - GMX_MM_INCREMENT_1VALUE_PS(work+jnrA, tmp); - } 
- else if (offset == 2) - { - GMX_MM_INCREMENT_2VALUES_PS(work+jnrA, work+jnrB, tmp); - } - else - { - /* offset must be 3 */ - GMX_MM_INCREMENT_3VALUES_PS(work+jnrA, work+jnrB, work+jnrC, tmp); - } - } - GMX_MM_UPDATE_1POT_PS(gpi, work+ii); - } - - /* Sum up the polarization energy from other nodes */ - if (DOMAINDECOMP(cr)) - { - dd_atom_sum_real(cr->dd, work); - } - - /* Compute the radii */ - for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */ - { - if (born->use[i] != 0) - { - gpi_ai = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/ - gpi2 = gpi_ai * gpi_ai; - born->bRad[i] = factor*gmx_invsqrt(gpi2); - fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]); - } - } - - /* Extra (local) communication required for DD */ - if (DOMAINDECOMP(cr)) - { - dd_atom_spread_real(cr->dd, born->bRad); - dd_atom_spread_real(cr->dd, fr->invsqrta); - } - - return 0; -} - - -int -calc_gb_rad_hct_obc_sse2_single(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top, - float *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm) -{ - int i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset; - int jnrA, jnrB, jnrC, jnrD; - int j3A, j3B, j3C, j3D; - int jnrE, jnrF, jnrG, jnrH; - int j3E, j3F, j3G, j3H; - float shX, shY, shZ; - float rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr; - float sum_ai2, sum_ai3, tsum, tchain, doffset; - float *obc_param; - float *gb_radius; - float *work; - int * jjnr; - float *dadx; - float *shiftvec; - float min_rad, rad; - - __m128 ix, iy, iz, jx, jy, jz; - __m128 dx, dy, dz, t1, t2, t3, t4; - __m128 rsq, rinv, r; - __m128 rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij; - __m128 uij, lij2, uij2, lij3, uij3, diff2; - __m128 lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum; - __m128 sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv; - __m128 dadx1, dadx2; - __m128 logterm; - __m128 mask; - __m128 obc_mask1, obc_mask2, obc_mask3; - __m128 jxB, jyB, jzB, t1B, t2B, t3B, t4B; - __m128 dxB, dyB, dzB, rsqB, rinvB, rB; - __m128 rajB, raj_invB, rai_inv2B, sk2B, lijB, dlijB, duijB; - __m128 uijB, lij2B, uij2B, lij3B, uij3B, diff2B; - __m128 lij_invB, sk2_invB, prodB; - __m128 sk_ajB, sk2_ajB, sk2_rinvB; - __m128 dadx1B, dadx2B; - __m128 logtermB; - __m128 obc_mask1B, obc_mask2B, obc_mask3B; - - __m128 mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) ); - __m128 mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) ); - __m128 mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) ); - - __m128 oneeighth = _mm_set1_ps(0.125); - __m128 onefourth = _mm_set1_ps(0.25); - - const __m128 half = _mm_set1_ps(0.5f); - const __m128 three = _mm_set1_ps(3.0f); - const __m128 one = _mm_set1_ps(1.0f); - const __m128 two = _mm_set1_ps(2.0f); - const __m128 zero = _mm_set1_ps(0.0f); - const __m128 neg = _mm_set1_ps(-1.0f); - - /* Set the dielectric offset */ - doffset = born->gb_doffset; - gb_radius = born->gb_radius; - obc_param = born->param; - work = born->gpol_hct_work; - jjnr = nl->jjnr; - dadx = fr->dadx; - shiftvec = fr->shift_vec[0]; - - jx = _mm_setzero_ps(); - jy = _mm_setzero_ps(); - jz = _mm_setzero_ps(); - - jnrA = jnrB = jnrC = jnrD = 0; - - for (i = 0; i < born->nr; i++) - { - work[i] = 0; - } - - for (i = 0; i < nl->nri; i++) - { - ii = nl->iinr[i]; - ii3 = ii*3; - is3 = 3*nl->shift[i]; - shX = shiftvec[is3]; - shY = shiftvec[is3+1]; - shZ = shiftvec[is3+2]; - nj0 = nl->jindex[i]; - nj1 = nl->jindex[i+1]; - - ix = _mm_set1_ps(shX+x[ii3+0]); - iy = 
_mm_set1_ps(shY+x[ii3+1]); - iz = _mm_set1_ps(shZ+x[ii3+2]); - - offset = (nj1-nj0)%4; - - rai = _mm_load1_ps(gb_radius+ii); - rai_inv = gmx_mm_inv_ps(rai); - - sum_ai = _mm_setzero_ps(); - - sk_ai = _mm_load1_ps(born->param+ii); - sk2_ai = _mm_mul_ps(sk_ai, sk_ai); - - for (k = nj0; k < nj1-4-offset; k += 8) - { - jnrA = jjnr[k]; - jnrB = jjnr[k+1]; - jnrC = jjnr[k+2]; - jnrD = jjnr[k+3]; - jnrE = jjnr[k+4]; - jnrF = jjnr[k+5]; - jnrG = jjnr[k+6]; - jnrH = jjnr[k+7]; - - j3A = 3*jnrA; - j3B = 3*jnrB; - j3C = 3*jnrC; - j3D = 3*jnrD; - j3E = 3*jnrE; - j3F = 3*jnrF; - j3G = 3*jnrG; - j3H = 3*jnrH; - - GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz); - GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E, x+j3F, x+j3G, x+j3H, jxB, jyB, jzB); - GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj); - GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE, gb_radius+jnrF, gb_radius+jnrG, gb_radius+jnrH, rajB); - GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, obc_param+jnrD, sk_aj); - GMX_MM_LOAD_4VALUES_PS(obc_param+jnrE, obc_param+jnrF, obc_param+jnrG, obc_param+jnrH, sk_ajB); - - dx = _mm_sub_ps(ix, jx); - dy = _mm_sub_ps(iy, jy); - dz = _mm_sub_ps(iz, jz); - dxB = _mm_sub_ps(ix, jxB); - dyB = _mm_sub_ps(iy, jyB); - dzB = _mm_sub_ps(iz, jzB); - - rsq = gmx_mm_calc_rsq_ps(dx, dy, dz); - rsqB = gmx_mm_calc_rsq_ps(dxB, dyB, dzB); - - rinv = gmx_mm_invsqrt_ps(rsq); - r = _mm_mul_ps(rsq, rinv); - rinvB = gmx_mm_invsqrt_ps(rsqB); - rB = _mm_mul_ps(rsqB, rinvB); - - /* Compute raj_inv aj1-4 */ - raj_inv = gmx_mm_inv_ps(raj); - raj_invB = gmx_mm_inv_ps(rajB); - - /* Evaluate influence of atom aj -> ai */ - t1 = _mm_add_ps(r, sk_aj); - t2 = _mm_sub_ps(r, sk_aj); - t3 = _mm_sub_ps(sk_aj, r); - t1B = _mm_add_ps(rB, sk_ajB); - t2B = _mm_sub_ps(rB, sk_ajB); - t3B = _mm_sub_ps(sk_ajB, rB); - obc_mask1 = _mm_cmplt_ps(rai, t1); - obc_mask2 = _mm_cmplt_ps(rai, t2); - obc_mask3 = _mm_cmplt_ps(rai, t3); - obc_mask1B = _mm_cmplt_ps(rai, t1B); - obc_mask2B = _mm_cmplt_ps(rai, t2B); - obc_mask3B = _mm_cmplt_ps(rai, t3B); - - uij = gmx_mm_inv_ps(t1); - lij = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)), - _mm_andnot_ps(obc_mask2, rai_inv)); - dlij = _mm_and_ps(one, obc_mask2); - uij2 = _mm_mul_ps(uij, uij); - uij3 = _mm_mul_ps(uij2, uij); - lij2 = _mm_mul_ps(lij, lij); - lij3 = _mm_mul_ps(lij2, lij); - - uijB = gmx_mm_inv_ps(t1B); - lijB = _mm_or_ps( _mm_and_ps(obc_mask2B, gmx_mm_inv_ps(t2B)), - _mm_andnot_ps(obc_mask2B, rai_inv)); - dlijB = _mm_and_ps(one, obc_mask2B); - uij2B = _mm_mul_ps(uijB, uijB); - uij3B = _mm_mul_ps(uij2B, uijB); - lij2B = _mm_mul_ps(lijB, lijB); - lij3B = _mm_mul_ps(lij2B, lijB); - - diff2 = _mm_sub_ps(uij2, lij2); - lij_inv = gmx_mm_invsqrt_ps(lij2); - sk2_aj = _mm_mul_ps(sk_aj, sk_aj); - sk2_rinv = _mm_mul_ps(sk2_aj, rinv); - prod = _mm_mul_ps(onefourth, sk2_rinv); - - diff2B = _mm_sub_ps(uij2B, lij2B); - lij_invB = gmx_mm_invsqrt_ps(lij2B); - sk2_ajB = _mm_mul_ps(sk_ajB, sk_ajB); - sk2_rinvB = _mm_mul_ps(sk2_ajB, rinvB); - prodB = _mm_mul_ps(onefourth, sk2_rinvB); - - logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv)); - logtermB = gmx_mm_log_ps(_mm_mul_ps(uijB, lij_invB)); - - t1 = _mm_sub_ps(lij, uij); - t2 = _mm_mul_ps(diff2, - _mm_sub_ps(_mm_mul_ps(onefourth, r), - prod)); - t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm)); - t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3)); - t4 = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij)); - t4 = _mm_and_ps(t4, obc_mask3); - t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4)); - - t1B = _mm_sub_ps(lijB, uijB); 
- t2B = _mm_mul_ps(diff2B, - _mm_sub_ps(_mm_mul_ps(onefourth, rB), - prodB)); - t3B = _mm_mul_ps(half, _mm_mul_ps(rinvB, logtermB)); - t1B = _mm_add_ps(t1B, _mm_add_ps(t2B, t3B)); - t4B = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lijB)); - t4B = _mm_and_ps(t4B, obc_mask3B); - t1B = _mm_mul_ps(half, _mm_add_ps(t1B, t4B)); - - sum_ai = _mm_add_ps(sum_ai, _mm_add_ps( _mm_and_ps(t1, obc_mask1), _mm_and_ps(t1B, obc_mask1B) )); - - t1 = _mm_add_ps(_mm_mul_ps(half, lij2), - _mm_mul_ps(prod, lij3)); - t1 = _mm_sub_ps(t1, - _mm_mul_ps(onefourth, - _mm_add_ps(_mm_mul_ps(lij, rinv), - _mm_mul_ps(lij3, r)))); - t2 = _mm_mul_ps(onefourth, - _mm_add_ps(_mm_mul_ps(uij, rinv), - _mm_mul_ps(uij3, r))); - t2 = _mm_sub_ps(t2, - _mm_add_ps(_mm_mul_ps(half, uij2), - _mm_mul_ps(prod, uij3))); - t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm), - _mm_mul_ps(rinv, rinv)); - t3 = _mm_sub_ps(t3, - _mm_mul_ps(_mm_mul_ps(diff2, oneeighth), - _mm_add_ps(one, - _mm_mul_ps(sk2_rinv, rinv)))); - t1 = _mm_mul_ps(rinv, - _mm_add_ps(_mm_mul_ps(dlij, t1), - _mm_add_ps(t2, t3))); - - - - t1B = _mm_add_ps(_mm_mul_ps(half, lij2B), - _mm_mul_ps(prodB, lij3B)); - t1B = _mm_sub_ps(t1B, - _mm_mul_ps(onefourth, - _mm_add_ps(_mm_mul_ps(lijB, rinvB), - _mm_mul_ps(lij3B, rB)))); - t2B = _mm_mul_ps(onefourth, - _mm_add_ps(_mm_mul_ps(uijB, rinvB), - _mm_mul_ps(uij3B, rB))); - t2B = _mm_sub_ps(t2B, - _mm_add_ps(_mm_mul_ps(half, uij2B), - _mm_mul_ps(prodB, uij3B))); - t3B = _mm_mul_ps(_mm_mul_ps(onefourth, logtermB), - _mm_mul_ps(rinvB, rinvB)); - t3B = _mm_sub_ps(t3B, - _mm_mul_ps(_mm_mul_ps(diff2B, oneeighth), - _mm_add_ps(one, - _mm_mul_ps(sk2_rinvB, rinvB)))); - t1B = _mm_mul_ps(rinvB, - _mm_add_ps(_mm_mul_ps(dlijB, t1B), - _mm_add_ps(t2B, t3B))); - - dadx1 = _mm_and_ps(t1, obc_mask1); - dadx1B = _mm_and_ps(t1B, obc_mask1B); - - - /* Evaluate influence of atom ai -> aj */ - t1 = _mm_add_ps(r, sk_ai); - t2 = _mm_sub_ps(r, sk_ai); - t3 = _mm_sub_ps(sk_ai, r); - t1B = _mm_add_ps(rB, sk_ai); - t2B = _mm_sub_ps(rB, sk_ai); - t3B = _mm_sub_ps(sk_ai, rB); - obc_mask1 = _mm_cmplt_ps(raj, t1); - obc_mask2 = _mm_cmplt_ps(raj, t2); - obc_mask3 = _mm_cmplt_ps(raj, t3); - obc_mask1B = _mm_cmplt_ps(rajB, t1B); - obc_mask2B = _mm_cmplt_ps(rajB, t2B); - obc_mask3B = _mm_cmplt_ps(rajB, t3B); - - uij = gmx_mm_inv_ps(t1); - lij = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)), - _mm_andnot_ps(obc_mask2, raj_inv)); - dlij = _mm_and_ps(one, obc_mask2); - uij2 = _mm_mul_ps(uij, uij); - uij3 = _mm_mul_ps(uij2, uij); - lij2 = _mm_mul_ps(lij, lij); - lij3 = _mm_mul_ps(lij2, lij); - - uijB = gmx_mm_inv_ps(t1B); - lijB = _mm_or_ps( _mm_and_ps(obc_mask2B, gmx_mm_inv_ps(t2B)), - _mm_andnot_ps(obc_mask2B, raj_invB)); - dlijB = _mm_and_ps(one, obc_mask2B); - uij2B = _mm_mul_ps(uijB, uijB); - uij3B = _mm_mul_ps(uij2B, uijB); - lij2B = _mm_mul_ps(lijB, lijB); - lij3B = _mm_mul_ps(lij2B, lijB); - - diff2 = _mm_sub_ps(uij2, lij2); - lij_inv = gmx_mm_invsqrt_ps(lij2); - sk2_rinv = _mm_mul_ps(sk2_ai, rinv); - prod = _mm_mul_ps(onefourth, sk2_rinv); - - diff2B = _mm_sub_ps(uij2B, lij2B); - lij_invB = gmx_mm_invsqrt_ps(lij2B); - sk2_rinvB = _mm_mul_ps(sk2_ai, rinvB); - prodB = _mm_mul_ps(onefourth, sk2_rinvB); - - logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv)); - logtermB = gmx_mm_log_ps(_mm_mul_ps(uijB, lij_invB)); - - t1 = _mm_sub_ps(lij, uij); - t2 = _mm_mul_ps(diff2, - _mm_sub_ps(_mm_mul_ps(onefourth, r), - prod)); - t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm)); - t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3)); - t4 = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij)); - t4 = 
_mm_and_ps(t4, obc_mask3); - t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4)); - - t1B = _mm_sub_ps(lijB, uijB); - t2B = _mm_mul_ps(diff2B, - _mm_sub_ps(_mm_mul_ps(onefourth, rB), - prodB)); - t3B = _mm_mul_ps(half, _mm_mul_ps(rinvB, logtermB)); - t1B = _mm_add_ps(t1B, _mm_add_ps(t2B, t3B)); - t4B = _mm_mul_ps(two, _mm_sub_ps(raj_invB, lijB)); - t4B = _mm_and_ps(t4B, obc_mask3B); - t1B = _mm_mul_ps(half, _mm_add_ps(t1B, t4B)); - - GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_and_ps(t1, obc_mask1)); - GMX_MM_INCREMENT_4VALUES_PS(work+jnrE, work+jnrF, work+jnrG, work+jnrH, _mm_and_ps(t1B, obc_mask1B)); - - t1 = _mm_add_ps(_mm_mul_ps(half, lij2), - _mm_mul_ps(prod, lij3)); - t1 = _mm_sub_ps(t1, - _mm_mul_ps(onefourth, - _mm_add_ps(_mm_mul_ps(lij, rinv), - _mm_mul_ps(lij3, r)))); - t2 = _mm_mul_ps(onefourth, - _mm_add_ps(_mm_mul_ps(uij, rinv), - _mm_mul_ps(uij3, r))); - t2 = _mm_sub_ps(t2, - _mm_add_ps(_mm_mul_ps(half, uij2), - _mm_mul_ps(prod, uij3))); - t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm), - _mm_mul_ps(rinv, rinv)); - t3 = _mm_sub_ps(t3, - _mm_mul_ps(_mm_mul_ps(diff2, oneeighth), - _mm_add_ps(one, - _mm_mul_ps(sk2_rinv, rinv)))); - t1 = _mm_mul_ps(rinv, - _mm_add_ps(_mm_mul_ps(dlij, t1), - _mm_add_ps(t2, t3))); - - - t1B = _mm_add_ps(_mm_mul_ps(half, lij2B), - _mm_mul_ps(prodB, lij3B)); - t1B = _mm_sub_ps(t1B, - _mm_mul_ps(onefourth, - _mm_add_ps(_mm_mul_ps(lijB, rinvB), - _mm_mul_ps(lij3B, rB)))); - t2B = _mm_mul_ps(onefourth, - _mm_add_ps(_mm_mul_ps(uijB, rinvB), - _mm_mul_ps(uij3B, rB))); - t2B = _mm_sub_ps(t2B, - _mm_add_ps(_mm_mul_ps(half, uij2B), - _mm_mul_ps(prodB, uij3B))); - t3B = _mm_mul_ps(_mm_mul_ps(onefourth, logtermB), - _mm_mul_ps(rinvB, rinvB)); - t3B = _mm_sub_ps(t3B, - _mm_mul_ps(_mm_mul_ps(diff2B, oneeighth), - _mm_add_ps(one, - _mm_mul_ps(sk2_rinvB, rinvB)))); - t1B = _mm_mul_ps(rinvB, - _mm_add_ps(_mm_mul_ps(dlijB, t1B), - _mm_add_ps(t2B, t3B))); - - - dadx2 = _mm_and_ps(t1, obc_mask1); - dadx2B = _mm_and_ps(t1B, obc_mask1B); - - _mm_store_ps(dadx, dadx1); - dadx += 4; - _mm_store_ps(dadx, dadx2); - dadx += 4; - _mm_store_ps(dadx, dadx1B); - dadx += 4; - _mm_store_ps(dadx, dadx2B); - dadx += 4; - - } /* end normal inner loop */ - - for (; k < nj1-offset; k += 4) - { - jnrA = jjnr[k]; - jnrB = jjnr[k+1]; - jnrC = jjnr[k+2]; - jnrD = jjnr[k+3]; - - j3A = 3*jnrA; - j3B = 3*jnrB; - j3C = 3*jnrC; - j3D = 3*jnrD; - - GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz); - GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj); - GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, obc_param+jnrD, sk_aj); - - dx = _mm_sub_ps(ix, jx); - dy = _mm_sub_ps(iy, jy); - dz = _mm_sub_ps(iz, jz); - - rsq = gmx_mm_calc_rsq_ps(dx, dy, dz); - - rinv = gmx_mm_invsqrt_ps(rsq); - r = _mm_mul_ps(rsq, rinv); - - /* Compute raj_inv aj1-4 */ - raj_inv = gmx_mm_inv_ps(raj); - - /* Evaluate influence of atom aj -> ai */ - t1 = _mm_add_ps(r, sk_aj); - obc_mask1 = _mm_cmplt_ps(rai, t1); - - if (_mm_movemask_ps(obc_mask1)) - { - /* If any of the elements has rai aj */ - t1 = _mm_add_ps(r, sk_ai); - obc_mask1 = _mm_cmplt_ps(raj, t1); - - if (_mm_movemask_ps(obc_mask1)) - { - t2 = _mm_sub_ps(r, sk_ai); - t3 = _mm_sub_ps(sk_ai, r); - obc_mask2 = _mm_cmplt_ps(raj, t2); - obc_mask3 = _mm_cmplt_ps(raj, t3); - - uij = gmx_mm_inv_ps(t1); - lij = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)), - _mm_andnot_ps(obc_mask2, raj_inv)); - dlij = _mm_and_ps(one, obc_mask2); - uij2 = _mm_mul_ps(uij, uij); - 
uij3 = _mm_mul_ps(uij2, uij); - lij2 = _mm_mul_ps(lij, lij); - lij3 = _mm_mul_ps(lij2, lij); - diff2 = _mm_sub_ps(uij2, lij2); - lij_inv = gmx_mm_invsqrt_ps(lij2); - sk2_rinv = _mm_mul_ps(sk2_ai, rinv); - prod = _mm_mul_ps(onefourth, sk2_rinv); - logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv)); - t1 = _mm_sub_ps(lij, uij); - t2 = _mm_mul_ps(diff2, - _mm_sub_ps(_mm_mul_ps(onefourth, r), - prod)); - t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm)); - t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3)); - t4 = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij)); - t4 = _mm_and_ps(t4, obc_mask3); - t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4)); - - GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_and_ps(t1, obc_mask1)); - - t1 = _mm_add_ps(_mm_mul_ps(half, lij2), - _mm_mul_ps(prod, lij3)); - t1 = _mm_sub_ps(t1, - _mm_mul_ps(onefourth, - _mm_add_ps(_mm_mul_ps(lij, rinv), - _mm_mul_ps(lij3, r)))); - t2 = _mm_mul_ps(onefourth, - _mm_add_ps(_mm_mul_ps(uij, rinv), - _mm_mul_ps(uij3, r))); - t2 = _mm_sub_ps(t2, - _mm_add_ps(_mm_mul_ps(half, uij2), - _mm_mul_ps(prod, uij3))); - t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm), - _mm_mul_ps(rinv, rinv)); - t3 = _mm_sub_ps(t3, - _mm_mul_ps(_mm_mul_ps(diff2, oneeighth), - _mm_add_ps(one, - _mm_mul_ps(sk2_rinv, rinv)))); - t1 = _mm_mul_ps(rinv, - _mm_add_ps(_mm_mul_ps(dlij, t1), - _mm_add_ps(t2, t3))); - dadx2 = _mm_and_ps(t1, obc_mask1); - } - else - { - dadx2 = _mm_setzero_ps(); - } - - _mm_store_ps(dadx, dadx1); - dadx += 4; - _mm_store_ps(dadx, dadx2); - dadx += 4; - } /* end normal inner loop */ - - if (offset != 0) - { - if (offset == 1) - { - jnrA = jjnr[k]; - j3A = 3*jnrA; - GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz); - GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA, raj); - GMX_MM_LOAD_1VALUE_PS(obc_param+jnrA, sk_aj); - mask = mask1; - } - else if (offset == 2) - { - jnrA = jjnr[k]; - jnrB = jjnr[k+1]; - j3A = 3*jnrA; - j3B = 3*jnrB; - GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz); - GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, raj); - GMX_MM_LOAD_2VALUES_PS(obc_param+jnrA, obc_param+jnrB, sk_aj); - mask = mask2; - } - else - { - /* offset must be 3 */ - jnrA = jjnr[k]; - jnrB = jjnr[k+1]; - jnrC = jjnr[k+2]; - j3A = 3*jnrA; - j3B = 3*jnrB; - j3C = 3*jnrC; - GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz); - GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, raj); - GMX_MM_LOAD_3VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, sk_aj); - mask = mask3; - } - - dx = _mm_sub_ps(ix, jx); - dy = _mm_sub_ps(iy, jy); - dz = _mm_sub_ps(iz, jz); - - rsq = gmx_mm_calc_rsq_ps(dx, dy, dz); - - rinv = gmx_mm_invsqrt_ps(rsq); - r = _mm_mul_ps(rsq, rinv); - - /* Compute raj_inv aj1-4 */ - raj_inv = gmx_mm_inv_ps(raj); - - /* Evaluate influence of atom aj -> ai */ - t1 = _mm_add_ps(r, sk_aj); - obc_mask1 = _mm_cmplt_ps(rai, t1); - obc_mask1 = _mm_and_ps(obc_mask1, mask); - - if (_mm_movemask_ps(obc_mask1)) - { - t2 = _mm_sub_ps(r, sk_aj); - t3 = _mm_sub_ps(sk_aj, r); - obc_mask2 = _mm_cmplt_ps(rai, t2); - obc_mask3 = _mm_cmplt_ps(rai, t3); - - uij = gmx_mm_inv_ps(t1); - lij = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)), - _mm_andnot_ps(obc_mask2, rai_inv)); - dlij = _mm_and_ps(one, obc_mask2); - uij2 = _mm_mul_ps(uij, uij); - uij3 = _mm_mul_ps(uij2, uij); - lij2 = _mm_mul_ps(lij, lij); - lij3 = _mm_mul_ps(lij2, lij); - diff2 = _mm_sub_ps(uij2, lij2); - lij_inv = gmx_mm_invsqrt_ps(lij2); - sk2_aj = _mm_mul_ps(sk_aj, sk_aj); - sk2_rinv = _mm_mul_ps(sk2_aj, rinv); - prod = _mm_mul_ps(onefourth, 
sk2_rinv); - logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv)); - t1 = _mm_sub_ps(lij, uij); - t2 = _mm_mul_ps(diff2, - _mm_sub_ps(_mm_mul_ps(onefourth, r), - prod)); - t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm)); - t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3)); - t4 = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij)); - t4 = _mm_and_ps(t4, obc_mask3); - t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4)); - sum_ai = _mm_add_ps(sum_ai, _mm_and_ps(t1, obc_mask1)); - t1 = _mm_add_ps(_mm_mul_ps(half, lij2), - _mm_mul_ps(prod, lij3)); - t1 = _mm_sub_ps(t1, - _mm_mul_ps(onefourth, - _mm_add_ps(_mm_mul_ps(lij, rinv), - _mm_mul_ps(lij3, r)))); - t2 = _mm_mul_ps(onefourth, - _mm_add_ps(_mm_mul_ps(uij, rinv), - _mm_mul_ps(uij3, r))); - t2 = _mm_sub_ps(t2, - _mm_add_ps(_mm_mul_ps(half, uij2), - _mm_mul_ps(prod, uij3))); - t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm), - _mm_mul_ps(rinv, rinv)); - t3 = _mm_sub_ps(t3, - _mm_mul_ps(_mm_mul_ps(diff2, oneeighth), - _mm_add_ps(one, - _mm_mul_ps(sk2_rinv, rinv)))); - t1 = _mm_mul_ps(rinv, - _mm_add_ps(_mm_mul_ps(dlij, t1), - _mm_add_ps(t2, t3))); - dadx1 = _mm_and_ps(t1, obc_mask1); - } - else - { - dadx1 = _mm_setzero_ps(); - } - - /* Evaluate influence of atom ai -> aj */ - t1 = _mm_add_ps(r, sk_ai); - obc_mask1 = _mm_cmplt_ps(raj, t1); - obc_mask1 = _mm_and_ps(obc_mask1, mask); - - if (_mm_movemask_ps(obc_mask1)) - { - t2 = _mm_sub_ps(r, sk_ai); - t3 = _mm_sub_ps(sk_ai, r); - obc_mask2 = _mm_cmplt_ps(raj, t2); - obc_mask3 = _mm_cmplt_ps(raj, t3); - - uij = gmx_mm_inv_ps(t1); - lij = _mm_or_ps(_mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)), - _mm_andnot_ps(obc_mask2, raj_inv)); - dlij = _mm_and_ps(one, obc_mask2); - uij2 = _mm_mul_ps(uij, uij); - uij3 = _mm_mul_ps(uij2, uij); - lij2 = _mm_mul_ps(lij, lij); - lij3 = _mm_mul_ps(lij2, lij); - diff2 = _mm_sub_ps(uij2, lij2); - lij_inv = gmx_mm_invsqrt_ps(lij2); - sk2_rinv = _mm_mul_ps(sk2_ai, rinv); - prod = _mm_mul_ps(onefourth, sk2_rinv); - logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv)); - t1 = _mm_sub_ps(lij, uij); - t2 = _mm_mul_ps(diff2, - _mm_sub_ps(_mm_mul_ps(onefourth, r), - prod)); - t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm)); - t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3)); - t4 = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij)); - t4 = _mm_and_ps(t4, obc_mask3); - t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4)); - - tmp = _mm_and_ps(t1, obc_mask1); - - t1 = _mm_add_ps(_mm_mul_ps(half, lij2), - _mm_mul_ps(prod, lij3)); - t1 = _mm_sub_ps(t1, - _mm_mul_ps(onefourth, - _mm_add_ps(_mm_mul_ps(lij, rinv), - _mm_mul_ps(lij3, r)))); - t2 = _mm_mul_ps(onefourth, - _mm_add_ps(_mm_mul_ps(uij, rinv), - _mm_mul_ps(uij3, r))); - t2 = _mm_sub_ps(t2, - _mm_add_ps(_mm_mul_ps(half, uij2), - _mm_mul_ps(prod, uij3))); - t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm), - _mm_mul_ps(rinv, rinv)); - t3 = _mm_sub_ps(t3, - _mm_mul_ps(_mm_mul_ps(diff2, oneeighth), - _mm_add_ps(one, - _mm_mul_ps(sk2_rinv, rinv)))); - t1 = _mm_mul_ps(rinv, - _mm_add_ps(_mm_mul_ps(dlij, t1), - _mm_add_ps(t2, t3))); - dadx2 = _mm_and_ps(t1, obc_mask1); - } - else - { - dadx2 = _mm_setzero_ps(); - tmp = _mm_setzero_ps(); - } - - _mm_store_ps(dadx, dadx1); - dadx += 4; - _mm_store_ps(dadx, dadx2); - dadx += 4; - - if (offset == 1) - { - GMX_MM_INCREMENT_1VALUE_PS(work+jnrA, tmp); - } - else if (offset == 2) - { - GMX_MM_INCREMENT_2VALUES_PS(work+jnrA, work+jnrB, tmp); - } - else - { - /* offset must be 3 */ - GMX_MM_INCREMENT_3VALUES_PS(work+jnrA, work+jnrB, work+jnrC, tmp); - } - - } - GMX_MM_UPDATE_1POT_PS(sum_ai, work+ii); - - } - - /* Parallel summations */ - if 
(DOMAINDECOMP(cr)) - { - dd_atom_sum_real(cr->dd, work); - } - - if (gb_algorithm == egbHCT) - { - /* HCT */ - for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */ - { - if (born->use[i] != 0) - { - rr = top->atomtypes.gb_radius[md->typeA[i]]-doffset; - sum = 1.0/rr - work[i]; - min_rad = rr + doffset; - rad = 1.0/sum; - - born->bRad[i] = rad > min_rad ? rad : min_rad; - fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]); - } - } - - /* Extra communication required for DD */ - if (DOMAINDECOMP(cr)) - { - dd_atom_spread_real(cr->dd, born->bRad); - dd_atom_spread_real(cr->dd, fr->invsqrta); - } - } - else - { - /* OBC */ - for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */ - { - if (born->use[i] != 0) - { - rr = top->atomtypes.gb_radius[md->typeA[i]]; - rr_inv2 = 1.0/rr; - rr = rr-doffset; - rr_inv = 1.0/rr; - sum = rr * work[i]; - sum2 = sum * sum; - sum3 = sum2 * sum; - - tsum = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3); - born->bRad[i] = rr_inv - tsum*rr_inv2; - born->bRad[i] = 1.0 / born->bRad[i]; - - fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]); - - tchain = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2); - born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2; - } - } - /* Extra (local) communication required for DD */ - if (DOMAINDECOMP(cr)) - { - dd_atom_spread_real(cr->dd, born->bRad); - dd_atom_spread_real(cr->dd, fr->invsqrta); - dd_atom_spread_real(cr->dd, born->drobc); - } - } - - - - return 0; -} - - - -float calc_gb_chainrule_sse2_single(int natoms, t_nblist *nl, float *dadx, float *dvda, - float *x, float *f, float *fshift, float *shiftvec, - int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md) -{ - int i, k, n, ii, jnr, ii3, is3, nj0, nj1, offset, n0, n1; - int jnrA, jnrB, jnrC, jnrD; - int j3A, j3B, j3C, j3D; - int jnrE, jnrF, jnrG, jnrH; - int j3E, j3F, j3G, j3H; - int * jjnr; - - float rbi, shX, shY, shZ; - float *rb; - - __m128 ix, iy, iz; - __m128 jx, jy, jz; - __m128 jxB, jyB, jzB; - __m128 fix, fiy, fiz; - __m128 dx, dy, dz; - __m128 tx, ty, tz; - __m128 dxB, dyB, dzB; - __m128 txB, tyB, tzB; - - __m128 rbai, rbaj, rbajB, f_gb, f_gb_ai, f_gbB, f_gb_aiB; - __m128 xmm1, xmm2, xmm3; - - const __m128 two = _mm_set1_ps(2.0f); - - rb = born->work; - - jjnr = nl->jjnr; - - /* Loop to get the proper form for the Born radius term, sse style */ - offset = natoms%4; - - n0 = 0; - n1 = natoms; - - if (gb_algorithm == egbSTILL) - { - for (i = n0; i < n1; i++) - { - rbi = born->bRad[i]; - rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0; - } - } - else if (gb_algorithm == egbHCT) - { - for (i = n0; i < n1; i++) - { - rbi = born->bRad[i]; - rb[i] = rbi * rbi * dvda[i]; - } - } - else if (gb_algorithm == egbOBC) - { - for (i = n0; i < n1; i++) - { - rbi = born->bRad[i]; - rb[i] = rbi * rbi * born->drobc[i] * dvda[i]; - } - } - - jz = _mm_setzero_ps(); - - n = j3A = j3B = j3C = j3D = 0; - - for (i = 0; i < nl->nri; i++) - { - ii = nl->iinr[i]; - ii3 = ii*3; - is3 = 3*nl->shift[i]; - shX = shiftvec[is3]; - shY = shiftvec[is3+1]; - shZ = shiftvec[is3+2]; - nj0 = nl->jindex[i]; - nj1 = nl->jindex[i+1]; - - ix = _mm_set1_ps(shX+x[ii3+0]); - iy = _mm_set1_ps(shY+x[ii3+1]); - iz = _mm_set1_ps(shZ+x[ii3+2]); - - offset = (nj1-nj0)%4; - - rbai = _mm_load1_ps(rb+ii); - fix = _mm_setzero_ps(); - fiy = _mm_setzero_ps(); - fiz = _mm_setzero_ps(); - - - for (k = nj0; k < nj1-offset; k += 4) - { - jnrA = jjnr[k]; - jnrB = jjnr[k+1]; - jnrC = jjnr[k+2]; - jnrD = jjnr[k+3]; - - j3A = 3*jnrA; - j3B = 3*jnrB; - j3C = 3*jnrC; - j3D = 3*jnrD; - - 
GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz); - - dx = _mm_sub_ps(ix, jx); - dy = _mm_sub_ps(iy, jy); - dz = _mm_sub_ps(iz, jz); - - GMX_MM_LOAD_4VALUES_PS(rb+jnrA, rb+jnrB, rb+jnrC, rb+jnrD, rbaj); - - /* load chain rule terms for j1-4 */ - f_gb = _mm_load_ps(dadx); - dadx += 4; - f_gb_ai = _mm_load_ps(dadx); - dadx += 4; - - /* calculate scalar force */ - f_gb = _mm_mul_ps(f_gb, rbai); - f_gb_ai = _mm_mul_ps(f_gb_ai, rbaj); - f_gb = _mm_add_ps(f_gb, f_gb_ai); - - tx = _mm_mul_ps(f_gb, dx); - ty = _mm_mul_ps(f_gb, dy); - tz = _mm_mul_ps(f_gb, dz); - - fix = _mm_add_ps(fix, tx); - fiy = _mm_add_ps(fiy, ty); - fiz = _mm_add_ps(fiz, tz); - - GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(f+j3A, f+j3B, f+j3C, f+j3D, tx, ty, tz); - } - - /*deal with odd elements */ - if (offset != 0) - { - if (offset == 1) - { - jnrA = jjnr[k]; - j3A = 3*jnrA; - GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz); - GMX_MM_LOAD_1VALUE_PS(rb+jnrA, rbaj); - } - else if (offset == 2) - { - jnrA = jjnr[k]; - jnrB = jjnr[k+1]; - j3A = 3*jnrA; - j3B = 3*jnrB; - GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz); - GMX_MM_LOAD_2VALUES_PS(rb+jnrA, rb+jnrB, rbaj); - } - else - { - /* offset must be 3 */ - jnrA = jjnr[k]; - jnrB = jjnr[k+1]; - jnrC = jjnr[k+2]; - j3A = 3*jnrA; - j3B = 3*jnrB; - j3C = 3*jnrC; - GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz); - GMX_MM_LOAD_3VALUES_PS(rb+jnrA, rb+jnrB, rb+jnrC, rbaj); - } - - dx = _mm_sub_ps(ix, jx); - dy = _mm_sub_ps(iy, jy); - dz = _mm_sub_ps(iz, jz); - - /* load chain rule terms for j1-4 */ - f_gb = _mm_load_ps(dadx); - dadx += 4; - f_gb_ai = _mm_load_ps(dadx); - dadx += 4; - - /* calculate scalar force */ - f_gb = _mm_mul_ps(f_gb, rbai); - f_gb_ai = _mm_mul_ps(f_gb_ai, rbaj); - f_gb = _mm_add_ps(f_gb, f_gb_ai); - - tx = _mm_mul_ps(f_gb, dx); - ty = _mm_mul_ps(f_gb, dy); - tz = _mm_mul_ps(f_gb, dz); - - fix = _mm_add_ps(fix, tx); - fiy = _mm_add_ps(fiy, ty); - fiz = _mm_add_ps(fiz, tz); - - if (offset == 1) - { - GMX_MM_DECREMENT_1RVEC_1POINTER_PS(f+j3A, tx, ty, tz); - } - else if (offset == 2) - { - GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(f+j3A, f+j3B, tx, ty, tz); - } - else - { - /* offset must be 3 */ - GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(f+j3A, f+j3B, f+j3C, tx, ty, tz); - } - } - - /* fix/fiy/fiz now contain four partial force terms, that all should be - * added to the i particle forces and shift forces. - */ - gmx_mm_update_iforce_1atom_ps(&fix, &fiy, &fiz, f+ii3, fshift+is3); - } - - return 0; -} - - -#else -/* keep compiler happy */ -int genborn_sse_dummy; - -#endif /* SSE intrinsics available */ diff --git a/src/gromacs/mdlib/genborn_sse2_single.h b/src/gromacs/mdlib/genborn_sse2_single.h deleted file mode 100644 index 6753e0e17f..0000000000 --- a/src/gromacs/mdlib/genborn_sse2_single.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * This file is part of the GROMACS molecular simulation package. - * - * Copyright (c) 1991-2000, University of Groningen, The Netherlands. - * Copyright (c) 2001-2008, The GROMACS development team. - * Copyright (c) 2013,2014, by the GROMACS development team, led by - * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, - * and including many others, as listed in the AUTHORS file in the - * top-level source directory and at http://www.gromacs.org. 
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#ifndef _genborn_sse_h
-#define _genborn_sse_h
-
-#include "gromacs/legacyheaders/typedefs.h"
-
-float
-calc_gb_chainrule_sse2_single(int natoms, t_nblist *nl, float *dadx, float *dvda,
-                              float *xd, float *f, float *fshift, float *shift_vec,
-                              int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md);
-
-int
-calc_gb_rad_still_sse2_single(t_commrec *cr, t_forcerec *fr, int natoms, gmx_localtop_t *top,
-                              float *x, t_nblist *nl, gmx_genborn_t *born);
-
-int
-calc_gb_rad_hct_obc_sse2_single(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
-                                float *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm);
-
-#endif /* _genborn_sse_h */
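
The SSE2 intrinsics in the removed kernels obscure what is, per atom, fairly simple arithmetic. The radius kernels accumulate a pairwise descreening sum for every atom and then convert it into an effective Born radius (for OBC via a tanh polynomial), while the chain-rule kernels first fold dV/da into an algorithm-dependent prefactor rb[i] that later multiplies the stored dadx terms inside the pair loop. The plain-C sketch below shows just those two scalar steps; the function and type names, the quoted 1/(4*pi*eps0) value, the OBC parameters, and the numbers in main() are illustrative assumptions rather than code taken from the deleted files, whose generic scalar counterparts remain in genborn.c.

#include <math.h>
#include <stdio.h>

/* 1/(4 pi eps0) in GROMACS units (kJ mol^-1 nm e^-2); value quoted here only
 * so the sketch is self-contained. */
#define ONE_4PI_EPS0_SKETCH 138.935485

typedef enum { GB_STILL_SKETCH, GB_HCT_SKETCH, GB_OBC_SKETCH } gb_algo_sketch_t;

/* OBC step: rescale the accumulated pairwise sum psi with a tanh polynomial
 * and invert to obtain the effective Born radius, as in the scalar tail of
 * the HCT/OBC radius kernels above. */
static double obc_born_radius(double rvdw, double doffset, double psi,
                              double alpha, double beta, double gamma)
{
    double rr   = rvdw - doffset;   /* dielectric-offset radius */
    double sum  = rr * psi;
    double sum2 = sum * sum;
    double tsum = tanh(alpha*sum - beta*sum2 + gamma*sum2*sum);

    return 1.0/(1.0/rr - tsum/rvdw);
}

/* Chain-rule step: convert dV/da into the prefactor rb[i] that multiplies the
 * stored dadx terms in the force loop; only this scaling depends on the GB
 * algorithm. */
static double chainrule_prefactor(gb_algo_sketch_t algo, double brad,
                                  double dvda, double drobc)
{
    switch (algo)
    {
        case GB_STILL_SKETCH: return 2.0*brad*brad*dvda/ONE_4PI_EPS0_SKETCH;
        case GB_OBC_SKETCH:   return brad*brad*drobc*dvda;
        default:              return brad*brad*dvda;   /* HCT */
    }
}

int main(void)
{
    /* Illustrative numbers only; 1.0/0.8/4.85 are the published OBC(II)
     * parameters, assumed here rather than read from a topology. */
    double brad = obc_born_radius(0.15, 0.009, 2.0, 1.0, 0.8, 4.85);
    double rb   = chainrule_prefactor(GB_OBC_SKETCH, brad, -50.0, 0.9);

    printf("Born radius %.4f nm, chain-rule prefactor %.4f\n", brad, rb);
    return 0;
}

Precomputing rb[i] outside the pair loop is also why the algorithm switch appears only once per call in the removed chain-rule kernels: the inner SIMD loop then needs no per-pair branching on gb_algorithm.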