*
* Copyright (c) 1991-2000, University of Groningen, The Netherlands.
* Copyright (c) 2001-2008, The GROMACS development team.
- * Copyright (c) 2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include "gromacs/legacyheaders/types/commrec.h"
#include "gromacs/math/units.h"
#include "gromacs/math/vec.h"
+#include "gromacs/mdlib/genborn_allvsall.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/pbcutil/mshift.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/smalloc.h"
-#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER
-# ifdef GMX_DOUBLE
-# include "gromacs/mdlib/genborn_allvsall_sse2_double.h"
-# include "gromacs/mdlib/genborn_sse2_double.h"
-# else
-# include "gromacs/mdlib/genborn_allvsall_sse2_single.h"
-# include "gromacs/mdlib/genborn_sse2_single.h"
-# endif /* GMX_DOUBLE */
-#endif /* SSE or AVX present */
-
-#include "gromacs/mdlib/genborn_allvsall.h"
-
-/*#define DISABLE_SSE*/
typedef struct {
int shift;
if (ir->gb_algorithm == egbSTILL)
{
-#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
- if (fr->use_simd_kernels)
- {
-# ifdef GMX_DOUBLE
- genborn_allvsall_calc_still_radii_sse2_double(fr, md, born, top, x[0], cr, &fr->AllvsAll_workgb);
-# else
- genborn_allvsall_calc_still_radii_sse2_single(fr, md, born, top, x[0], cr, &fr->AllvsAll_workgb);
-# endif
- }
- else
- {
- genborn_allvsall_calc_still_radii(fr, md, born, top, x[0], cr, &fr->AllvsAll_workgb);
- }
-#else
genborn_allvsall_calc_still_radii(fr, md, born, top, x[0], &fr->AllvsAll_workgb);
-#endif
/* 13 flops in outer loop, 47 flops in inner loop */
inc_nrnb(nrnb, eNR_BORN_AVA_RADII_STILL, md->homenr*13+cnt*47);
}
else if (ir->gb_algorithm == egbHCT || ir->gb_algorithm == egbOBC)
{
-#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
- if (fr->use_simd_kernels)
- {
-# ifdef GMX_DOUBLE
- genborn_allvsall_calc_hct_obc_radii_sse2_double(fr, md, born, ir->gb_algorithm, top, x[0], cr, &fr->AllvsAll_workgb);
-# else
- genborn_allvsall_calc_hct_obc_radii_sse2_single(fr, md, born, ir->gb_algorithm, top, x[0], cr, &fr->AllvsAll_workgb);
-# endif
- }
- else
- {
- genborn_allvsall_calc_hct_obc_radii(fr, md, born, ir->gb_algorithm, top, x[0], cr, &fr->AllvsAll_workgb);
- }
-#else
genborn_allvsall_calc_hct_obc_radii(fr, md, born, ir->gb_algorithm, top, x[0], &fr->AllvsAll_workgb);
-#endif
/* 24 flops in outer loop, 183 in inner */
inc_nrnb(nrnb, eNR_BORN_AVA_RADII_HCT_OBC, md->homenr*24+cnt*183);
}
/* Switch for determining which algorithm to use for Born radii calculation */
#ifdef GMX_DOUBLE
-#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
- /* x86 or x86-64 with GCC inline assembly and/or SSE intrinsics */
- switch (ir->gb_algorithm)
- {
- case egbSTILL:
- if (fr->use_simd_kernels)
- {
- calc_gb_rad_still_sse2_double(cr, fr, born->nr, top, atype, x[0], nl, born);
- }
- else
- {
- calc_gb_rad_still(cr, fr, top, x, nl, born, md);
- }
- break;
- case egbHCT:
- if (fr->use_simd_kernels)
- {
- calc_gb_rad_hct_obc_sse2_double(cr, fr, born->nr, top, atype, x[0], nl, born, md, ir->gb_algorithm);
- }
- else
- {
- calc_gb_rad_hct(cr, fr, top, x, nl, born, md);
- }
- break;
- case egbOBC:
- if (fr->use_simd_kernels)
- {
- calc_gb_rad_hct_obc_sse2_double(cr, fr, born->nr, top, atype, x[0], nl, born, md, ir->gb_algorithm);
- }
- else
- {
- calc_gb_rad_obc(cr, fr, born->nr, top, x, nl, born, md);
- }
- break;
-
- default:
- gmx_fatal(FARGS, "Unknown double precision sse-enabled algorithm for Born radii calculation: %d", ir->gb_algorithm);
- }
-#else
switch (ir->gb_algorithm)
{
case egbSTILL:
gmx_fatal(FARGS, "Unknown double precision algorithm for Born radii calculation: %d", ir->gb_algorithm);
}
-#endif
-
#else
-#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
- /* x86 or x86-64 with GCC inline assembly and/or SSE intrinsics */
- switch (ir->gb_algorithm)
- {
- case egbSTILL:
- if (fr->use_simd_kernels)
- {
- calc_gb_rad_still_sse2_single(cr, fr, born->nr, top, x[0], nl, born);
- }
- else
- {
- calc_gb_rad_still(cr, fr, top, x, nl, born, md);
- }
- break;
- case egbHCT:
- if (fr->use_simd_kernels)
- {
- calc_gb_rad_hct_obc_sse2_single(cr, fr, born->nr, top, x[0], nl, born, md, ir->gb_algorithm);
- }
- else
- {
- calc_gb_rad_hct(cr, fr, top, x, nl, born, md);
- }
- break;
-
- case egbOBC:
- if (fr->use_simd_kernels)
- {
- calc_gb_rad_hct_obc_sse2_single(cr, fr, born->nr, top, x[0], nl, born, md, ir->gb_algorithm);
- }
- else
- {
- calc_gb_rad_obc(cr, fr, born->nr, top, x, nl, born, md);
- }
- break;
-
- default:
- gmx_fatal(FARGS, "Unknown sse-enabled algorithm for Born radii calculation: %d", ir->gb_algorithm);
- }
-
-#else
switch (ir->gb_algorithm)
{
case egbSTILL:
gmx_fatal(FARGS, "Unknown algorithm for Born radii calculation: %d", ir->gb_algorithm);
}
-#endif /* Single precision sse */
-
#endif /* Double or single precision */
if (fr->bAllvsAll == FALSE)
if (fr->bAllvsAll)
{
-#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
- if (fr->use_simd_kernels)
- {
-# ifdef GMX_DOUBLE
- genborn_allvsall_calc_chainrule_sse2_double(fr, md, born, x[0], f[0], gb_algorithm, fr->AllvsAll_workgb);
-# else
- genborn_allvsall_calc_chainrule_sse2_single(fr, md, born, x[0], f[0], gb_algorithm, fr->AllvsAll_workgb);
-# endif
- }
- else
- {
- genborn_allvsall_calc_chainrule(fr, md, born, x[0], f[0], gb_algorithm, fr->AllvsAll_workgb);
- }
-#else
genborn_allvsall_calc_chainrule(fr, md, born, x[0], f[0], gb_algorithm, fr->AllvsAll_workgb);
-#endif
cnt = md->homenr*(md->nr/2+1);
/* 9 flops for outer loop, 15 for inner */
inc_nrnb(nrnb, eNR_BORN_AVA_CHAINRULE, md->homenr*9+cnt*15);
return;
}
-#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
- if (fr->use_simd_kernels)
- {
-# ifdef GMX_DOUBLE
- calc_gb_chainrule_sse2_double(fr->natoms_force, &(fr->gblist), fr->dadx, fr->dvda, x[0],
- f[0], fr->fshift[0], fr->shift_vec[0], gb_algorithm, born, md);
-# else
- calc_gb_chainrule_sse2_single(fr->natoms_force, &(fr->gblist), fr->dadx, fr->dvda, x[0],
- f[0], fr->fshift[0], fr->shift_vec[0], gb_algorithm, born, md);
-# endif
- }
- else
- {
- calc_gb_chainrule(fr->natoms_force, &(fr->gblist), fr->dadx, fr->dvda,
- x, f, fr->fshift, fr->shift_vec, gb_algorithm, born, md);
- }
-#else
calc_gb_chainrule(fr->natoms_force, &(fr->gblist), fr->dadx, fr->dvda,
x, f, fr->fshift, fr->shift_vec, gb_algorithm, born);
-#endif
if (!fr->bAllvsAll)
{
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2009, The GROMACS Development Team.
- * Copyright (c) 2012,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#include "gmxpre.h"
-
-#include <math.h>
-
-#include "gromacs/legacyheaders/genborn.h"
-#include "gromacs/legacyheaders/network.h"
-#include "gromacs/legacyheaders/types/simple.h"
-#include "gromacs/math/units.h"
-#include "gromacs/math/vec.h"
-#include "gromacs/mdlib/genborn_allvsall.h"
-#include "gromacs/utility/smalloc.h"
-
-
-#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
-
-#include <gmx_sse2_double.h>
-
-
-#define SIMD_WIDTH 2
-#define UNROLLI 2
-#define UNROLLJ 2
-
-typedef struct
-{
- int * jindex_gb;
- int ** prologue_mask_gb;
- int ** epilogue_mask;
- int * imask;
- double * gb_radius;
- double * workparam;
- double * work;
- double * x_align;
- double * y_align;
- double * z_align;
- double * fx_align;
- double * fy_align;
- double * fz_align;
-}
-gmx_allvsallgb2_data_t;
-
-
-static int
-calc_maxoffset(int i, int natoms)
-{
- int maxoffset;
-
- if ((natoms % 2) == 1)
- {
- /* Odd number of atoms, easy */
- maxoffset = natoms/2;
- }
- else if ((natoms % 4) == 0)
- {
- /* Multiple of four is hard */
- if (i < natoms/2)
- {
- if ((i % 2) == 0)
- {
- maxoffset = natoms/2;
- }
- else
- {
- maxoffset = natoms/2-1;
- }
- }
- else
- {
- if ((i % 2) == 1)
- {
- maxoffset = natoms/2;
- }
- else
- {
- maxoffset = natoms/2-1;
- }
- }
- }
- else
- {
- /* natoms is even but not a multiple of four, so natoms/2 is odd */
- if ((i % 2) == 0)
- {
- maxoffset = natoms/2;
- }
- else
- {
- maxoffset = natoms/2-1;
- }
- }
-
- return maxoffset;
-}
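-
-/* Worked example of the half-shell pairing: with natoms = 6 (natoms/2 odd)
- * maxoffset is 3,2,3,2,3,2 for i = 0..5, and with natoms = 4 (a multiple of
- * four) it is 2,1,1,2; in both cases the offsets sum to natoms*(natoms-1)/2,
- * so every i,j pair is visited exactly once.
- */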
-
-static void
-setup_gb_exclusions_and_indices(gmx_allvsallgb2_data_t * aadata,
- t_ilist * ilist,
- int start,
- int end,
- int natoms,
- gmx_bool bInclude12,
- gmx_bool bInclude13,
- gmx_bool bInclude14)
-{
- int i, j, k, tp;
- int a1, a2;
- int ni0, ni1, nj0, nj1, nj;
- int imin, imax, iexcl;
- int max_offset;
- int max_excl_offset;
- int firstinteraction;
- int ibase;
- int *pi;
-
- /* This routine can appear to be a bit complex, but it is mostly book-keeping.
- * To enable the fast all-vs-all kernel we need to be able to stream through all coordinates,
- * whether or not they should interact.
- *
- * To avoid looping over the exclusions, we create a simple mask that is 1 if the interaction
- * should be present, otherwise 0. Since exclusions typically only occur when i and j are close,
- * we create a jindex array with three elements per i atom: the starting point, the point up to
- * which we need to check exclusions, and the end point.
- * This way we only have to allocate a short exclusion mask per i atom.
- */
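-
- /* Resulting layout: for each i atom, jindex_gb holds four entries:
- *   jindex_gb[4*i]     start of the prologue (exclusion-masked) j loop,
- *   jindex_gb[4*i+1]   end of prologue / start of the exclusion-free main loop,
- *   jindex_gb[4*i+2]   end of main loop / start of the epilogue loop,
- *   jindex_gb[4*i+3]   end of the epilogue (maxoffset-only) loop,
- * i.e. the nj0..nj3 limits loaded by the kernels below.
- */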
-
- ni0 = (start/UNROLLI)*UNROLLI;
- ni1 = ((end+UNROLLI-1)/UNROLLI)*UNROLLI;
-
- /* Set the interaction mask to only enable the i atoms we want to include */
- snew(pi, 2*(natoms+UNROLLI+2*SIMD_WIDTH));
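- /* (pi + 16) & ~15 rounds the freshly allocated pointer up to a 16-byte
- * boundary so aligned SSE2 loads can be used on the mask; snew over-allocates
- * to leave room for the adjustment. */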
- aadata->imask = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
- for (i = 0; i < natoms+UNROLLI; i++)
- {
- aadata->imask[2*i] = (i >= start && i < end) ? 0xFFFFFFFF : 0;
- aadata->imask[2*i+1] = (i >= start && i < end) ? 0xFFFFFFFF : 0;
- }
-
- /* Allocate memory for our modified jindex array */
- snew(aadata->jindex_gb, 4*(natoms+UNROLLI));
- for (i = 0; i < 4*(natoms+UNROLLI); i++)
- {
- aadata->jindex_gb[i] = 0;
- }
-
- /* Create the exclusion masks for the prologue part */
- snew(aadata->prologue_mask_gb, natoms+UNROLLI); /* list of pointers */
-
- /* First zero everything to avoid uninitialized data */
- for (i = 0; i < natoms+UNROLLI; i++)
- {
- aadata->prologue_mask_gb[i] = NULL;
- }
-
- /* Calculate the largest exclusion range we need for each UNROLLI-tuplet of i atoms. */
- for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
- {
- max_excl_offset = -1;
-
- /* First find maxoffset for the next UNROLLI atoms (or fewer if we are close to the end) */
- imax = ((ibase+UNROLLI) < end) ? (ibase+UNROLLI) : end;
-
- /* Which atom is the first we (might) interact with? */
- imin = natoms; /* Guaranteed to be overwritten below by one of the 'firstinteraction' values */
- for (i = ibase; i < imax; i++)
- {
- /* Before exclusions, which atom is the first we (might) interact with? */
- firstinteraction = i+1;
- max_offset = calc_maxoffset(i, natoms);
-
- if (!bInclude12)
- {
- for (j = 0; j < ilist[F_GB12].nr; j += 3)
- {
- a1 = ilist[F_GB12].iatoms[j+1];
- a2 = ilist[F_GB12].iatoms[j+2];
-
- if (a1 == i)
- {
- k = a2;
- }
- else if (a2 == i)
- {
- k = a1;
- }
- else
- {
- continue;
- }
-
- if (k == firstinteraction)
- {
- firstinteraction++;
- }
- }
- }
- if (!bInclude13)
- {
- for (j = 0; j < ilist[F_GB13].nr; j += 3)
- {
- a1 = ilist[F_GB13].iatoms[j+1];
- a2 = ilist[F_GB13].iatoms[j+2];
-
- if (a1 == i)
- {
- k = a2;
- }
- else if (a2 == i)
- {
- k = a1;
- }
- else
- {
- continue;
- }
-
- if (k == firstinteraction)
- {
- firstinteraction++;
- }
- }
- }
- if (!bInclude14)
- {
- for (j = 0; j < ilist[F_GB14].nr; j += 3)
- {
- a1 = ilist[F_GB14].iatoms[j+1];
- a2 = ilist[F_GB14].iatoms[j+2];
- if (a1 == i)
- {
- k = a2;
- }
- else if (a2 == i)
- {
- k = a1;
- }
- else
- {
- continue;
- }
-
- if (k == firstinteraction)
- {
- firstinteraction++;
- }
- }
- }
- imin = (firstinteraction < imin) ? firstinteraction : imin;
- }
- /* round down to j unrolling factor */
- imin = (imin/UNROLLJ)*UNROLLJ;
-
- for (i = ibase; i < imax; i++)
- {
- max_offset = calc_maxoffset(i, natoms);
-
- if (!bInclude12)
- {
- for (j = 0; j < ilist[F_GB12].nr; j += 3)
- {
- a1 = ilist[F_GB12].iatoms[j+1];
- a2 = ilist[F_GB12].iatoms[j+2];
-
- if (a1 == i)
- {
- k = a2;
- }
- else if (a2 == i)
- {
- k = a1;
- }
- else
- {
- continue;
- }
-
- if (k < imin)
- {
- k += natoms;
- }
-
- if (k > i+max_offset)
- {
- continue;
- }
-
- k = k - imin;
-
- if (k+natoms <= max_offset)
- {
- k += natoms;
- }
- max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
- }
- }
- if (!bInclude13)
- {
- for (j = 0; j < ilist[F_GB13].nr; j += 3)
- {
- a1 = ilist[F_GB13].iatoms[j+1];
- a2 = ilist[F_GB13].iatoms[j+2];
-
- if (a1 == i)
- {
- k = a2;
- }
- else if (a2 == i)
- {
- k = a1;
- }
- else
- {
- continue;
- }
-
- if (k < imin)
- {
- k += natoms;
- }
-
- if (k > i+max_offset)
- {
- continue;
- }
-
- k = k - imin;
-
- if (k+natoms <= max_offset)
- {
- k += natoms;
- }
- max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
- }
- }
- if (!bInclude14)
- {
- for (j = 0; j < ilist[F_GB14].nr; j += 3)
- {
- a1 = ilist[F_GB14].iatoms[j+1];
- a2 = ilist[F_GB14].iatoms[j+2];
-
- if (a1 == i)
- {
- k = a2;
- }
- else if (a2 == i)
- {
- k = a1;
- }
- else
- {
- continue;
- }
-
- if (k < imin)
- {
- k += natoms;
- }
-
- if (k > i+max_offset)
- {
- continue;
- }
-
- k = k - imin;
-
- if (k+natoms <= max_offset)
- {
- k += natoms;
- }
- max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
- }
- }
- }
-
- /* The offset specifies the last atom to be excluded, so add one unit to get an upper loop limit */
- max_excl_offset++;
- /* round up to j unrolling factor */
- max_excl_offset = (max_excl_offset/UNROLLJ+1)*UNROLLJ;
-
- /* Set the length of all the prologue masks to this value (even for i > end) */
- for (i = ibase; i < ibase+UNROLLI; i++)
- {
- aadata->jindex_gb[4*i] = imin;
- aadata->jindex_gb[4*i+1] = imin+max_excl_offset;
- }
- }
-
- /* Now the hard part, loop over it all again to calculate the actual contents of the prologue masks */
- for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
- {
- for (i = ibase; i < ibase+UNROLLI; i++)
- {
- nj = aadata->jindex_gb[4*i+1] - aadata->jindex_gb[4*i];
- imin = aadata->jindex_gb[4*i];
-
- /* Allocate aligned memory */
- snew(pi, 2*(nj+2*SIMD_WIDTH));
- aadata->prologue_mask_gb[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
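- /* Each j entry occupies two ints (64 bits of all-ones or all-zeros), since
- * the kernels load the mask as __m128d and AND it per double-precision lane. */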
-
- max_offset = calc_maxoffset(i, natoms);
-
- /* Include interactions i+1 <= j < i+maxoffset */
- for (k = 0; k < nj; k++)
- {
- j = imin + k;
-
- if ( (j > i) && (j <= i+max_offset) )
- {
- aadata->prologue_mask_gb[i][2*k] = 0xFFFFFFFF;
- aadata->prologue_mask_gb[i][2*k+1] = 0xFFFFFFFF;
- }
- else
- {
- aadata->prologue_mask_gb[i][2*k] = 0;
- aadata->prologue_mask_gb[i][2*k+1] = 0;
- }
- }
-
- /* Clear out the explicit exclusions */
- if (i < end)
- {
- if (!bInclude12)
- {
- for (j = 0; j < ilist[F_GB12].nr; j += 3)
- {
- a1 = ilist[F_GB12].iatoms[j+1];
- a2 = ilist[F_GB12].iatoms[j+2];
-
- if (a1 == i)
- {
- k = a2;
- }
- else if (a2 == i)
- {
- k = a1;
- }
- else
- {
- continue;
- }
-
- if (k > i+max_offset)
- {
- continue;
- }
- k = k-i;
-
- if (k+natoms <= max_offset)
- {
- k += natoms;
- }
-
- k = k+i-imin;
- if (k >= 0)
- {
- aadata->prologue_mask_gb[i][2*k] = 0;
- aadata->prologue_mask_gb[i][2*k+1] = 0;
- }
- }
- }
- if (!bInclude13)
- {
- for (j = 0; j < ilist[F_GB13].nr; j += 3)
- {
- a1 = ilist[F_GB13].iatoms[j+1];
- a2 = ilist[F_GB13].iatoms[j+2];
-
- if (a1 == i)
- {
- k = a2;
- }
- else if (a2 == i)
- {
- k = a1;
- }
- else
- {
- continue;
- }
-
- if (k > i+max_offset)
- {
- continue;
- }
- k = k-i;
-
- if (k+natoms <= max_offset)
- {
- k += natoms;
- }
-
- k = k+i-imin;
- if (k >= 0)
- {
- aadata->prologue_mask_gb[i][2*k] = 0;
- aadata->prologue_mask_gb[i][2*k+1] = 0;
- }
- }
- }
- if (!bInclude14)
- {
- for (j = 0; j < ilist[F_GB14].nr; j += 3)
- {
- a1 = ilist[F_GB14].iatoms[j+1];
- a2 = ilist[F_GB14].iatoms[j+2];
-
- if (a1 == i)
- {
- k = a2;
- }
- else if (a2 == i)
- {
- k = a1;
- }
- else
- {
- continue;
- }
-
- if (k > i+max_offset)
- {
- continue;
- }
- k = k-i;
-
- if (k+natoms <= max_offset)
- {
- k += natoms;
- }
-
- k = k+i-imin;
- if (k >= 0)
- {
- aadata->prologue_mask_gb[i][2*k] = 0;
- aadata->prologue_mask_gb[i][2*k+1] = 0;
- }
- }
- }
- }
- }
- }
-
- /* Construct the epilogue mask - this just contains the check for maxoffset */
- snew(aadata->epilogue_mask, natoms+UNROLLI);
-
- /* First zero everything to avoid uninitialized data */
- for (i = 0; i < natoms+UNROLLI; i++)
- {
- aadata->jindex_gb[4*i+2] = aadata->jindex_gb[4*i+1];
- aadata->jindex_gb[4*i+3] = aadata->jindex_gb[4*i+1];
- aadata->epilogue_mask[i] = NULL;
- }
-
- for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
- {
- /* Find the lowest index for which we need to use the epilogue */
- imin = ibase;
- max_offset = calc_maxoffset(imin, natoms);
-
- imin = imin + 1 + max_offset;
-
- /* Find largest index for which we need to use the epilogue */
- imax = ibase + UNROLLI-1;
- imax = (imax < end) ? imax : end;
-
- max_offset = calc_maxoffset(imax, natoms);
- imax = imax + 1 + max_offset + UNROLLJ - 1;
-
- for (i = ibase; i < ibase+UNROLLI; i++)
- {
- /* Start of epilogue - round down to j tile limit */
- aadata->jindex_gb[4*i+2] = (imin/UNROLLJ)*UNROLLJ;
- /* Make sure we don't overlap - for small systems everything is done in the prologue */
- aadata->jindex_gb[4*i+2] = (aadata->jindex_gb[4*i+1] > aadata->jindex_gb[4*i+2]) ? aadata->jindex_gb[4*i+1] : aadata->jindex_gb[4*i+2];
- /* Round upwards to j tile limit */
- aadata->jindex_gb[4*i+3] = (imax/UNROLLJ)*UNROLLJ;
- /* Make sure we don't have a negative range for the epilogue */
- aadata->jindex_gb[4*i+3] = (aadata->jindex_gb[4*i+2] > aadata->jindex_gb[4*i+3]) ? aadata->jindex_gb[4*i+2] : aadata->jindex_gb[4*i+3];
- }
- }
-
- /* And fill it with data... */
-
- for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
- {
- for (i = ibase; i < ibase+UNROLLI; i++)
- {
-
- nj = aadata->jindex_gb[4*i+3] - aadata->jindex_gb[4*i+2];
-
- /* Allocate aligned memory */
- snew(pi, 2*(nj+2*SIMD_WIDTH));
- aadata->epilogue_mask[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
-
- max_offset = calc_maxoffset(i, natoms);
-
- for (k = 0; k < nj; k++)
- {
- j = aadata->jindex_gb[4*i+2] + k;
- aadata->epilogue_mask[i][2*k] = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
- aadata->epilogue_mask[i][2*k+1] = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
- }
- }
- }
-}
-
-
-static void
-genborn_allvsall_setup(gmx_allvsallgb2_data_t ** p_aadata,
- gmx_localtop_t * top,
- gmx_genborn_t * born,
- t_mdatoms * mdatoms,
- double radius_offset,
- int gb_algorithm,
- gmx_bool bInclude12,
- gmx_bool bInclude13,
- gmx_bool bInclude14)
-{
- int i, j, idx;
- int natoms;
- gmx_allvsallgb2_data_t *aadata;
- double *p;
-
- natoms = mdatoms->nr;
-
- snew(aadata, 1);
- *p_aadata = aadata;
-
- snew(p, 2*natoms+2*SIMD_WIDTH);
- aadata->x_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
- snew(p, 2*natoms+2*SIMD_WIDTH);
- aadata->y_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
- snew(p, 2*natoms+2*SIMD_WIDTH);
- aadata->z_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
- snew(p, 2*natoms+2*SIMD_WIDTH);
- aadata->fx_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
- snew(p, 2*natoms+2*SIMD_WIDTH);
- aadata->fy_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
- snew(p, 2*natoms+2*SIMD_WIDTH);
- aadata->fz_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
-
- snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
- aadata->gb_radius = (double *) (((size_t) p + 16) & (~((size_t) 15)));
-
- snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
- aadata->workparam = (double *) (((size_t) p + 16) & (~((size_t) 15)));
-
- snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
- aadata->work = (double *) (((size_t) p + 16) & (~((size_t) 15)));
-
- for (i = 0; i < mdatoms->nr; i++)
- {
- aadata->gb_radius[i] = top->atomtypes.gb_radius[mdatoms->typeA[i]] - radius_offset;
- if (gb_algorithm == egbSTILL)
- {
- aadata->workparam[i] = born->vsolv[i];
- }
- else if (gb_algorithm == egbOBC)
- {
- aadata->workparam[i] = born->param[i];
- }
- aadata->work[i] = 0.0;
- }
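-
- /* Replicate the per-atom parameters past natoms, so the wrapped j indices
- * used by the half-shell kernels never need a modulo operation. */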
- for (i = 0; i < mdatoms->nr; i++)
- {
- aadata->gb_radius[natoms+i] = aadata->gb_radius[i];
- aadata->workparam[natoms+i] = aadata->workparam[i];
- aadata->work[natoms+i] = aadata->work[i];
- }
-
- for (i = 0; i < 2*natoms+SIMD_WIDTH; i++)
- {
- aadata->x_align[i] = 0.0;
- aadata->y_align[i] = 0.0;
- aadata->z_align[i] = 0.0;
- aadata->fx_align[i] = 0.0;
- aadata->fy_align[i] = 0.0;
- aadata->fz_align[i] = 0.0;
- }
-
- setup_gb_exclusions_and_indices(aadata, top->idef.il, 0, mdatoms->homenr, mdatoms->nr,
- bInclude12, bInclude13, bInclude14);
-}
-
-
-/*
- * This routine apparently hits a compiler bug that Visual Studio has had
- * 'forever'; it is present in both VS2005 and VS2008, and the only workaround
- * is to decrease optimization. We do that with a pragma, and only for MSVC,
- * so it will not hurt any of the well-behaved and supported compilers out
- * there.
- */
-#ifdef _MSC_VER
-#pragma optimize("t",off)
-#endif
-
-int
-genborn_allvsall_calc_still_radii_sse2_double(t_forcerec * fr,
- t_mdatoms * mdatoms,
- gmx_genborn_t * born,
- gmx_localtop_t * top,
- double * x,
- t_commrec * cr,
- void * paadata)
-{
- gmx_allvsallgb2_data_t *aadata;
- int natoms;
- int ni0, ni1;
- int nj0, nj1, nj2, nj3;
- int i, j, k, n;
- int * mask;
- int * pmask0;
- int * pmask1;
- int * emask0;
- int * emask1;
- double ix, iy, iz;
- double jx, jy, jz;
- double dx, dy, dz;
- double rsq, rinv;
- double gpi, rai, vai;
- double prod_ai;
- double irsq, idr4, idr6;
- double raj, rvdw, ratio;
- double vaj, ccf, dccf, theta, cosq;
- double term, prod, icf4, icf6, gpi2, factor, sinq;
- double * gb_radius;
- double * vsolv;
- double * work;
- double tmpsum[2];
- double * x_align;
- double * y_align;
- double * z_align;
- int * jindex;
- double * dadx;
-
- __m128d ix_SSE0, iy_SSE0, iz_SSE0;
- __m128d ix_SSE1, iy_SSE1, iz_SSE1;
- __m128d gpi_SSE0, rai_SSE0, prod_ai_SSE0;
- __m128d gpi_SSE1, rai_SSE1, prod_ai_SSE1;
- __m128d imask_SSE0, jmask_SSE0;
- __m128d imask_SSE1, jmask_SSE1;
- __m128d jx_SSE, jy_SSE, jz_SSE;
- __m128d dx_SSE0, dy_SSE0, dz_SSE0;
- __m128d dx_SSE1, dy_SSE1, dz_SSE1;
- __m128d rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0;
- __m128d rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1;
- __m128d raj_SSE, vaj_SSE, prod_SSE;
- __m128d rvdw_SSE0, ratio_SSE0;
- __m128d rvdw_SSE1, ratio_SSE1;
- __m128d theta_SSE0, sinq_SSE0, cosq_SSE0, term_SSE0;
- __m128d theta_SSE1, sinq_SSE1, cosq_SSE1, term_SSE1;
- __m128d ccf_SSE0, dccf_SSE0;
- __m128d ccf_SSE1, dccf_SSE1;
- __m128d icf4_SSE0, icf6_SSE0;
- __m128d icf4_SSE1, icf6_SSE1;
- __m128d half_SSE, one_SSE, two_SSE, four_SSE;
- __m128d still_p4_SSE, still_p5inv_SSE, still_pip5_SSE;
-
- natoms = mdatoms->nr;
- ni0 = 0;
- ni1 = mdatoms->homenr;
-
- n = 0;
-
- aadata = *((gmx_allvsallgb2_data_t **)paadata);
-
-
- if (aadata == NULL)
- {
- genborn_allvsall_setup(&aadata, top, born, mdatoms, 0.0,
- egbSTILL, FALSE, FALSE, TRUE);
- *((gmx_allvsallgb2_data_t **)paadata) = aadata;
- }
-
- x_align = aadata->x_align;
- y_align = aadata->y_align;
- z_align = aadata->z_align;
-
- gb_radius = aadata->gb_radius;
- vsolv = aadata->workparam;
- work = aadata->work;
- jindex = aadata->jindex_gb;
- dadx = fr->dadx;
-
- still_p4_SSE = _mm_set1_pd(STILL_P4);
- still_p5inv_SSE = _mm_set1_pd(STILL_P5INV);
- still_pip5_SSE = _mm_set1_pd(STILL_PIP5);
- half_SSE = _mm_set1_pd(0.5);
- one_SSE = _mm_set1_pd(1.0);
- two_SSE = _mm_set1_pd(2.0);
- four_SSE = _mm_set1_pd(4.0);
-
- /* This will be summed, so it has to extend to natoms + buffer */
- for (i = 0; i < natoms+1+natoms/2; i++)
- {
- work[i] = 0;
- }
-
- for (i = ni0; i < ni1+1+natoms/2; i++)
- {
- k = i%natoms;
- x_align[i] = x[3*k];
- y_align[i] = x[3*k+1];
- z_align[i] = x[3*k+2];
- work[i] = 0;
- }
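-
- /* The coordinates are thus stored with period natoms, so j indices beyond
- * natoms wrap transparently onto the start of the system; contributions
- * accumulated in work[] beyond natoms are folded back after the main loop. */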
-
- for (i = ni0; i < ni1; i += UNROLLI)
- {
- /* We assume shifts are NOT used for all-vs-all interactions */
- /* Load i atom data */
- ix_SSE0 = _mm_load1_pd(x_align+i);
- iy_SSE0 = _mm_load1_pd(y_align+i);
- iz_SSE0 = _mm_load1_pd(z_align+i);
- ix_SSE1 = _mm_load1_pd(x_align+i+1);
- iy_SSE1 = _mm_load1_pd(y_align+i+1);
- iz_SSE1 = _mm_load1_pd(z_align+i+1);
-
- gpi_SSE0 = _mm_setzero_pd();
- gpi_SSE1 = _mm_setzero_pd();
-
- rai_SSE0 = _mm_load1_pd(gb_radius+i);
- rai_SSE1 = _mm_load1_pd(gb_radius+i+1);
-
- prod_ai_SSE0 = _mm_set1_pd(STILL_P4*vsolv[i]);
- prod_ai_SSE1 = _mm_set1_pd(STILL_P4*vsolv[i+1]);
-
- /* Load limits for loop over neighbors */
- nj0 = jindex[4*i];
- nj1 = jindex[4*i+1];
- nj2 = jindex[4*i+2];
- nj3 = jindex[4*i+3];
-
- pmask0 = aadata->prologue_mask_gb[i];
- pmask1 = aadata->prologue_mask_gb[i+1];
- emask0 = aadata->epilogue_mask[i];
- emask1 = aadata->epilogue_mask[i+1];
-
- imask_SSE0 = _mm_load1_pd((double *)(aadata->imask+2*i));
- imask_SSE1 = _mm_load1_pd((double *)(aadata->imask+2*i+2));
-
- /* Prologue part, including exclusion mask */
- for (j = nj0; j < nj1; j += UNROLLJ)
- {
- jmask_SSE0 = _mm_load_pd((double *)pmask0);
- jmask_SSE1 = _mm_load_pd((double *)pmask1);
- pmask0 += 2*UNROLLJ;
- pmask1 += 2*UNROLLJ;
-
- /* load j atom coordinates */
- jx_SSE = _mm_load_pd(x_align+j);
- jy_SSE = _mm_load_pd(y_align+j);
- jz_SSE = _mm_load_pd(z_align+j);
-
- /* Calculate distance */
- dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE);
- dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE);
- dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE);
- dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE);
- dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE);
- dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE);
-
- /* rsq = dx*dx+dy*dy+dz*dz */
- rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
- rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);
-
- /* Combine masks */
- jmask_SSE0 = _mm_and_pd(jmask_SSE0, imask_SSE0);
- jmask_SSE1 = _mm_and_pd(jmask_SSE1, imask_SSE1);
-
- /* Calculate 1/r and 1/r2 */
- rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
- rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);
-
- /* Apply mask */
- rinv_SSE0 = _mm_and_pd(rinv_SSE0, jmask_SSE0);
- rinv_SSE1 = _mm_and_pd(rinv_SSE1, jmask_SSE1);
-
- irsq_SSE0 = _mm_mul_pd(rinv_SSE0, rinv_SSE0);
- irsq_SSE1 = _mm_mul_pd(rinv_SSE1, rinv_SSE1);
- idr4_SSE0 = _mm_mul_pd(irsq_SSE0, irsq_SSE0);
- idr4_SSE1 = _mm_mul_pd(irsq_SSE1, irsq_SSE1);
- idr6_SSE0 = _mm_mul_pd(idr4_SSE0, irsq_SSE0);
- idr6_SSE1 = _mm_mul_pd(idr4_SSE1, irsq_SSE1);
-
- raj_SSE = _mm_load_pd(gb_radius+j);
- vaj_SSE = _mm_load_pd(vsolv+j);
-
- rvdw_SSE0 = _mm_add_pd(rai_SSE0, raj_SSE);
- rvdw_SSE1 = _mm_add_pd(rai_SSE1, raj_SSE);
-
- ratio_SSE0 = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0, rvdw_SSE0)));
- ratio_SSE1 = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1, rvdw_SSE1)));
-
- ratio_SSE0 = _mm_min_pd(ratio_SSE0, still_p5inv_SSE);
- ratio_SSE1 = _mm_min_pd(ratio_SSE1, still_p5inv_SSE);
- theta_SSE0 = _mm_mul_pd(ratio_SSE0, still_pip5_SSE);
- theta_SSE1 = _mm_mul_pd(ratio_SSE1, still_pip5_SSE);
- gmx_mm_sincos_pd(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
- gmx_mm_sincos_pd(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
- term_SSE0 = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE0));
- term_SSE1 = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE1));
- ccf_SSE0 = _mm_mul_pd(term_SSE0, term_SSE0);
- ccf_SSE1 = _mm_mul_pd(term_SSE1, term_SSE1);
- dccf_SSE0 = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE0),
- _mm_mul_pd(sinq_SSE0, theta_SSE0));
- dccf_SSE1 = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE1),
- _mm_mul_pd(sinq_SSE1, theta_SSE1));
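-
- /* ccf is Still's close-contact function (0.5*(1-cos(theta)))^2, which
- * attenuates the 1/r^4 volume term for strongly overlapping pairs, and dccf
- * is the matching derivative factor saved below for the chain rule. */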
-
- prod_SSE = _mm_mul_pd(still_p4_SSE, vaj_SSE);
- icf4_SSE0 = _mm_mul_pd(ccf_SSE0, idr4_SSE0);
- icf4_SSE1 = _mm_mul_pd(ccf_SSE1, idr4_SSE1);
- icf6_SSE0 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
- icf6_SSE1 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);
-
- _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
- _mm_add_pd(_mm_mul_pd(prod_ai_SSE0, icf4_SSE0),
- _mm_mul_pd(prod_ai_SSE1, icf4_SSE1))));
-
-
- gpi_SSE0 = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE, icf4_SSE0));
- gpi_SSE1 = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE, icf4_SSE1));
-
- /* Save ai->aj and aj->ai chain rule terms */
- _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE0));
- dadx += 2;
- _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE1));
- dadx += 2;
-
- _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE0, icf6_SSE0));
- dadx += 2;
- _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE1, icf6_SSE1));
- dadx += 2;
- }
-
- /* Main part, no exclusions */
- for (j = nj1; j < nj2; j += UNROLLJ)
- {
-
- /* load j atom coordinates */
- jx_SSE = _mm_load_pd(x_align+j);
- jy_SSE = _mm_load_pd(y_align+j);
- jz_SSE = _mm_load_pd(z_align+j);
-
- /* Calculate distance */
- dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE);
- dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE);
- dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE);
- dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE);
- dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE);
- dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE);
-
- /* rsq = dx*dx+dy*dy+dz*dz */
- rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
- rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);
-
- /* Calculate 1/r and 1/r2 */
- rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
- rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);
-
- /* Apply mask */
- rinv_SSE0 = _mm_and_pd(rinv_SSE0, imask_SSE0);
- rinv_SSE1 = _mm_and_pd(rinv_SSE1, imask_SSE1);
-
- irsq_SSE0 = _mm_mul_pd(rinv_SSE0, rinv_SSE0);
- irsq_SSE1 = _mm_mul_pd(rinv_SSE1, rinv_SSE1);
- idr4_SSE0 = _mm_mul_pd(irsq_SSE0, irsq_SSE0);
- idr4_SSE1 = _mm_mul_pd(irsq_SSE1, irsq_SSE1);
- idr6_SSE0 = _mm_mul_pd(idr4_SSE0, irsq_SSE0);
- idr6_SSE1 = _mm_mul_pd(idr4_SSE1, irsq_SSE1);
-
- raj_SSE = _mm_load_pd(gb_radius+j);
-
- rvdw_SSE0 = _mm_add_pd(rai_SSE0, raj_SSE);
- rvdw_SSE1 = _mm_add_pd(rai_SSE1, raj_SSE);
- vaj_SSE = _mm_load_pd(vsolv+j);
-
- ratio_SSE0 = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0, rvdw_SSE0)));
- ratio_SSE1 = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1, rvdw_SSE1)));
-
- ratio_SSE0 = _mm_min_pd(ratio_SSE0, still_p5inv_SSE);
- ratio_SSE1 = _mm_min_pd(ratio_SSE1, still_p5inv_SSE);
- theta_SSE0 = _mm_mul_pd(ratio_SSE0, still_pip5_SSE);
- theta_SSE1 = _mm_mul_pd(ratio_SSE1, still_pip5_SSE);
- gmx_mm_sincos_pd(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
- gmx_mm_sincos_pd(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
- term_SSE0 = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE0));
- term_SSE1 = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE1));
- ccf_SSE0 = _mm_mul_pd(term_SSE0, term_SSE0);
- ccf_SSE1 = _mm_mul_pd(term_SSE1, term_SSE1);
- dccf_SSE0 = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE0),
- _mm_mul_pd(sinq_SSE0, theta_SSE0));
- dccf_SSE1 = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE1),
- _mm_mul_pd(sinq_SSE1, theta_SSE1));
-
- prod_SSE = _mm_mul_pd(still_p4_SSE, vaj_SSE );
- icf4_SSE0 = _mm_mul_pd(ccf_SSE0, idr4_SSE0);
- icf4_SSE1 = _mm_mul_pd(ccf_SSE1, idr4_SSE1);
- icf6_SSE0 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
- icf6_SSE1 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);
-
- _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
- _mm_add_pd(_mm_mul_pd(prod_ai_SSE0, icf4_SSE0),
- _mm_mul_pd(prod_ai_SSE1, icf4_SSE1))));
-
- gpi_SSE0 = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE, icf4_SSE0));
- gpi_SSE1 = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE, icf4_SSE1));
-
- /* Save ai->aj and aj->ai chain rule terms */
- _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE0));
- dadx += 2;
- _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE1));
- dadx += 2;
-
- _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE0, icf6_SSE0));
- dadx += 2;
- _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE1, icf6_SSE1));
- dadx += 2;
- }
- /* Epilogue part, including exclusion mask */
- for (j = nj2; j < nj3; j += UNROLLJ)
- {
- jmask_SSE0 = _mm_load_pd((double *)emask0);
- jmask_SSE1 = _mm_load_pd((double *)emask1);
- emask0 += 2*UNROLLJ;
- emask1 += 2*UNROLLJ;
-
- /* load j atom coordinates */
- jx_SSE = _mm_load_pd(x_align+j);
- jy_SSE = _mm_load_pd(y_align+j);
- jz_SSE = _mm_load_pd(z_align+j);
-
- /* Calculate distance */
- dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE);
- dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE);
- dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE);
- dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE);
- dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE);
- dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE);
-
- /* rsq = dx*dx+dy*dy+dz*dz */
- rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
- rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);
-
- /* Combine masks */
- jmask_SSE0 = _mm_and_pd(jmask_SSE0, imask_SSE0);
- jmask_SSE1 = _mm_and_pd(jmask_SSE1, imask_SSE1);
-
- /* Calculate 1/r and 1/r2 */
- rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
- rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);
-
- /* Apply mask */
- rinv_SSE0 = _mm_and_pd(rinv_SSE0, jmask_SSE0);
- rinv_SSE1 = _mm_and_pd(rinv_SSE1, jmask_SSE1);
-
- irsq_SSE0 = _mm_mul_pd(rinv_SSE0, rinv_SSE0);
- irsq_SSE1 = _mm_mul_pd(rinv_SSE1, rinv_SSE1);
- idr4_SSE0 = _mm_mul_pd(irsq_SSE0, irsq_SSE0);
- idr4_SSE1 = _mm_mul_pd(irsq_SSE1, irsq_SSE1);
- idr6_SSE0 = _mm_mul_pd(idr4_SSE0, irsq_SSE0);
- idr6_SSE1 = _mm_mul_pd(idr4_SSE1, irsq_SSE1);
-
- raj_SSE = _mm_load_pd(gb_radius+j);
- vaj_SSE = _mm_load_pd(vsolv+j);
-
- rvdw_SSE0 = _mm_add_pd(rai_SSE0, raj_SSE);
- rvdw_SSE1 = _mm_add_pd(rai_SSE1, raj_SSE);
-
- ratio_SSE0 = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0, rvdw_SSE0)));
- ratio_SSE1 = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1, rvdw_SSE1)));
-
- ratio_SSE0 = _mm_min_pd(ratio_SSE0, still_p5inv_SSE);
- ratio_SSE1 = _mm_min_pd(ratio_SSE1, still_p5inv_SSE);
- theta_SSE0 = _mm_mul_pd(ratio_SSE0, still_pip5_SSE);
- theta_SSE1 = _mm_mul_pd(ratio_SSE1, still_pip5_SSE);
- gmx_mm_sincos_pd(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
- gmx_mm_sincos_pd(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
- term_SSE0 = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE0));
- term_SSE1 = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE1));
- ccf_SSE0 = _mm_mul_pd(term_SSE0, term_SSE0);
- ccf_SSE1 = _mm_mul_pd(term_SSE1, term_SSE1);
- dccf_SSE0 = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE0),
- _mm_mul_pd(sinq_SSE0, theta_SSE0));
- dccf_SSE1 = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE1),
- _mm_mul_pd(sinq_SSE1, theta_SSE1));
-
- prod_SSE = _mm_mul_pd(still_p4_SSE, vaj_SSE);
- icf4_SSE0 = _mm_mul_pd(ccf_SSE0, idr4_SSE0);
- icf4_SSE1 = _mm_mul_pd(ccf_SSE1, idr4_SSE1);
- icf6_SSE0 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
- icf6_SSE1 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);
-
- _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
- _mm_add_pd(_mm_mul_pd(prod_ai_SSE0, icf4_SSE0),
- _mm_mul_pd(prod_ai_SSE1, icf4_SSE1))));
-
- gpi_SSE0 = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE, icf4_SSE0));
- gpi_SSE1 = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE, icf4_SSE1));
-
- /* Save ai->aj and aj->ai chain rule terms */
- _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE0));
- dadx += 2;
- _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE1));
- dadx += 2;
-
- _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE0, icf6_SSE0));
- dadx += 2;
- _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE1, icf6_SSE1));
- dadx += 2;
- }
- GMX_MM_TRANSPOSE2_PD(gpi_SSE0, gpi_SSE1);
- gpi_SSE0 = _mm_add_pd(gpi_SSE0, gpi_SSE1);
- _mm_store_pd(work+i, _mm_add_pd(gpi_SSE0, _mm_load_pd(work+i)));
- }
-
- /* In case we have written anything beyond natoms, fold it back.
- * Never mind that we leave stale data above natoms; it will not
- * be accessed later in the routine.
- * In principle this should be a move rather than a sum, but this
- * way we don't have to worry about even/odd offsets...
- */
- for (i = natoms; i < ni1+1+natoms/2; i++)
- {
- work[i-natoms] += work[i];
- }
-
- /* Parallel summations would go here if ever implemented with DD */
-
- factor = 0.5 * ONE_4PI_EPS0;
- /* Calculate the radii - should we do all atoms, or just our local ones? */
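- /* With factor = 0.5*ONE_4PI_EPS0 the Born radius is factor/|gpi|;
- * gmx_invsqrt(gpi*gpi) is simply a fast evaluation of 1/|gpi|. */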
- for (i = 0; i < natoms; i++)
- {
- if (born->use[i] != 0)
- {
- gpi = born->gpol[i]+work[i];
- gpi2 = gpi * gpi;
- born->bRad[i] = factor*gmx_invsqrt(gpi2);
- fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
- }
- }
-
- return 0;
-}
-/* Reinstate MSVC optimization */
-#ifdef _MSC_VER
-#pragma optimize("",on)
-#endif
-
-
-int
-genborn_allvsall_calc_hct_obc_radii_sse2_double(t_forcerec * fr,
- t_mdatoms * mdatoms,
- gmx_genborn_t * born,
- int gb_algorithm,
- gmx_localtop_t * top,
- double * x,
- t_commrec * cr,
- void * paadata)
-{
- gmx_allvsallgb2_data_t *aadata;
- int natoms;
- int ni0, ni1;
- int nj0, nj1, nj2, nj3;
- int i, j, k, n;
- int * mask;
- int * pmask0;
- int * pmask1;
- int * emask0;
- int * emask1;
- double * gb_radius;
- double * vsolv;
- double * work;
- double tmpsum[2];
- double * x_align;
- double * y_align;
- double * z_align;
- int * jindex;
- double * dadx;
- double * obc_param;
- double rad, min_rad;
- double rai, rai_inv, rai_inv2, sum_ai, sum_ai2, sum_ai3, tsum, tchain;
-
- __m128d ix_SSE0, iy_SSE0, iz_SSE0;
- __m128d ix_SSE1, iy_SSE1, iz_SSE1;
- __m128d gpi_SSE0, rai_SSE0, prod_ai_SSE0;
- __m128d gpi_SSE1, rai_SSE1, prod_ai_SSE1;
- __m128d imask_SSE0, jmask_SSE0;
- __m128d imask_SSE1, jmask_SSE1;
- __m128d jx_SSE, jy_SSE, jz_SSE;
- __m128d dx_SSE0, dy_SSE0, dz_SSE0;
- __m128d dx_SSE1, dy_SSE1, dz_SSE1;
- __m128d rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0;
- __m128d rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1;
- __m128d raj_SSE, raj_inv_SSE, sk_aj_SSE, sk2_aj_SSE;
- __m128d ccf_SSE0, dccf_SSE0, prod_SSE0;
- __m128d ccf_SSE1, dccf_SSE1, prod_SSE1;
- __m128d icf4_SSE0, icf6_SSE0;
- __m128d icf4_SSE1, icf6_SSE1;
- __m128d oneeighth_SSE, onefourth_SSE, half_SSE, one_SSE, two_SSE, four_SSE;
- __m128d still_p4_SSE, still_p5inv_SSE, still_pip5_SSE;
- __m128d rai_inv_SSE0;
- __m128d rai_inv_SSE1;
- __m128d sk_ai_SSE0, sk2_ai_SSE0, sum_ai_SSE0;
- __m128d sk_ai_SSE1, sk2_ai_SSE1, sum_ai_SSE1;
- __m128d lij_inv_SSE0, sk2_rinv_SSE0;
- __m128d lij_inv_SSE1, sk2_rinv_SSE1;
- __m128d dr_SSE0;
- __m128d dr_SSE1;
- __m128d t1_SSE0, t2_SSE0, t3_SSE0, t4_SSE0;
- __m128d t1_SSE1, t2_SSE1, t3_SSE1, t4_SSE1;
- __m128d obc_mask1_SSE0, obc_mask2_SSE0, obc_mask3_SSE0;
- __m128d obc_mask1_SSE1, obc_mask2_SSE1, obc_mask3_SSE1;
- __m128d uij_SSE0, uij2_SSE0, uij3_SSE0;
- __m128d uij_SSE1, uij2_SSE1, uij3_SSE1;
- __m128d lij_SSE0, lij2_SSE0, lij3_SSE0;
- __m128d lij_SSE1, lij2_SSE1, lij3_SSE1;
- __m128d dlij_SSE0, diff2_SSE0, logterm_SSE0;
- __m128d dlij_SSE1, diff2_SSE1, logterm_SSE1;
- __m128d doffset_SSE, tmpSSE;
-
- natoms = mdatoms->nr;
- ni0 = 0;
- ni1 = mdatoms->homenr;
-
- n = 0;
-
- aadata = *((gmx_allvsallgb2_data_t **)paadata);
-
-
- if (aadata == NULL)
- {
- genborn_allvsall_setup(&aadata, top, born, mdatoms, born->gb_doffset,
- egbOBC, TRUE, TRUE, TRUE);
- *((gmx_allvsallgb2_data_t **)paadata) = aadata;
- }
-
- x_align = aadata->x_align;
- y_align = aadata->y_align;
- z_align = aadata->z_align;
-
- gb_radius = aadata->gb_radius;
- work = aadata->work;
- jindex = aadata->jindex_gb;
- dadx = fr->dadx;
- obc_param = aadata->workparam;
-
- oneeighth_SSE = _mm_set1_pd(0.125);
- onefourth_SSE = _mm_set1_pd(0.25);
- half_SSE = _mm_set1_pd(0.5);
- one_SSE = _mm_set1_pd(1.0);
- two_SSE = _mm_set1_pd(2.0);
- four_SSE = _mm_set1_pd(4.0);
- doffset_SSE = _mm_set1_pd(born->gb_doffset);
-
- for (i = 0; i < natoms; i++)
- {
- x_align[i] = x[3*i];
- y_align[i] = x[3*i+1];
- z_align[i] = x[3*i+2];
- }
-
- /* Copy the first half again, to allow wraparound indexing */
- for (i = 0; i < natoms/2+1; i++)
- {
- x_align[natoms+i] = x_align[i];
- y_align[natoms+i] = y_align[i];
- z_align[natoms+i] = z_align[i];
- }
-
- for (i = 0; i < natoms+natoms/2+1; i++)
- {
- work[i] = 0;
- }
-
- for (i = ni0; i < ni1; i += UNROLLI)
- {
- /* We assume shifts are NOT used for all-vs-all interactions */
-
- /* Load i atom data */
- ix_SSE0 = _mm_load1_pd(x_align+i);
- iy_SSE0 = _mm_load1_pd(y_align+i);
- iz_SSE0 = _mm_load1_pd(z_align+i);
- ix_SSE1 = _mm_load1_pd(x_align+i+1);
- iy_SSE1 = _mm_load1_pd(y_align+i+1);
- iz_SSE1 = _mm_load1_pd(z_align+i+1);
-
- rai_SSE0 = _mm_load1_pd(gb_radius+i);
- rai_SSE1 = _mm_load1_pd(gb_radius+i+1);
- rai_inv_SSE0 = gmx_mm_inv_pd(rai_SSE0);
- rai_inv_SSE1 = gmx_mm_inv_pd(rai_SSE1);
-
- sk_ai_SSE0 = _mm_load1_pd(obc_param+i);
- sk_ai_SSE1 = _mm_load1_pd(obc_param+i+1);
- sk2_ai_SSE0 = _mm_mul_pd(sk_ai_SSE0, sk_ai_SSE0);
- sk2_ai_SSE1 = _mm_mul_pd(sk_ai_SSE1, sk_ai_SSE1);
-
- sum_ai_SSE0 = _mm_setzero_pd();
- sum_ai_SSE1 = _mm_setzero_pd();
-
- /* Load limits for loop over neighbors */
- nj0 = jindex[4*i];
- nj1 = jindex[4*i+1];
- nj2 = jindex[4*i+2];
- nj3 = jindex[4*i+3];
-
- pmask0 = aadata->prologue_mask_gb[i];
- pmask1 = aadata->prologue_mask_gb[i+1];
- emask0 = aadata->epilogue_mask[i];
- emask1 = aadata->epilogue_mask[i+1];
-
- imask_SSE0 = _mm_load1_pd((double *)(aadata->imask+2*i));
- imask_SSE1 = _mm_load1_pd((double *)(aadata->imask+2*i+2));
-
- /* Prologue part, including exclusion mask */
- for (j = nj0; j < nj1; j += UNROLLJ)
- {
- jmask_SSE0 = _mm_load_pd((double *)pmask0);
- jmask_SSE1 = _mm_load_pd((double *)pmask1);
- pmask0 += 2*UNROLLJ;
- pmask1 += 2*UNROLLJ;
-
- /* load j atom coordinates */
- jx_SSE = _mm_load_pd(x_align+j);
- jy_SSE = _mm_load_pd(y_align+j);
- jz_SSE = _mm_load_pd(z_align+j);
-
- /* Calculate distance */
- dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE);
- dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE);
- dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE);
- dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE);
- dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE);
- dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE);
-
- /* rsq = dx*dx+dy*dy+dz*dz */
- rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
- rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);
-
- /* Combine masks */
- jmask_SSE0 = _mm_and_pd(jmask_SSE0, imask_SSE0);
- jmask_SSE1 = _mm_and_pd(jmask_SSE1, imask_SSE1);
-
- /* Calculate 1/r and 1/r2 */
- rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
- rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);
-
- /* Apply mask */
- rinv_SSE0 = _mm_and_pd(rinv_SSE0, jmask_SSE0);
- rinv_SSE1 = _mm_and_pd(rinv_SSE1, jmask_SSE1);
-
- dr_SSE0 = _mm_mul_pd(rsq_SSE0, rinv_SSE0);
- dr_SSE1 = _mm_mul_pd(rsq_SSE1, rinv_SSE1);
-
- sk_aj_SSE = _mm_load_pd(obc_param+j);
- raj_SSE = _mm_load_pd(gb_radius+j);
- raj_inv_SSE = gmx_mm_inv_pd(raj_SSE);
-
- /* Evaluate influence of atom aj -> ai */
- t1_SSE0 = _mm_add_pd(dr_SSE0, sk_aj_SSE);
- t1_SSE1 = _mm_add_pd(dr_SSE1, sk_aj_SSE);
- t2_SSE0 = _mm_sub_pd(dr_SSE0, sk_aj_SSE);
- t2_SSE1 = _mm_sub_pd(dr_SSE1, sk_aj_SSE);
- t3_SSE0 = _mm_sub_pd(sk_aj_SSE, dr_SSE0);
- t3_SSE1 = _mm_sub_pd(sk_aj_SSE, dr_SSE1);
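-
- /* The comparisons below select the three HCT descreening cases: rai < dr+sk
- * (atom j descreens atom i at all), rai < dr-sk (atom i entirely outside the
- * descreening sphere of j, so lij = 1/(dr-sk) rather than 1/rai), and
- * rai < sk-dr (atom i engulfed by it, triggering the t4 correction); lij and
- * uij are the lower and upper limits of the pairwise integral. */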
-
- obc_mask1_SSE0 = _mm_cmplt_pd(rai_SSE0, t1_SSE0);
- obc_mask1_SSE1 = _mm_cmplt_pd(rai_SSE1, t1_SSE1);
- obc_mask2_SSE0 = _mm_cmplt_pd(rai_SSE0, t2_SSE0);
- obc_mask2_SSE1 = _mm_cmplt_pd(rai_SSE1, t2_SSE1);
- obc_mask3_SSE0 = _mm_cmplt_pd(rai_SSE0, t3_SSE0);
- obc_mask3_SSE1 = _mm_cmplt_pd(rai_SSE1, t3_SSE1);
- obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0, jmask_SSE0);
- obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1, jmask_SSE1);
-
- uij_SSE0 = gmx_mm_inv_pd(t1_SSE0);
- uij_SSE1 = gmx_mm_inv_pd(t1_SSE1);
- lij_SSE0 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
- _mm_andnot_pd(obc_mask2_SSE0, rai_inv_SSE0));
- lij_SSE1 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
- _mm_andnot_pd(obc_mask2_SSE1, rai_inv_SSE1));
- dlij_SSE0 = _mm_and_pd(one_SSE, obc_mask2_SSE0);
- dlij_SSE1 = _mm_and_pd(one_SSE, obc_mask2_SSE1);
-
- uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
- uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
- uij3_SSE0 = _mm_mul_pd(uij2_SSE0, uij_SSE0);
- uij3_SSE1 = _mm_mul_pd(uij2_SSE1, uij_SSE1);
- lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
- lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
- lij3_SSE0 = _mm_mul_pd(lij2_SSE0, lij_SSE0);
- lij3_SSE1 = _mm_mul_pd(lij2_SSE1, lij_SSE1);
-
- diff2_SSE0 = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
- diff2_SSE1 = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
- lij_inv_SSE0 = gmx_mm_invsqrt_pd(lij2_SSE0);
- lij_inv_SSE1 = gmx_mm_invsqrt_pd(lij2_SSE1);
- sk2_aj_SSE = _mm_mul_pd(sk_aj_SSE, sk_aj_SSE);
- sk2_rinv_SSE0 = _mm_mul_pd(sk2_aj_SSE, rinv_SSE0);
- sk2_rinv_SSE1 = _mm_mul_pd(sk2_aj_SSE, rinv_SSE1);
- prod_SSE0 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
- prod_SSE1 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);
-
- logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
- logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
-
- t1_SSE0 = _mm_sub_pd(lij_SSE0, uij_SSE0);
- t1_SSE1 = _mm_sub_pd(lij_SSE1, uij_SSE1);
- t2_SSE0 = _mm_mul_pd(diff2_SSE0,
- _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
- prod_SSE0));
- t2_SSE1 = _mm_mul_pd(diff2_SSE1,
- _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
- prod_SSE1));
-
- t3_SSE0 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
- t3_SSE1 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
- t1_SSE0 = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
- t1_SSE1 = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
- t4_SSE0 = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE0, lij_SSE0));
- t4_SSE1 = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE1, lij_SSE1));
- t4_SSE0 = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
- t4_SSE1 = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
- t1_SSE0 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
- t1_SSE1 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));
-
- sum_ai_SSE0 = _mm_add_pd(sum_ai_SSE0, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
- sum_ai_SSE1 = _mm_add_pd(sum_ai_SSE1, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
-
- t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
- _mm_mul_pd(prod_SSE0, lij3_SSE0));
- t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
- _mm_mul_pd(prod_SSE1, lij3_SSE1));
- t1_SSE0 = _mm_sub_pd(t1_SSE0,
- _mm_mul_pd(onefourth_SSE,
- _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
- _mm_mul_pd(lij3_SSE0, dr_SSE0))));
- t1_SSE1 = _mm_sub_pd(t1_SSE1,
- _mm_mul_pd(onefourth_SSE,
- _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
- _mm_mul_pd(lij3_SSE1, dr_SSE1))));
-
- t2_SSE0 = _mm_mul_pd(onefourth_SSE,
- _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
- _mm_mul_pd(uij3_SSE0, dr_SSE0)));
- t2_SSE1 = _mm_mul_pd(onefourth_SSE,
- _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
- _mm_mul_pd(uij3_SSE1, dr_SSE1)));
- t2_SSE0 = _mm_sub_pd(t2_SSE0,
- _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
- _mm_mul_pd(prod_SSE0, uij3_SSE0)));
- t2_SSE1 = _mm_sub_pd(t2_SSE1,
- _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
- _mm_mul_pd(prod_SSE1, uij3_SSE1)));
- t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
- _mm_mul_pd(rinv_SSE0, rinv_SSE0));
- t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
- _mm_mul_pd(rinv_SSE1, rinv_SSE1));
- t3_SSE0 = _mm_sub_pd(t3_SSE0,
- _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
- _mm_add_pd(one_SSE,
- _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
- t3_SSE1 = _mm_sub_pd(t3_SSE1,
- _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
- _mm_add_pd(one_SSE,
- _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));
-
- t1_SSE0 = _mm_mul_pd(rinv_SSE0,
- _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
- _mm_add_pd(t2_SSE0, t3_SSE0)));
- t1_SSE1 = _mm_mul_pd(rinv_SSE1,
- _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
- _mm_add_pd(t2_SSE1, t3_SSE1)));
-
- _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
- dadx += 2;
- _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
- dadx += 2;
-
- /* Evaluate influence of atom ai -> aj */
- t1_SSE0 = _mm_add_pd(dr_SSE0, sk_ai_SSE0);
- t1_SSE1 = _mm_add_pd(dr_SSE1, sk_ai_SSE1);
- t2_SSE0 = _mm_sub_pd(dr_SSE0, sk_ai_SSE0);
- t2_SSE1 = _mm_sub_pd(dr_SSE1, sk_ai_SSE1);
- t3_SSE0 = _mm_sub_pd(sk_ai_SSE0, dr_SSE0);
- t3_SSE1 = _mm_sub_pd(sk_ai_SSE1, dr_SSE1);
-
- obc_mask1_SSE0 = _mm_cmplt_pd(raj_SSE, t1_SSE0);
- obc_mask1_SSE1 = _mm_cmplt_pd(raj_SSE, t1_SSE1);
- obc_mask2_SSE0 = _mm_cmplt_pd(raj_SSE, t2_SSE0);
- obc_mask2_SSE1 = _mm_cmplt_pd(raj_SSE, t2_SSE1);
- obc_mask3_SSE0 = _mm_cmplt_pd(raj_SSE, t3_SSE0);
- obc_mask3_SSE1 = _mm_cmplt_pd(raj_SSE, t3_SSE1);
- obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0, jmask_SSE0);
- obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1, jmask_SSE1);
-
- uij_SSE0 = gmx_mm_inv_pd(t1_SSE0);
- uij_SSE1 = gmx_mm_inv_pd(t1_SSE1);
- lij_SSE0 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
- _mm_andnot_pd(obc_mask2_SSE0, raj_inv_SSE));
- lij_SSE1 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
- _mm_andnot_pd(obc_mask2_SSE1, raj_inv_SSE));
- dlij_SSE0 = _mm_and_pd(one_SSE, obc_mask2_SSE0);
- dlij_SSE1 = _mm_and_pd(one_SSE, obc_mask2_SSE1);
-
- uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
- uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
- uij3_SSE0 = _mm_mul_pd(uij2_SSE0, uij_SSE0);
- uij3_SSE1 = _mm_mul_pd(uij2_SSE1, uij_SSE1);
- lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
- lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
- lij3_SSE0 = _mm_mul_pd(lij2_SSE0, lij_SSE0);
- lij3_SSE1 = _mm_mul_pd(lij2_SSE1, lij_SSE1);
-
- diff2_SSE0 = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
- diff2_SSE1 = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
- lij_inv_SSE0 = gmx_mm_invsqrt_pd(lij2_SSE0);
- lij_inv_SSE1 = gmx_mm_invsqrt_pd(lij2_SSE1);
- sk2_rinv_SSE0 = _mm_mul_pd(sk2_ai_SSE0, rinv_SSE0);
- sk2_rinv_SSE1 = _mm_mul_pd(sk2_ai_SSE1, rinv_SSE1);
- prod_SSE0 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
- prod_SSE1 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);
-
- logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
- logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
- t1_SSE0 = _mm_sub_pd(lij_SSE0, uij_SSE0);
- t1_SSE1 = _mm_sub_pd(lij_SSE1, uij_SSE1);
- t2_SSE0 = _mm_mul_pd(diff2_SSE0,
- _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
- prod_SSE0));
- t2_SSE1 = _mm_mul_pd(diff2_SSE1,
- _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
- prod_SSE1));
- t3_SSE0 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
- t3_SSE1 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
- t1_SSE0 = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
- t1_SSE1 = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
- t4_SSE0 = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE0));
- t4_SSE1 = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE1));
- t4_SSE0 = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
- t4_SSE1 = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
- t1_SSE0 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
- t1_SSE1 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));
-
- _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
- _mm_add_pd(_mm_and_pd(t1_SSE0, obc_mask1_SSE0),
- _mm_and_pd(t1_SSE1, obc_mask1_SSE1))));
-
- t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
- _mm_mul_pd(prod_SSE0, lij3_SSE0));
- t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
- _mm_mul_pd(prod_SSE1, lij3_SSE1));
- t1_SSE0 = _mm_sub_pd(t1_SSE0,
- _mm_mul_pd(onefourth_SSE,
- _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
- _mm_mul_pd(lij3_SSE0, dr_SSE0))));
- t1_SSE1 = _mm_sub_pd(t1_SSE1,
- _mm_mul_pd(onefourth_SSE,
- _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
- _mm_mul_pd(lij3_SSE1, dr_SSE1))));
- t2_SSE0 = _mm_mul_pd(onefourth_SSE,
- _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
- _mm_mul_pd(uij3_SSE0, dr_SSE0)));
- t2_SSE1 = _mm_mul_pd(onefourth_SSE,
- _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
- _mm_mul_pd(uij3_SSE1, dr_SSE1)));
- t2_SSE0 = _mm_sub_pd(t2_SSE0,
- _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
- _mm_mul_pd(prod_SSE0, uij3_SSE0)));
- t2_SSE1 = _mm_sub_pd(t2_SSE1,
- _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
- _mm_mul_pd(prod_SSE1, uij3_SSE1)));
-
- t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
- _mm_mul_pd(rinv_SSE0, rinv_SSE0));
- t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
- _mm_mul_pd(rinv_SSE1, rinv_SSE1));
-
- t3_SSE0 = _mm_sub_pd(t3_SSE0,
- _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
- _mm_add_pd(one_SSE,
- _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
- t3_SSE1 = _mm_sub_pd(t3_SSE1,
- _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
- _mm_add_pd(one_SSE,
- _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));
-
-
- t1_SSE0 = _mm_mul_pd(rinv_SSE0,
- _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
- _mm_add_pd(t2_SSE0, t3_SSE0)));
- t1_SSE1 = _mm_mul_pd(rinv_SSE1,
- _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
- _mm_add_pd(t2_SSE1, t3_SSE1)));
-
- _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
- dadx += 2;
- _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
- dadx += 2;
- }
-
- /* Main part, no exclusions */
- for (j = nj1; j < nj2; j += UNROLLJ)
- {
- /* load j atom coordinates */
- jx_SSE = _mm_load_pd(x_align+j);
- jy_SSE = _mm_load_pd(y_align+j);
- jz_SSE = _mm_load_pd(z_align+j);
-
- /* Calculate distance */
- dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE);
- dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE);
- dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE);
- dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE);
- dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE);
- dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE);
-
- /* rsq = dx*dx+dy*dy+dz*dz */
- rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
- rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);
-
- /* Calculate 1/r and 1/r2 */
- rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
- rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);
-
- /* Apply mask */
- rinv_SSE0 = _mm_and_pd(rinv_SSE0, imask_SSE0);
- rinv_SSE1 = _mm_and_pd(rinv_SSE1, imask_SSE1);
-
- dr_SSE0 = _mm_mul_pd(rsq_SSE0, rinv_SSE0);
- dr_SSE1 = _mm_mul_pd(rsq_SSE1, rinv_SSE1);
-
- sk_aj_SSE = _mm_load_pd(obc_param+j);
- raj_SSE = _mm_load_pd(gb_radius+j);
-
- raj_inv_SSE = gmx_mm_inv_pd(raj_SSE);
-
- /* Evaluate influence of atom aj -> ai */
- t1_SSE0 = _mm_add_pd(dr_SSE0, sk_aj_SSE);
- t1_SSE1 = _mm_add_pd(dr_SSE1, sk_aj_SSE);
- t2_SSE0 = _mm_sub_pd(dr_SSE0, sk_aj_SSE);
- t2_SSE1 = _mm_sub_pd(dr_SSE1, sk_aj_SSE);
- t3_SSE0 = _mm_sub_pd(sk_aj_SSE, dr_SSE0);
- t3_SSE1 = _mm_sub_pd(sk_aj_SSE, dr_SSE1);
-
- obc_mask1_SSE0 = _mm_cmplt_pd(rai_SSE0, t1_SSE0);
- obc_mask1_SSE1 = _mm_cmplt_pd(rai_SSE1, t1_SSE1);
- obc_mask2_SSE0 = _mm_cmplt_pd(rai_SSE0, t2_SSE0);
- obc_mask2_SSE1 = _mm_cmplt_pd(rai_SSE1, t2_SSE1);
- obc_mask3_SSE0 = _mm_cmplt_pd(rai_SSE0, t3_SSE0);
- obc_mask3_SSE1 = _mm_cmplt_pd(rai_SSE1, t3_SSE1);
- obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0, imask_SSE0);
- obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1, imask_SSE1);
-
- uij_SSE0 = gmx_mm_inv_pd(t1_SSE0);
- uij_SSE1 = gmx_mm_inv_pd(t1_SSE1);
- lij_SSE0 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
- _mm_andnot_pd(obc_mask2_SSE0, rai_inv_SSE0));
- lij_SSE1 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
- _mm_andnot_pd(obc_mask2_SSE1, rai_inv_SSE1));
- dlij_SSE0 = _mm_and_pd(one_SSE, obc_mask2_SSE0);
- dlij_SSE1 = _mm_and_pd(one_SSE, obc_mask2_SSE1);
-
- uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
- uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
- uij3_SSE0 = _mm_mul_pd(uij2_SSE0, uij_SSE0);
- uij3_SSE1 = _mm_mul_pd(uij2_SSE1, uij_SSE1);
- lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
- lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
- lij3_SSE0 = _mm_mul_pd(lij2_SSE0, lij_SSE0);
- lij3_SSE1 = _mm_mul_pd(lij2_SSE1, lij_SSE1);
-
- diff2_SSE0 = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
- diff2_SSE1 = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
- lij_inv_SSE0 = gmx_mm_invsqrt_pd(lij2_SSE0);
- lij_inv_SSE1 = gmx_mm_invsqrt_pd(lij2_SSE1);
- sk2_aj_SSE = _mm_mul_pd(sk_aj_SSE, sk_aj_SSE);
- sk2_rinv_SSE0 = _mm_mul_pd(sk2_aj_SSE, rinv_SSE0);
- sk2_rinv_SSE1 = _mm_mul_pd(sk2_aj_SSE, rinv_SSE1);
- prod_SSE0 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
- prod_SSE1 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);
-
- logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
- logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
-
- t1_SSE0 = _mm_sub_pd(lij_SSE0, uij_SSE0);
- t1_SSE1 = _mm_sub_pd(lij_SSE1, uij_SSE1);
- t2_SSE0 = _mm_mul_pd(diff2_SSE0,
- _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
- prod_SSE0));
- t2_SSE1 = _mm_mul_pd(diff2_SSE1,
- _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
- prod_SSE1));
-
- t3_SSE0 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
- t3_SSE1 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
- t1_SSE0 = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
- t1_SSE1 = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
- t4_SSE0 = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE0, lij_SSE0));
- t4_SSE1 = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE1, lij_SSE1));
- t4_SSE0 = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
- t4_SSE1 = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
- t1_SSE0 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
- t1_SSE1 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));
-
- sum_ai_SSE0 = _mm_add_pd(sum_ai_SSE0, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
- sum_ai_SSE1 = _mm_add_pd(sum_ai_SSE1, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
-
- t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
- _mm_mul_pd(prod_SSE0, lij3_SSE0));
- t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
- _mm_mul_pd(prod_SSE1, lij3_SSE1));
-
- t1_SSE0 = _mm_sub_pd(t1_SSE0,
- _mm_mul_pd(onefourth_SSE,
- _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
- _mm_mul_pd(lij3_SSE0, dr_SSE0))));
- t1_SSE1 = _mm_sub_pd(t1_SSE1,
- _mm_mul_pd(onefourth_SSE,
- _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
- _mm_mul_pd(lij3_SSE1, dr_SSE1))));
-
- t2_SSE0 = _mm_mul_pd(onefourth_SSE,
- _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
- _mm_mul_pd(uij3_SSE0, dr_SSE0)));
- t2_SSE1 = _mm_mul_pd(onefourth_SSE,
- _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
- _mm_mul_pd(uij3_SSE1, dr_SSE1)));
- t2_SSE0 = _mm_sub_pd(t2_SSE0,
- _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
- _mm_mul_pd(prod_SSE0, uij3_SSE0)));
- t2_SSE1 = _mm_sub_pd(t2_SSE1,
- _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
- _mm_mul_pd(prod_SSE1, uij3_SSE1)));
- t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
- _mm_mul_pd(rinv_SSE0, rinv_SSE0));
- t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
- _mm_mul_pd(rinv_SSE1, rinv_SSE1));
- t3_SSE0 = _mm_sub_pd(t3_SSE0,
- _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
- _mm_add_pd(one_SSE,
- _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
- t3_SSE1 = _mm_sub_pd(t3_SSE1,
- _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
- _mm_add_pd(one_SSE,
- _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));
-
- t1_SSE0 = _mm_mul_pd(rinv_SSE0,
- _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
- _mm_add_pd(t2_SSE0, t3_SSE0)));
- t1_SSE1 = _mm_mul_pd(rinv_SSE1,
- _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
- _mm_add_pd(t2_SSE1, t3_SSE1)));
-
- _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
- dadx += 2;
- _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
- dadx += 2;
-
- /* Evaluate influence of atom ai -> aj */
- t1_SSE0 = _mm_add_pd(dr_SSE0, sk_ai_SSE0);
- t1_SSE1 = _mm_add_pd(dr_SSE1, sk_ai_SSE1);
- t2_SSE0 = _mm_sub_pd(dr_SSE0, sk_ai_SSE0);
- t2_SSE1 = _mm_sub_pd(dr_SSE1, sk_ai_SSE1);
- t3_SSE0 = _mm_sub_pd(sk_ai_SSE0, dr_SSE0);
- t3_SSE1 = _mm_sub_pd(sk_ai_SSE1, dr_SSE1);
-
- obc_mask1_SSE0 = _mm_cmplt_pd(raj_SSE, t1_SSE0);
- obc_mask1_SSE1 = _mm_cmplt_pd(raj_SSE, t1_SSE1);
- obc_mask2_SSE0 = _mm_cmplt_pd(raj_SSE, t2_SSE0);
- obc_mask2_SSE1 = _mm_cmplt_pd(raj_SSE, t2_SSE1);
- obc_mask3_SSE0 = _mm_cmplt_pd(raj_SSE, t3_SSE0);
- obc_mask3_SSE1 = _mm_cmplt_pd(raj_SSE, t3_SSE1);
- obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0, imask_SSE0);
- obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1, imask_SSE1);
-
- uij_SSE0 = gmx_mm_inv_pd(t1_SSE0);
- uij_SSE1 = gmx_mm_inv_pd(t1_SSE1);
- lij_SSE0 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
- _mm_andnot_pd(obc_mask2_SSE0, raj_inv_SSE));
- lij_SSE1 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
- _mm_andnot_pd(obc_mask2_SSE1, raj_inv_SSE));
- dlij_SSE0 = _mm_and_pd(one_SSE, obc_mask2_SSE0);
- dlij_SSE1 = _mm_and_pd(one_SSE, obc_mask2_SSE1);
-
- uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
- uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
- uij3_SSE0 = _mm_mul_pd(uij2_SSE0, uij_SSE0);
- uij3_SSE1 = _mm_mul_pd(uij2_SSE1, uij_SSE1);
- lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
- lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
- lij3_SSE0 = _mm_mul_pd(lij2_SSE0, lij_SSE0);
- lij3_SSE1 = _mm_mul_pd(lij2_SSE1, lij_SSE1);
-
- diff2_SSE0 = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
- diff2_SSE1 = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
- lij_inv_SSE0 = gmx_mm_invsqrt_pd(lij2_SSE0);
- lij_inv_SSE1 = gmx_mm_invsqrt_pd(lij2_SSE1);
- sk2_rinv_SSE0 = _mm_mul_pd(sk2_ai_SSE0, rinv_SSE0);
- sk2_rinv_SSE1 = _mm_mul_pd(sk2_ai_SSE1, rinv_SSE1);
- prod_SSE0 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
- prod_SSE1 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);
-
- logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
- logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
- t1_SSE0 = _mm_sub_pd(lij_SSE0, uij_SSE0);
- t1_SSE1 = _mm_sub_pd(lij_SSE1, uij_SSE1);
- t2_SSE0 = _mm_mul_pd(diff2_SSE0,
- _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
- prod_SSE0));
- t2_SSE1 = _mm_mul_pd(diff2_SSE1,
- _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
- prod_SSE1));
- t3_SSE0 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
- t3_SSE1 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
- t1_SSE0 = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
- t1_SSE1 = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
- t4_SSE0 = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE0));
- t4_SSE1 = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE1));
- t4_SSE0 = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
- t4_SSE1 = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
- t1_SSE0 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
- t1_SSE1 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));
-
- _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
- _mm_add_pd(_mm_and_pd(t1_SSE0, obc_mask1_SSE0),
- _mm_and_pd(t1_SSE1, obc_mask1_SSE1))));
-
- t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
- _mm_mul_pd(prod_SSE0, lij3_SSE0));
- t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
- _mm_mul_pd(prod_SSE1, lij3_SSE1));
- t1_SSE0 = _mm_sub_pd(t1_SSE0,
- _mm_mul_pd(onefourth_SSE,
- _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
- _mm_mul_pd(lij3_SSE0, dr_SSE0))));
- t1_SSE1 = _mm_sub_pd(t1_SSE1,
- _mm_mul_pd(onefourth_SSE,
- _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
- _mm_mul_pd(lij3_SSE1, dr_SSE1))));
- t2_SSE0 = _mm_mul_pd(onefourth_SSE,
- _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
- _mm_mul_pd(uij3_SSE0, dr_SSE0)));
- t2_SSE1 = _mm_mul_pd(onefourth_SSE,
- _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
- _mm_mul_pd(uij3_SSE1, dr_SSE1)));
- t2_SSE0 = _mm_sub_pd(t2_SSE0,
- _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
- _mm_mul_pd(prod_SSE0, uij3_SSE0)));
- t2_SSE1 = _mm_sub_pd(t2_SSE1,
- _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
- _mm_mul_pd(prod_SSE1, uij3_SSE1)));
-
- t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
- _mm_mul_pd(rinv_SSE0, rinv_SSE0));
- t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
- _mm_mul_pd(rinv_SSE1, rinv_SSE1));
-
- t3_SSE0 = _mm_sub_pd(t3_SSE0,
- _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
- _mm_add_pd(one_SSE,
- _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
- t3_SSE1 = _mm_sub_pd(t3_SSE1,
- _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
- _mm_add_pd(one_SSE,
- _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));
-
- t1_SSE0 = _mm_mul_pd(rinv_SSE0,
- _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
- _mm_add_pd(t2_SSE0, t3_SSE0)));
- t1_SSE1 = _mm_mul_pd(rinv_SSE1,
- _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
- _mm_add_pd(t2_SSE1, t3_SSE1)));
-
- _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
- dadx += 2;
- _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
- dadx += 2;
- }
-
- /* Epilogue part, including exclusion mask */
- for (j = nj2; j < nj3; j += UNROLLJ)
- {
- jmask_SSE0 = _mm_load_pd((double *)emask0);
- jmask_SSE1 = _mm_load_pd((double *)emask1);
- emask0 += 2*UNROLLJ;
- emask1 += 2*UNROLLJ;
-
- /* load j atom coordinates */
- jx_SSE = _mm_load_pd(x_align+j);
- jy_SSE = _mm_load_pd(y_align+j);
- jz_SSE = _mm_load_pd(z_align+j);
-
- /* Calculate distance */
- dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE);
- dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE);
- dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE);
- dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE);
- dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE);
- dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE);
-
- /* rsq = dx*dx+dy*dy+dz*dz */
- rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
- rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);
-
- /* Combine masks */
- jmask_SSE0 = _mm_and_pd(jmask_SSE0, imask_SSE0);
- jmask_SSE1 = _mm_and_pd(jmask_SSE1, imask_SSE1);
-
- /* Calculate 1/r */
- rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
- rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);
-
- /* Apply mask */
- rinv_SSE0 = _mm_and_pd(rinv_SSE0, jmask_SSE0);
- rinv_SSE1 = _mm_and_pd(rinv_SSE1, jmask_SSE1);
-
- dr_SSE0 = _mm_mul_pd(rsq_SSE0, rinv_SSE0);
- dr_SSE1 = _mm_mul_pd(rsq_SSE1, rinv_SSE1);
-
- sk_aj_SSE = _mm_load_pd(obc_param+j);
- raj_SSE = _mm_load_pd(gb_radius+j);
-
- raj_inv_SSE = gmx_mm_inv_pd(raj_SSE);
-
- /* Evaluate influence of atom aj -> ai */
- t1_SSE0 = _mm_add_pd(dr_SSE0, sk_aj_SSE);
- t1_SSE1 = _mm_add_pd(dr_SSE1, sk_aj_SSE);
- t2_SSE0 = _mm_sub_pd(dr_SSE0, sk_aj_SSE);
- t2_SSE1 = _mm_sub_pd(dr_SSE1, sk_aj_SSE);
- t3_SSE0 = _mm_sub_pd(sk_aj_SSE, dr_SSE0);
- t3_SSE1 = _mm_sub_pd(sk_aj_SSE, dr_SSE1);
-
- obc_mask1_SSE0 = _mm_cmplt_pd(rai_SSE0, t1_SSE0);
- obc_mask1_SSE1 = _mm_cmplt_pd(rai_SSE1, t1_SSE1);
- obc_mask2_SSE0 = _mm_cmplt_pd(rai_SSE0, t2_SSE0);
- obc_mask2_SSE1 = _mm_cmplt_pd(rai_SSE1, t2_SSE1);
- obc_mask3_SSE0 = _mm_cmplt_pd(rai_SSE0, t3_SSE0);
- obc_mask3_SSE1 = _mm_cmplt_pd(rai_SSE1, t3_SSE1);
- obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0, jmask_SSE0);
- obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1, jmask_SSE1);
-
- uij_SSE0 = gmx_mm_inv_pd(t1_SSE0);
- uij_SSE1 = gmx_mm_inv_pd(t1_SSE1);
- lij_SSE0 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
- _mm_andnot_pd(obc_mask2_SSE0, rai_inv_SSE0));
- lij_SSE1 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
- _mm_andnot_pd(obc_mask2_SSE1, rai_inv_SSE1));
-
- dlij_SSE0 = _mm_and_pd(one_SSE, obc_mask2_SSE0);
- dlij_SSE1 = _mm_and_pd(one_SSE, obc_mask2_SSE1);
-
- uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
- uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
- uij3_SSE0 = _mm_mul_pd(uij2_SSE0, uij_SSE0);
- uij3_SSE1 = _mm_mul_pd(uij2_SSE1, uij_SSE1);
- lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
- lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
- lij3_SSE0 = _mm_mul_pd(lij2_SSE0, lij_SSE0);
- lij3_SSE1 = _mm_mul_pd(lij2_SSE1, lij_SSE1);
-
- diff2_SSE0 = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
- diff2_SSE1 = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
- lij_inv_SSE0 = gmx_mm_invsqrt_pd(lij2_SSE0);
- lij_inv_SSE1 = gmx_mm_invsqrt_pd(lij2_SSE1);
- sk2_aj_SSE = _mm_mul_pd(sk_aj_SSE, sk_aj_SSE);
- sk2_rinv_SSE0 = _mm_mul_pd(sk2_aj_SSE, rinv_SSE0);
- sk2_rinv_SSE1 = _mm_mul_pd(sk2_aj_SSE, rinv_SSE1);
- prod_SSE0 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
- prod_SSE1 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);
-
- logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
- logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
-
- t1_SSE0 = _mm_sub_pd(lij_SSE0, uij_SSE0);
- t1_SSE1 = _mm_sub_pd(lij_SSE1, uij_SSE1);
- t2_SSE0 = _mm_mul_pd(diff2_SSE0,
- _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
- prod_SSE0));
- t2_SSE1 = _mm_mul_pd(diff2_SSE1,
- _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
- prod_SSE1));
-
- t3_SSE0 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
- t3_SSE1 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
- t1_SSE0 = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
- t1_SSE1 = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
- t4_SSE0 = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE0, lij_SSE0));
- t4_SSE1 = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE1, lij_SSE1));
- t4_SSE0 = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
- t4_SSE1 = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
- t1_SSE0 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
- t1_SSE1 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));
-
- sum_ai_SSE0 = _mm_add_pd(sum_ai_SSE0, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
- sum_ai_SSE1 = _mm_add_pd(sum_ai_SSE1, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
-
- t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
- _mm_mul_pd(prod_SSE0, lij3_SSE0));
- t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
- _mm_mul_pd(prod_SSE1, lij3_SSE1));
- t1_SSE0 = _mm_sub_pd(t1_SSE0,
- _mm_mul_pd(onefourth_SSE,
- _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
- _mm_mul_pd(lij3_SSE0, dr_SSE0))));
- t1_SSE1 = _mm_sub_pd(t1_SSE1,
- _mm_mul_pd(onefourth_SSE,
- _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
- _mm_mul_pd(lij3_SSE1, dr_SSE1))));
-
- t2_SSE0 = _mm_mul_pd(onefourth_SSE,
- _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
- _mm_mul_pd(uij3_SSE0, dr_SSE0)));
- t2_SSE1 = _mm_mul_pd(onefourth_SSE,
- _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
- _mm_mul_pd(uij3_SSE1, dr_SSE1)));
- t2_SSE0 = _mm_sub_pd(t2_SSE0,
- _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
- _mm_mul_pd(prod_SSE0, uij3_SSE0)));
- t2_SSE1 = _mm_sub_pd(t2_SSE1,
- _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
- _mm_mul_pd(prod_SSE1, uij3_SSE1)));
- t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
- _mm_mul_pd(rinv_SSE0, rinv_SSE0));
- t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
- _mm_mul_pd(rinv_SSE1, rinv_SSE1));
- t3_SSE0 = _mm_sub_pd(t3_SSE0,
- _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
- _mm_add_pd(one_SSE,
- _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
- t3_SSE1 = _mm_sub_pd(t3_SSE1,
- _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
- _mm_add_pd(one_SSE,
- _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));
-
- t1_SSE0 = _mm_mul_pd(rinv_SSE0,
- _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
- _mm_add_pd(t2_SSE0, t3_SSE0)));
- t1_SSE1 = _mm_mul_pd(rinv_SSE1,
- _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
- _mm_add_pd(t2_SSE1, t3_SSE1)));
-
- _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
- dadx += 2;
- _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
- dadx += 2;
-
- /* Evaluate influence of atom ai -> aj */
- t1_SSE0 = _mm_add_pd(dr_SSE0, sk_ai_SSE0);
- t1_SSE1 = _mm_add_pd(dr_SSE1, sk_ai_SSE1);
- t2_SSE0 = _mm_sub_pd(dr_SSE0, sk_ai_SSE0);
- t2_SSE1 = _mm_sub_pd(dr_SSE1, sk_ai_SSE1);
- t3_SSE0 = _mm_sub_pd(sk_ai_SSE0, dr_SSE0);
- t3_SSE1 = _mm_sub_pd(sk_ai_SSE1, dr_SSE1);
-
- obc_mask1_SSE0 = _mm_cmplt_pd(raj_SSE, t1_SSE0);
- obc_mask1_SSE1 = _mm_cmplt_pd(raj_SSE, t1_SSE1);
- obc_mask2_SSE0 = _mm_cmplt_pd(raj_SSE, t2_SSE0);
- obc_mask2_SSE1 = _mm_cmplt_pd(raj_SSE, t2_SSE1);
- obc_mask3_SSE0 = _mm_cmplt_pd(raj_SSE, t3_SSE0);
- obc_mask3_SSE1 = _mm_cmplt_pd(raj_SSE, t3_SSE1);
- obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0, jmask_SSE0);
- obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1, jmask_SSE1);
-
- uij_SSE0 = gmx_mm_inv_pd(t1_SSE0);
- uij_SSE1 = gmx_mm_inv_pd(t1_SSE1);
- lij_SSE0 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
- _mm_andnot_pd(obc_mask2_SSE0, raj_inv_SSE));
- lij_SSE1 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
- _mm_andnot_pd(obc_mask2_SSE1, raj_inv_SSE));
-
- dlij_SSE0 = _mm_and_pd(one_SSE, obc_mask2_SSE0);
- dlij_SSE1 = _mm_and_pd(one_SSE, obc_mask2_SSE1);
-
- uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
- uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
- uij3_SSE0 = _mm_mul_pd(uij2_SSE0, uij_SSE0);
- uij3_SSE1 = _mm_mul_pd(uij2_SSE1, uij_SSE1);
- lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
- lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
- lij3_SSE0 = _mm_mul_pd(lij2_SSE0, lij_SSE0);
- lij3_SSE1 = _mm_mul_pd(lij2_SSE1, lij_SSE1);
-
- diff2_SSE0 = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
- diff2_SSE1 = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
- lij_inv_SSE0 = gmx_mm_invsqrt_pd(lij2_SSE0);
- lij_inv_SSE1 = gmx_mm_invsqrt_pd(lij2_SSE1);
- sk2_rinv_SSE0 = _mm_mul_pd(sk2_ai_SSE0, rinv_SSE0);
- sk2_rinv_SSE1 = _mm_mul_pd(sk2_ai_SSE1, rinv_SSE1);
- prod_SSE0 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
- prod_SSE1 = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);
-
- logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
- logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
- t1_SSE0 = _mm_sub_pd(lij_SSE0, uij_SSE0);
- t1_SSE1 = _mm_sub_pd(lij_SSE1, uij_SSE1);
- t2_SSE0 = _mm_mul_pd(diff2_SSE0,
- _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
- prod_SSE0));
- t2_SSE1 = _mm_mul_pd(diff2_SSE1,
- _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
- prod_SSE1));
- t3_SSE0 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
- t3_SSE1 = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
- t1_SSE0 = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
- t1_SSE1 = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
- t4_SSE0 = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE0));
- t4_SSE1 = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE1));
- t4_SSE0 = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
- t4_SSE1 = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
- t1_SSE0 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
- t1_SSE1 = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));
-
- _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
- _mm_add_pd(_mm_and_pd(t1_SSE0, obc_mask1_SSE0),
- _mm_and_pd(t1_SSE1, obc_mask1_SSE1))));
-
- t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
- _mm_mul_pd(prod_SSE0, lij3_SSE0));
- t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
- _mm_mul_pd(prod_SSE1, lij3_SSE1));
-
- t1_SSE0 = _mm_sub_pd(t1_SSE0,
- _mm_mul_pd(onefourth_SSE,
- _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
- _mm_mul_pd(lij3_SSE0, dr_SSE0))));
- t1_SSE1 = _mm_sub_pd(t1_SSE1,
- _mm_mul_pd(onefourth_SSE,
- _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
- _mm_mul_pd(lij3_SSE1, dr_SSE1))));
- t2_SSE0 = _mm_mul_pd(onefourth_SSE,
- _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
- _mm_mul_pd(uij3_SSE0, dr_SSE0)));
- t2_SSE1 = _mm_mul_pd(onefourth_SSE,
- _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
- _mm_mul_pd(uij3_SSE1, dr_SSE1)));
- t2_SSE0 = _mm_sub_pd(t2_SSE0,
- _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
- _mm_mul_pd(prod_SSE0, uij3_SSE0)));
- t2_SSE1 = _mm_sub_pd(t2_SSE1,
- _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
- _mm_mul_pd(prod_SSE1, uij3_SSE1)));
-
- t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
- _mm_mul_pd(rinv_SSE0, rinv_SSE0));
- t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
- _mm_mul_pd(rinv_SSE1, rinv_SSE1));
-
- t3_SSE0 = _mm_sub_pd(t3_SSE0,
- _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
- _mm_add_pd(one_SSE,
- _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
- t3_SSE1 = _mm_sub_pd(t3_SSE1,
- _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
- _mm_add_pd(one_SSE,
- _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));
-
- t1_SSE0 = _mm_mul_pd(rinv_SSE0,
- _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
- _mm_add_pd(t2_SSE0, t3_SSE0)));
- t1_SSE1 = _mm_mul_pd(rinv_SSE1,
- _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
- _mm_add_pd(t2_SSE1, t3_SSE1)));
-
- _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
- dadx += 2;
- _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
- dadx += 2;
- }
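-
- /* Horizontal reduction: the 2x2 transpose turns the two per-lane
- * accumulators into per-atom sums for atoms i and i+1, which are then
- * added to work[] with a single aligned store. */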
- GMX_MM_TRANSPOSE2_PD(sum_ai_SSE0, sum_ai_SSE1);
- sum_ai_SSE0 = _mm_add_pd(sum_ai_SSE0, sum_ai_SSE1);
- _mm_store_pd(work+i, _mm_add_pd(sum_ai_SSE0, _mm_load_pd(work+i)));
- }
-
-
- for (i = 0; i < natoms/2+1; i++)
- {
- work[i] += work[natoms+i];
- }
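-
- /* The all-vs-all kernels accumulate into a work buffer of length 2*natoms
- * so the cyclic half-shell j loop never has to wrap its indices; atom k can
- * receive contributions at both work[k] and work[natoms+k]. Since maxoffset
- * never exceeds natoms/2, only the first natoms/2+1 entries of the upper
- * copy can be nonzero, which is why the fold above stops there. */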
-
- /* Parallel summations would go here if ever implemented in DD */
-
- if (gb_algorithm == egbHCT)
- {
- /* HCT */
- for (i = 0; i < natoms; i++)
- {
- if (born->use[i] != 0)
- {
- rai = top->atomtypes.gb_radius[mdatoms->typeA[i]]-born->gb_doffset;
- sum_ai = 1.0/rai - work[i];
- min_rad = rai + born->gb_doffset;
- rad = 1.0/sum_ai;
-
- born->bRad[i] = rad > min_rad ? rad : min_rad;
- fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
- }
- }
-
- }
- else
- {
- /* OBC */
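- /* Onufriev-Bashford-Case rescaling: with psi = (rai - gb_doffset)*work[i],
- * the effective radius computed below satisfies
- * 1/R_i = 1/(rai - gb_doffset) - tanh(obc_alpha*psi - obc_beta*psi^2 + obc_gamma*psi^3)/rai,
- * and drobc stores the derivative factor of the tanh term needed by the
- * chain-rule force loop. */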
-
- /* Calculate the radii */
- for (i = 0; i < natoms; i++)
- {
-
- if (born->use[i] != 0)
- {
- rai = top->atomtypes.gb_radius[mdatoms->typeA[i]];
- rai_inv2 = 1.0/rai;
- rai = rai-born->gb_doffset;
- rai_inv = 1.0/rai;
- sum_ai = rai * work[i];
- sum_ai2 = sum_ai * sum_ai;
- sum_ai3 = sum_ai2 * sum_ai;
-
- tsum = tanh(born->obc_alpha*sum_ai-born->obc_beta*sum_ai2+born->obc_gamma*sum_ai3);
- born->bRad[i] = rai_inv - tsum*rai_inv2;
- born->bRad[i] = 1.0 / born->bRad[i];
-
- fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
-
- tchain = rai * (born->obc_alpha-2*born->obc_beta*sum_ai+3*born->obc_gamma*sum_ai2);
- born->drobc[i] = (1.0-tsum*tsum)*tchain*rai_inv2;
- }
- }
- }
-
- return 0;
-}
-
-
-int
-genborn_allvsall_calc_chainrule_sse2_double(t_forcerec * fr,
- t_mdatoms * mdatoms,
- gmx_genborn_t * born,
- double * x,
- double * f,
- int gb_algorithm,
- void * paadata)
-{
- gmx_allvsallgb2_data_t *aadata;
- int natoms;
- int ni0, ni1;
- int nj0, nj1, nj2, nj3;
- int i, j, k, n;
- int idx;
- int * mask;
- int * pmask0;
- int * emask0;
- int * jindex;
-
- double ix, iy, iz;
- double fix, fiy, fiz;
- double jx, jy, jz;
- double dx, dy, dz;
- double tx, ty, tz;
- double rbai, rbaj, fgb, fgb_ai, rbi;
- double * rb;
- double * dadx;
- double * x_align;
- double * y_align;
- double * z_align;
- double * fx_align;
- double * fy_align;
- double * fz_align;
- double tmpsum[2];
-
- __m128d jmask_SSE0, jmask_SSE1;
- __m128d ix_SSE0, iy_SSE0, iz_SSE0;
- __m128d ix_SSE1, iy_SSE1, iz_SSE1;
- __m128d fix_SSE0, fiy_SSE0, fiz_SSE0;
- __m128d fix_SSE1, fiy_SSE1, fiz_SSE1;
- __m128d rbai_SSE0, rbai_SSE1;
- __m128d imask_SSE0, imask_SSE1;
- __m128d jx_SSE, jy_SSE, jz_SSE, rbaj_SSE;
- __m128d dx_SSE0, dy_SSE0, dz_SSE0;
- __m128d dx_SSE1, dy_SSE1, dz_SSE1;
- __m128d fgb_SSE0, fgb_ai_SSE0;
- __m128d fgb_SSE1, fgb_ai_SSE1;
- __m128d tx_SSE0, ty_SSE0, tz_SSE0;
- __m128d tx_SSE1, ty_SSE1, tz_SSE1;
- __m128d t1, t2, tmpSSE;
-
- natoms = mdatoms->nr;
- ni0 = 0;
- ni1 = mdatoms->homenr;
-
- aadata = (gmx_allvsallgb2_data_t *)paadata;
-
- x_align = aadata->x_align;
- y_align = aadata->y_align;
- z_align = aadata->z_align;
- fx_align = aadata->fx_align;
- fy_align = aadata->fy_align;
- fz_align = aadata->fz_align;
-
- jindex = aadata->jindex_gb;
- dadx = fr->dadx;
-
- n = 0;
- rb = aadata->work;
-
- /* Loop to get the proper form for the Born radius term */
- if (gb_algorithm == egbSTILL)
- {
- for (i = 0; i < natoms; i++)
- {
- rbi = born->bRad[i];
- rb[i] = (2 * rbi * rbi * fr->dvda[i])/ONE_4PI_EPS0;
- }
- }
- else if (gb_algorithm == egbHCT)
- {
- for (i = 0; i < natoms; i++)
- {
- rbi = born->bRad[i];
- rb[i] = rbi * rbi * fr->dvda[i];
- }
- }
- else if (gb_algorithm == egbOBC)
- {
- for (idx = 0; idx < natoms; idx++)
- {
- rbi = born->bRad[idx];
- rb[idx] = rbi * rbi * born->drobc[idx] * fr->dvda[idx];
- }
- }
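-
- /* These prefactors apply the chain rule dR/d(accumulated sum) so that rb[]
- * times the stored dadx elements gives dE/dx directly: R = 1/sum contributes
- * an R^2 factor (HCT), Still additionally carries its 2/ONE_4PI_EPS0 energy
- * normalization, and OBC picks up the extra tanh derivative drobc computed
- * together with the radii. */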
-
- for (i = 0; i < 2*natoms; i++)
- {
- fx_align[i] = 0;
- fy_align[i] = 0;
- fz_align[i] = 0;
- }
-
-
- for (i = 0; i < natoms; i++)
- {
- rb[i+natoms] = rb[i];
- }
-
- for (i = ni0; i < ni1; i += UNROLLI)
- {
- /* We assume shifts are NOT used for all-vs-all interactions */
-
- /* Load i atom data */
- ix_SSE0 = _mm_load1_pd(x_align+i);
- iy_SSE0 = _mm_load1_pd(y_align+i);
- iz_SSE0 = _mm_load1_pd(z_align+i);
- ix_SSE1 = _mm_load1_pd(x_align+i+1);
- iy_SSE1 = _mm_load1_pd(y_align+i+1);
- iz_SSE1 = _mm_load1_pd(z_align+i+1);
-
- fix_SSE0 = _mm_setzero_pd();
- fiy_SSE0 = _mm_setzero_pd();
- fiz_SSE0 = _mm_setzero_pd();
- fix_SSE1 = _mm_setzero_pd();
- fiy_SSE1 = _mm_setzero_pd();
- fiz_SSE1 = _mm_setzero_pd();
-
- rbai_SSE0 = _mm_load1_pd(rb+i);
- rbai_SSE1 = _mm_load1_pd(rb+i+1);
-
- /* Load limits for loop over neighbors */
- nj0 = jindex[4*i];
- nj3 = jindex[4*i+3];
-
- /* No masks necessary, since the stored chain rule derivatives will be zero in those cases! */
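- /* (The radii kernels stored every dadx element already AND-ed with the
- * interaction mask, so excluded or out-of-range pairs contribute exactly
- * zero force here.) */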
- for (j = nj0; j < nj3; j += UNROLLJ)
- {
- /* load j atom coordinates */
- jx_SSE = _mm_load_pd(x_align+j);
- jy_SSE = _mm_load_pd(y_align+j);
- jz_SSE = _mm_load_pd(z_align+j);
-
- /* Calculate distance */
- dx_SSE0 = _mm_sub_pd(ix_SSE0, jx_SSE);
- dy_SSE0 = _mm_sub_pd(iy_SSE0, jy_SSE);
- dz_SSE0 = _mm_sub_pd(iz_SSE0, jz_SSE);
- dx_SSE1 = _mm_sub_pd(ix_SSE1, jx_SSE);
- dy_SSE1 = _mm_sub_pd(iy_SSE1, jy_SSE);
- dz_SSE1 = _mm_sub_pd(iz_SSE1, jz_SSE);
-
- rbaj_SSE = _mm_load_pd(rb+j);
-
- fgb_SSE0 = _mm_mul_pd(rbai_SSE0, _mm_load_pd(dadx));
- dadx += 2;
- fgb_SSE1 = _mm_mul_pd(rbai_SSE1, _mm_load_pd(dadx));
- dadx += 2;
-
- fgb_ai_SSE0 = _mm_mul_pd(rbaj_SSE, _mm_load_pd(dadx));
- dadx += 2;
- fgb_ai_SSE1 = _mm_mul_pd(rbaj_SSE, _mm_load_pd(dadx));
- dadx += 2;
-
- /* Total force between ai and aj is the sum of ai->aj and aj->ai */
- fgb_SSE0 = _mm_add_pd(fgb_SSE0, fgb_ai_SSE0);
- fgb_SSE1 = _mm_add_pd(fgb_SSE1, fgb_ai_SSE1);
-
- /* Calculate temporary vectorial force */
- tx_SSE0 = _mm_mul_pd(fgb_SSE0, dx_SSE0);
- ty_SSE0 = _mm_mul_pd(fgb_SSE0, dy_SSE0);
- tz_SSE0 = _mm_mul_pd(fgb_SSE0, dz_SSE0);
- tx_SSE1 = _mm_mul_pd(fgb_SSE1, dx_SSE1);
- ty_SSE1 = _mm_mul_pd(fgb_SSE1, dy_SSE1);
- tz_SSE1 = _mm_mul_pd(fgb_SSE1, dz_SSE1);
-
- /* Increment i atom force */
- fix_SSE0 = _mm_add_pd(fix_SSE0, tx_SSE0);
- fiy_SSE0 = _mm_add_pd(fiy_SSE0, ty_SSE0);
- fiz_SSE0 = _mm_add_pd(fiz_SSE0, tz_SSE0);
- fix_SSE1 = _mm_add_pd(fix_SSE1, tx_SSE1);
- fiy_SSE1 = _mm_add_pd(fiy_SSE1, ty_SSE1);
- fiz_SSE1 = _mm_add_pd(fiz_SSE1, tz_SSE1);
-
- /* Decrement j atom force */
- _mm_store_pd(fx_align+j,
- _mm_sub_pd( _mm_load_pd(fx_align+j), _mm_add_pd(tx_SSE0, tx_SSE1) ));
- _mm_store_pd(fy_align+j,
- _mm_sub_pd( _mm_load_pd(fy_align+j), _mm_add_pd(ty_SSE0, ty_SSE1) ));
- _mm_store_pd(fz_align+j,
- _mm_sub_pd( _mm_load_pd(fz_align+j), _mm_add_pd(tz_SSE0, tz_SSE1) ));
- }
-
- /* Add i forces to mem */
- GMX_MM_TRANSPOSE2_PD(fix_SSE0, fix_SSE1);
- fix_SSE0 = _mm_add_pd(fix_SSE0, fix_SSE1);
- _mm_store_pd(fx_align+i, _mm_add_pd(fix_SSE0, _mm_load_pd(fx_align+i)));
-
- GMX_MM_TRANSPOSE2_PD(fiy_SSE0, fiy_SSE1);
- fiy_SSE0 = _mm_add_pd(fiy_SSE0, fiy_SSE1);
- _mm_store_pd(fy_align+i, _mm_add_pd(fiy_SSE0, _mm_load_pd(fy_align+i)));
-
- GMX_MM_TRANSPOSE2_PD(fiz_SSE0, fiz_SSE1);
- fiz_SSE0 = _mm_add_pd(fiz_SSE0, fiz_SSE1);
- _mm_store_pd(fz_align+i, _mm_add_pd(fiz_SSE0, _mm_load_pd(fz_align+i)));
- }
-
- for (i = 0; i < natoms; i++)
- {
- f[3*i] += fx_align[i] + fx_align[natoms+i];
- f[3*i+1] += fy_align[i] + fy_align[natoms+i];
- f[3*i+2] += fz_align[i] + fz_align[natoms+i];
- }
-
- return 0;
-}
-
-#else
-/* dummy variable when not using SSE */
-int genborn_allvsall_sse2_double_dummy;
-
-
-#endif
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2009, The GROMACS Development Team.
- * Copyright (c) 2010,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#ifndef _GENBORN_ALLVSALL_SSE2_DOUBLE_H
-#define _GENBORN_ALLVSALL_SSE2_DOUBLE_H
-
-#include "gromacs/legacyheaders/typedefs.h"
-#include "gromacs/legacyheaders/types/simple.h"
-
-int
-genborn_allvsall_calc_still_radii_sse2_double(t_forcerec * fr,
- t_mdatoms * mdatoms,
- gmx_genborn_t * born,
- gmx_localtop_t * top,
- double * x,
- t_commrec * cr,
- void * work);
-
-int
-genborn_allvsall_calc_hct_obc_radii_sse2_double(t_forcerec * fr,
- t_mdatoms * mdatoms,
- gmx_genborn_t * born,
- int gb_algorithm,
- gmx_localtop_t * top,
- double * x,
- t_commrec * cr,
- void * work);
-
-int
-genborn_allvsall_calc_chainrule_sse2_double(t_forcerec * fr,
- t_mdatoms * mdatoms,
- gmx_genborn_t * born,
- double * x,
- double * f,
- int gb_algorithm,
- void * work);
-
-#endif
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2009, The GROMACS Development Team.
- * Copyright (c) 2012,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#include "gmxpre.h"
-
-#include <math.h>
-
-#include "gromacs/legacyheaders/genborn.h"
-#include "gromacs/legacyheaders/network.h"
-#include "gromacs/legacyheaders/types/simple.h"
-#include "gromacs/math/units.h"
-#include "gromacs/math/vec.h"
-#include "gromacs/mdlib/genborn_allvsall.h"
-#include "gromacs/utility/smalloc.h"
-
-#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
-
-#include <gmx_sse2_single.h>
-
-
-#define SIMD_WIDTH 4
-#define UNROLLI 4
-#define UNROLLJ 4
-
-
-typedef struct
-{
- int * jindex_gb;
- int ** prologue_mask_gb;
- int ** epilogue_mask;
- int * imask;
- real * gb_radius;
- real * workparam;
- real * work;
- real * x_align;
- real * y_align;
- real * z_align;
- real * fx_align;
- real * fy_align;
- real * fz_align;
-}
-gmx_allvsallgb2_data_t;
-
-
-static int
-calc_maxoffset(int i, int natoms)
-{
- int maxoffset;
-
- if ((natoms % 2) == 1)
- {
- /* Odd number of atoms, easy */
- maxoffset = natoms/2;
- }
- else if ((natoms % 4) == 0)
- {
- /* Multiple of four is hard */
- if (i < natoms/2)
- {
- if ((i % 2) == 0)
- {
- maxoffset = natoms/2;
- }
- else
- {
- maxoffset = natoms/2-1;
- }
- }
- else
- {
- if ((i % 2) == 1)
- {
- maxoffset = natoms/2;
- }
- else
- {
- maxoffset = natoms/2-1;
- }
- }
- }
- else
- {
- /* natoms is even but not a multiple of four, so natoms/2 is odd */
- if ((i % 2) == 0)
- {
- maxoffset = natoms/2;
- }
- else
- {
- maxoffset = natoms/2-1;
- }
- }
-
- return maxoffset;
-}
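-
-/* Worked example (an illustration, not from the original sources): for
- * natoms = 8, a multiple of four, calc_maxoffset() yields 4,3,4,3,3,4,3,4 for
- * i = 0..7; the offsets sum to 28 = 8*7/2, so every unique cyclic (i,j) pair
- * is visited exactly once with no double counting.
- */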
-
-static void
-setup_gb_exclusions_and_indices(gmx_allvsallgb2_data_t * aadata,
- t_ilist * ilist,
- int start,
- int end,
- int natoms,
- gmx_bool bInclude12,
- gmx_bool bInclude13,
- gmx_bool bInclude14)
-{
- int i, j, k, tp;
- int a1, a2;
- int ni0, ni1, nj0, nj1, nj;
- int imin, imax, iexcl;
- int max_offset;
- int max_excl_offset;
- int firstinteraction;
- int ibase;
- int *pi;
-
- /* This routine can appear to be a bit complex, but it is mostly book-keeping.
- * To enable the fast all-vs-all kernel we need to be able to stream through all coordinates
- * whether they should interact or not.
- *
- * To avoid looping over the exclusions, we create a simple mask that is 1 if the interaction
- * should be present, otherwise 0. Since exclusions typically only occur when i & j are close,
- * we create a jindex array with four elements per i atom, marking where the masked prologue,
- * the unmasked main loop, and the masked epilogue of its j range begin and end.
- * This way we only have to allocate a short exclusion mask per i atom.
- */
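- /* The resulting jindex_gb layout, per i atom:
- * jindex_gb[4*i+0] first j of the masked prologue,
- * jindex_gb[4*i+1] first j of the unmasked main loop,
- * jindex_gb[4*i+2] first j of the masked epilogue,
- * jindex_gb[4*i+3] end of the epilogue,
- * all rounded to multiples of UNROLLJ so the inner loops stay aligned. */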
-
- ni0 = (start/UNROLLI)*UNROLLI;
- ni1 = ((end+UNROLLI-1)/UNROLLI)*UNROLLI;
-
- /* Set the interaction mask to only enable the i atoms we want to include */
- snew(pi, natoms+UNROLLI+2*SIMD_WIDTH);
- aadata->imask = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
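- /* snew over-allocates by 2*SIMD_WIDTH ints and the pointer is rounded up to
- * the next 16-byte boundary, since the aligned SIMD loads of these masks
- * require it; only the aligned alias is kept. The same idiom is used for
- * all the aligned arrays in this file. */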
- for (i = 0; i < natoms+UNROLLI; i++)
- {
- aadata->imask[i] = (i >= start && i < end) ? 0xFFFFFFFF : 0;
- }
-
- /* Allocate memory for our modified jindex array */
- snew(aadata->jindex_gb, 4*(natoms+UNROLLI));
- for (i = 0; i < 4*(natoms+UNROLLI); i++)
- {
- aadata->jindex_gb[i] = 0;
- }
-
- /* Create the exclusion masks for the prologue part */
- snew(aadata->prologue_mask_gb, natoms+UNROLLI); /* list of pointers */
-
- /* First zero everything to avoid uninitialized data */
- for (i = 0; i < natoms+UNROLLI; i++)
- {
- aadata->prologue_mask_gb[i] = NULL;
- }
-
- /* Calculate the largest exclusion range we need for each UNROLLI-tuplet of i atoms. */
- for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
- {
- max_excl_offset = -1;
-
- /* First find maxoffset for the next 4 atoms (or fewer if we are close to the end) */
- imax = ((ibase+UNROLLI) < end) ? (ibase+UNROLLI) : end;
-
- /* Which atom is the first we (might) interact with? */
- imin = natoms; /* Guaranteed to be overwritten by one of the 'firstinteraction' values below */
- for (i = ibase; i < imax; i++)
- {
- /* Before exclusions, which atom is the first we (might) interact with? */
- firstinteraction = i+1;
- max_offset = calc_maxoffset(i, natoms);
-
- if (!bInclude12)
- {
- for (j = 0; j < ilist[F_GB12].nr; j += 3)
- {
- a1 = ilist[F_GB12].iatoms[j+1];
- a2 = ilist[F_GB12].iatoms[j+2];
-
- if (a1 == i)
- {
- k = a2;
- }
- else if (a2 == i)
- {
- k = a1;
- }
- else
- {
- continue;
- }
-
- if (k == firstinteraction)
- {
- firstinteraction++;
- }
- }
- }
- if (!bInclude13)
- {
- for (j = 0; j < ilist[F_GB13].nr; j += 3)
- {
- a1 = ilist[F_GB13].iatoms[j+1];
- a2 = ilist[F_GB13].iatoms[j+2];
-
- if (a1 == i)
- {
- k = a2;
- }
- else if (a2 == i)
- {
- k = a1;
- }
- else
- {
- continue;
- }
-
- if (k == firstinteraction)
- {
- firstinteraction++;
- }
- }
- }
- if (!bInclude14)
- {
- for (j = 0; j < ilist[F_GB14].nr; j += 3)
- {
- a1 = ilist[F_GB14].iatoms[j+1];
- a2 = ilist[F_GB14].iatoms[j+2];
- if (a1 == i)
- {
- k = a2;
- }
- else if (a2 == i)
- {
- k = a1;
- }
- else
- {
- continue;
- }
-
- if (k == firstinteraction)
- {
- firstinteraction++;
- }
- }
- }
- imin = (firstinteraction < imin) ? firstinteraction : imin;
- }
- /* round down to j unrolling factor */
- imin = (imin/UNROLLJ)*UNROLLJ;
-
- for (i = ibase; i < imax; i++)
- {
- max_offset = calc_maxoffset(i, natoms);
-
- if (!bInclude12)
- {
- for (j = 0; j < ilist[F_GB12].nr; j += 3)
- {
- a1 = ilist[F_GB12].iatoms[j+1];
- a2 = ilist[F_GB12].iatoms[j+2];
-
- if (a1 == i)
- {
- k = a2;
- }
- else if (a2 == i)
- {
- k = a1;
- }
- else
- {
- continue;
- }
-
- if (k < imin)
- {
- k += natoms;
- }
-
- if (k > i+max_offset)
- {
- continue;
- }
-
- k = k - imin;
-
- if (k+natoms <= max_offset)
- {
- k += natoms;
- }
- max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
- }
- }
- if (!bInclude13)
- {
- for (j = 0; j < ilist[F_GB13].nr; j += 3)
- {
- a1 = ilist[F_GB13].iatoms[j+1];
- a2 = ilist[F_GB13].iatoms[j+2];
-
- if (a1 == i)
- {
- k = a2;
- }
- else if (a2 == i)
- {
- k = a1;
- }
- else
- {
- continue;
- }
-
- if (k < imin)
- {
- k += natoms;
- }
-
- if (k > i+max_offset)
- {
- continue;
- }
-
- k = k - imin;
-
- if (k+natoms <= max_offset)
- {
- k += natoms;
- }
- max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
- }
- }
- if (!bInclude14)
- {
- for (j = 0; j < ilist[F_GB14].nr; j += 3)
- {
- a1 = ilist[F_GB14].iatoms[j+1];
- a2 = ilist[F_GB14].iatoms[j+2];
-
- if (a1 == i)
- {
- k = a2;
- }
- else if (a2 == i)
- {
- k = a1;
- }
- else
- {
- continue;
- }
-
- if (k < imin)
- {
- k += natoms;
- }
-
- if (k > i+max_offset)
- {
- continue;
- }
-
- k = k - imin;
-
- if (k+natoms <= max_offset)
- {
- k += natoms;
- }
- max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
- }
- }
- }
-
- /* The offset specifies the last atom to be excluded, so add one unit to get an upper loop limit */
- max_excl_offset++;
- /* round up to j unrolling factor */
- max_excl_offset = (max_excl_offset/UNROLLJ+1)*UNROLLJ;
-
- /* Set all the prologue mask lengths to this value (even for i > end) */
- for (i = ibase; i < ibase+UNROLLI; i++)
- {
- aadata->jindex_gb[4*i] = imin;
- aadata->jindex_gb[4*i+1] = imin+max_excl_offset;
- }
- }
-
- /* Now the hard part, loop over it all again to calculate the actual contents of the prologue masks */
- for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
- {
- for (i = ibase; i < ibase+UNROLLI; i++)
- {
- nj = aadata->jindex_gb[4*i+1] - aadata->jindex_gb[4*i];
- imin = aadata->jindex_gb[4*i];
-
- /* Allocate aligned memory */
- snew(pi, nj+2*SIMD_WIDTH);
- aadata->prologue_mask_gb[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
-
- max_offset = calc_maxoffset(i, natoms);
-
- /* Include interactions i+1 <= j < i+maxoffset */
- for (k = 0; k < nj; k++)
- {
- j = imin + k;
-
- if ( (j > i) && (j <= i+max_offset) )
- {
- aadata->prologue_mask_gb[i][k] = 0xFFFFFFFF;
- }
- else
- {
- aadata->prologue_mask_gb[i][k] = 0;
- }
- }
-
- /* Clear out the explicit exclusions */
- if (i < end)
- {
- if (!bInclude12)
- {
- for (j = 0; j < ilist[F_GB12].nr; j += 3)
- {
- a1 = ilist[F_GB12].iatoms[j+1];
- a2 = ilist[F_GB12].iatoms[j+2];
-
- if (a1 == i)
- {
- k = a2;
- }
- else if (a2 == i)
- {
- k = a1;
- }
- else
- {
- continue;
- }
-
- if (k > i+max_offset)
- {
- continue;
- }
- k = k-i;
-
- if (k+natoms <= max_offset)
- {
- k += natoms;
- }
-
- k = k+i-imin;
- if (k >= 0)
- {
- aadata->prologue_mask_gb[i][k] = 0;
- }
- }
- }
- if (!bInclude13)
- {
- for (j = 0; j < ilist[F_GB13].nr; j += 3)
- {
- a1 = ilist[F_GB13].iatoms[j+1];
- a2 = ilist[F_GB13].iatoms[j+2];
-
- if (a1 == i)
- {
- k = a2;
- }
- else if (a2 == i)
- {
- k = a1;
- }
- else
- {
- continue;
- }
-
- if (k > i+max_offset)
- {
- continue;
- }
- k = k-i;
-
- if (k+natoms <= max_offset)
- {
- k += natoms;
- }
-
- k = k+i-imin;
- if (k >= 0)
- {
- aadata->prologue_mask_gb[i][k] = 0;
- }
- }
- }
- if (!bInclude14)
- {
- for (j = 0; j < ilist[F_GB14].nr; j += 3)
- {
- a1 = ilist[F_GB14].iatoms[j+1];
- a2 = ilist[F_GB14].iatoms[j+2];
-
- if (a1 == i)
- {
- k = a2;
- }
- else if (a2 == i)
- {
- k = a1;
- }
- else
- {
- continue;
- }
-
- if (k > i+max_offset)
- {
- continue;
- }
- k = k-i;
-
- if (k+natoms <= max_offset)
- {
- k += natoms;
- }
-
- k = k+i-imin;
- if (k >= 0)
- {
- aadata->prologue_mask_gb[i][k] = 0;
- }
- }
- }
- }
- }
- }
-
- /* Construct the epilogue mask - this just contains the check for maxoffset */
- snew(aadata->epilogue_mask, natoms+UNROLLI);
-
- /* First zero everything to avoid uninitialized data */
- for (i = 0; i < natoms+UNROLLI; i++)
- {
- aadata->jindex_gb[4*i+2] = aadata->jindex_gb[4*i+1];
- aadata->jindex_gb[4*i+3] = aadata->jindex_gb[4*i+1];
- aadata->epilogue_mask[i] = NULL;
- }
-
- for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
- {
- /* Find the lowest index for which we need to use the epilogue */
- imin = ibase;
- max_offset = calc_maxoffset(imin, natoms);
-
- imin = imin + 1 + max_offset;
-
- /* Find the largest index for which we need to use the epilogue */
- imax = ibase + UNROLLI-1;
- imax = (imax < end) ? imax : end;
-
- max_offset = calc_maxoffset(imax, natoms);
- imax = imax + 1 + max_offset + UNROLLJ - 1;
-
- for (i = ibase; i < ibase+UNROLLI; i++)
- {
- /* Start of epilogue - round down to j tile limit */
- aadata->jindex_gb[4*i+2] = (imin/UNROLLJ)*UNROLLJ;
- /* Make sure we don't overlap - for small systems everything is done in the prologue */
- aadata->jindex_gb[4*i+2] = (aadata->jindex_gb[4*i+1] > aadata->jindex_gb[4*i+2]) ? aadata->jindex_gb[4*i+1] : aadata->jindex_gb[4*i+2];
- /* Round upwards to j tile limit */
- aadata->jindex_gb[4*i+3] = (imax/UNROLLJ)*UNROLLJ;
- /* Make sure we don't have a negative range for the epilogue */
- aadata->jindex_gb[4*i+3] = (aadata->jindex_gb[4*i+2] > aadata->jindex_gb[4*i+3]) ? aadata->jindex_gb[4*i+2] : aadata->jindex_gb[4*i+3];
- }
- }
-
- /* And fill it with data... */
-
- for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
- {
- for (i = ibase; i < ibase+UNROLLI; i++)
- {
-
- nj = aadata->jindex_gb[4*i+3] - aadata->jindex_gb[4*i+2];
-
- /* Allocate aligned memory */
- snew(pi, nj+2*SIMD_WIDTH);
- aadata->epilogue_mask[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
-
- max_offset = calc_maxoffset(i, natoms);
-
- for (k = 0; k < nj; k++)
- {
- j = aadata->jindex_gb[4*i+2] + k;
- aadata->epilogue_mask[i][k] = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
- }
- }
- }
-}
-
-
-static void
-genborn_allvsall_setup(gmx_allvsallgb2_data_t ** p_aadata,
- gmx_localtop_t * top,
- gmx_genborn_t * born,
- t_mdatoms * mdatoms,
- real radius_offset,
- int gb_algorithm,
- gmx_bool bInclude12,
- gmx_bool bInclude13,
- gmx_bool bInclude14)
-{
- int i, j, idx;
- int natoms;
- gmx_allvsallgb2_data_t *aadata;
- real *p;
-
- natoms = mdatoms->nr;
-
- snew(aadata, 1);
- *p_aadata = aadata;
-
- snew(p, 2*natoms+2*SIMD_WIDTH);
- aadata->x_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
- snew(p, 2*natoms+2*SIMD_WIDTH);
- aadata->y_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
- snew(p, 2*natoms+2*SIMD_WIDTH);
- aadata->z_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
- snew(p, 2*natoms+2*SIMD_WIDTH);
- aadata->fx_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
- snew(p, 2*natoms+2*SIMD_WIDTH);
- aadata->fy_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
- snew(p, 2*natoms+2*SIMD_WIDTH);
- aadata->fz_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
-
- snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
- aadata->gb_radius = (real *) (((size_t) p + 16) & (~((size_t) 15)));
-
- snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
- aadata->workparam = (real *) (((size_t) p + 16) & (~((size_t) 15)));
-
- snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
- aadata->work = (real *) (((size_t) p + 16) & (~((size_t) 15)));
-
- for (i = 0; i < mdatoms->nr; i++)
- {
- aadata->gb_radius[i] = top->atomtypes.gb_radius[mdatoms->typeA[i]] - radius_offset;
- if (gb_algorithm == egbSTILL)
- {
- aadata->workparam[i] = born->vsolv[i];
- }
- else if (gb_algorithm == egbOBC)
- {
- aadata->workparam[i] = born->param[i];
- }
- aadata->work[i] = 0.0;
- }
- for (i = 0; i < mdatoms->nr; i++)
- {
- aadata->gb_radius[natoms+i] = aadata->gb_radius[i];
- aadata->workparam[natoms+i] = aadata->workparam[i];
- aadata->work[natoms+i] = aadata->work[i];
- }
-
- for (i = 0; i < 2*natoms+SIMD_WIDTH; i++)
- {
- aadata->x_align[i] = 0.0;
- aadata->y_align[i] = 0.0;
- aadata->z_align[i] = 0.0;
- aadata->fx_align[i] = 0.0;
- aadata->fy_align[i] = 0.0;
- aadata->fz_align[i] = 0.0;
- }
-
- setup_gb_exclusions_and_indices(aadata, top->idef.il, 0, mdatoms->homenr, mdatoms->nr,
- bInclude12, bInclude13, bInclude14);
-}
-
-
-int
-genborn_allvsall_calc_still_radii_sse2_single(t_forcerec * fr,
- t_mdatoms * mdatoms,
- gmx_genborn_t * born,
- gmx_localtop_t * top,
- real * x,
- t_commrec * cr,
- void * paadata)
-{
- gmx_allvsallgb2_data_t *aadata;
- int natoms;
- int ni0, ni1;
- int nj0, nj1, nj2, nj3;
- int i, j, k, n;
- int * mask;
- int * pmask0;
- int * pmask1;
- int * pmask2;
- int * pmask3;
- int * emask0;
- int * emask1;
- int * emask2;
- int * emask3;
- real ix, iy, iz;
- real jx, jy, jz;
- real dx, dy, dz;
- real rsq, rinv;
- real gpi, rai, vai;
- real prod_ai;
- real irsq, idr4, idr6;
- real raj, rvdw, ratio;
- real vaj, ccf, dccf, theta, cosq;
- real term, prod, icf4, icf6, gpi2, factor, sinq;
- real * gb_radius;
- real * vsolv;
- real * work;
- real tmpsum[4];
- real * x_align;
- real * y_align;
- real * z_align;
- int * jindex;
- real * dadx;
-
- __m128 ix_SSE0, iy_SSE0, iz_SSE0;
- __m128 ix_SSE1, iy_SSE1, iz_SSE1;
- __m128 ix_SSE2, iy_SSE2, iz_SSE2;
- __m128 ix_SSE3, iy_SSE3, iz_SSE3;
- __m128 gpi_SSE0, rai_SSE0, prod_ai_SSE0;
- __m128 gpi_SSE1, rai_SSE1, prod_ai_SSE1;
- __m128 gpi_SSE2, rai_SSE2, prod_ai_SSE2;
- __m128 gpi_SSE3, rai_SSE3, prod_ai_SSE3;
- __m128 imask_SSE0, jmask_SSE0;
- __m128 imask_SSE1, jmask_SSE1;
- __m128 imask_SSE2, jmask_SSE2;
- __m128 imask_SSE3, jmask_SSE3;
- __m128 jx_SSE, jy_SSE, jz_SSE;
- __m128 dx_SSE0, dy_SSE0, dz_SSE0;
- __m128 dx_SSE1, dy_SSE1, dz_SSE1;
- __m128 dx_SSE2, dy_SSE2, dz_SSE2;
- __m128 dx_SSE3, dy_SSE3, dz_SSE3;
- __m128 rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0;
- __m128 rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1;
- __m128 rsq_SSE2, rinv_SSE2, irsq_SSE2, idr4_SSE2, idr6_SSE2;
- __m128 rsq_SSE3, rinv_SSE3, irsq_SSE3, idr4_SSE3, idr6_SSE3;
- __m128 raj_SSE, vaj_SSE, prod_SSE;
- __m128 rvdw_SSE0, ratio_SSE0;
- __m128 rvdw_SSE1, ratio_SSE1;
- __m128 rvdw_SSE2, ratio_SSE2;
- __m128 rvdw_SSE3, ratio_SSE3;
- __m128 theta_SSE0, sinq_SSE0, cosq_SSE0, term_SSE0;
- __m128 theta_SSE1, sinq_SSE1, cosq_SSE1, term_SSE1;
- __m128 theta_SSE2, sinq_SSE2, cosq_SSE2, term_SSE2;
- __m128 theta_SSE3, sinq_SSE3, cosq_SSE3, term_SSE3;
- __m128 ccf_SSE0, dccf_SSE0;
- __m128 ccf_SSE1, dccf_SSE1;
- __m128 ccf_SSE2, dccf_SSE2;
- __m128 ccf_SSE3, dccf_SSE3;
- __m128 icf4_SSE0, icf6_SSE0;
- __m128 icf4_SSE1, icf6_SSE1;
- __m128 icf4_SSE2, icf6_SSE2;
- __m128 icf4_SSE3, icf6_SSE3;
- __m128 half_SSE, one_SSE, two_SSE, four_SSE;
- __m128 still_p4_SSE, still_p5inv_SSE, still_pip5_SSE;
-
- natoms = mdatoms->nr;
- ni0 = 0;
- ni1 = mdatoms->homenr;
-
- n = 0;
-
- aadata = *((gmx_allvsallgb2_data_t **)paadata);
-
-
- if (aadata == NULL)
- {
- genborn_allvsall_setup(&aadata, top, born, mdatoms, 0.0,
- egbSTILL, FALSE, FALSE, TRUE);
- *((gmx_allvsallgb2_data_t **)paadata) = aadata;
- }
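-
- /* The bookkeeping data is created lazily on the first call and cached
- * behind the opaque paadata handle; for Still radii the 1-2 and 1-3
- * neighbor pairs are excluded while 1-4 pairs are kept (the FALSE, FALSE,
- * TRUE flags above). */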
-
- x_align = aadata->x_align;
- y_align = aadata->y_align;
- z_align = aadata->z_align;
-
- gb_radius = aadata->gb_radius;
- vsolv = aadata->workparam;
- work = aadata->work;
- jindex = aadata->jindex_gb;
- dadx = fr->dadx;
-
- still_p4_SSE = _mm_set1_ps(STILL_P4);
- still_p5inv_SSE = _mm_set1_ps(STILL_P5INV);
- still_pip5_SSE = _mm_set1_ps(STILL_PIP5);
- half_SSE = _mm_set1_ps(0.5);
- one_SSE = _mm_set1_ps(1.0);
- two_SSE = _mm_set1_ps(2.0);
- four_SSE = _mm_set1_ps(4.0);
-
- /* This buffer is accumulated into, so it has to extend to natoms plus the half-shell wrap-around region */
- for (i = 0; i < natoms+1+natoms/2; i++)
- {
- work[i] = 0;
- }
-
- for (i = ni0; i < ni1+1+natoms/2; i++)
- {
- k = i%natoms;
- x_align[i] = x[3*k];
- y_align[i] = x[3*k+1];
- z_align[i] = x[3*k+2];
- work[i] = 0;
- }
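-
- /* The coordinates are replicated cyclically (k = i % natoms) into the upper
- * part of the aligned buffers, so the half-shell j loop can run past natoms
- * without any modulo arithmetic inside the SIMD kernels. */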
-
-
- for (i = ni0; i < ni1; i += UNROLLI)
- {
- /* We assume shifts are NOT used for all-vs-all interactions */
-
- /* Load i atom data */
- ix_SSE0 = _mm_load1_ps(x_align+i);
- iy_SSE0 = _mm_load1_ps(y_align+i);
- iz_SSE0 = _mm_load1_ps(z_align+i);
- ix_SSE1 = _mm_load1_ps(x_align+i+1);
- iy_SSE1 = _mm_load1_ps(y_align+i+1);
- iz_SSE1 = _mm_load1_ps(z_align+i+1);
- ix_SSE2 = _mm_load1_ps(x_align+i+2);
- iy_SSE2 = _mm_load1_ps(y_align+i+2);
- iz_SSE2 = _mm_load1_ps(z_align+i+2);
- ix_SSE3 = _mm_load1_ps(x_align+i+3);
- iy_SSE3 = _mm_load1_ps(y_align+i+3);
- iz_SSE3 = _mm_load1_ps(z_align+i+3);
-
- gpi_SSE0 = _mm_setzero_ps();
- gpi_SSE1 = _mm_setzero_ps();
- gpi_SSE2 = _mm_setzero_ps();
- gpi_SSE3 = _mm_setzero_ps();
-
- rai_SSE0 = _mm_load1_ps(gb_radius+i);
- rai_SSE1 = _mm_load1_ps(gb_radius+i+1);
- rai_SSE2 = _mm_load1_ps(gb_radius+i+2);
- rai_SSE3 = _mm_load1_ps(gb_radius+i+3);
-
- prod_ai_SSE0 = _mm_set1_ps(STILL_P4*vsolv[i]);
- prod_ai_SSE1 = _mm_set1_ps(STILL_P4*vsolv[i+1]);
- prod_ai_SSE2 = _mm_set1_ps(STILL_P4*vsolv[i+2]);
- prod_ai_SSE3 = _mm_set1_ps(STILL_P4*vsolv[i+3]);
-
- /* Load limits for loop over neighbors */
- nj0 = jindex[4*i];
- nj1 = jindex[4*i+1];
- nj2 = jindex[4*i+2];
- nj3 = jindex[4*i+3];
-
- pmask0 = aadata->prologue_mask_gb[i];
- pmask1 = aadata->prologue_mask_gb[i+1];
- pmask2 = aadata->prologue_mask_gb[i+2];
- pmask3 = aadata->prologue_mask_gb[i+3];
- emask0 = aadata->epilogue_mask[i];
- emask1 = aadata->epilogue_mask[i+1];
- emask2 = aadata->epilogue_mask[i+2];
- emask3 = aadata->epilogue_mask[i+3];
-
- imask_SSE0 = _mm_load1_ps((real *)(aadata->imask+i));
- imask_SSE1 = _mm_load1_ps((real *)(aadata->imask+i+1));
- imask_SSE2 = _mm_load1_ps((real *)(aadata->imask+i+2));
- imask_SSE3 = _mm_load1_ps((real *)(aadata->imask+i+3));
-
- /* Prologue part, including exclusion mask */
- for (j = nj0; j < nj1; j += UNROLLJ)
- {
- jmask_SSE0 = _mm_load_ps((real *)pmask0);
- jmask_SSE1 = _mm_load_ps((real *)pmask1);
- jmask_SSE2 = _mm_load_ps((real *)pmask2);
- jmask_SSE3 = _mm_load_ps((real *)pmask3);
- pmask0 += UNROLLJ;
- pmask1 += UNROLLJ;
- pmask2 += UNROLLJ;
- pmask3 += UNROLLJ;
-
- /* load j atom coordinates */
- jx_SSE = _mm_load_ps(x_align+j);
- jy_SSE = _mm_load_ps(y_align+j);
- jz_SSE = _mm_load_ps(z_align+j);
-
- /* Calculate distance */
- dx_SSE0 = _mm_sub_ps(ix_SSE0, jx_SSE);
- dy_SSE0 = _mm_sub_ps(iy_SSE0, jy_SSE);
- dz_SSE0 = _mm_sub_ps(iz_SSE0, jz_SSE);
- dx_SSE1 = _mm_sub_ps(ix_SSE1, jx_SSE);
- dy_SSE1 = _mm_sub_ps(iy_SSE1, jy_SSE);
- dz_SSE1 = _mm_sub_ps(iz_SSE1, jz_SSE);
- dx_SSE2 = _mm_sub_ps(ix_SSE2, jx_SSE);
- dy_SSE2 = _mm_sub_ps(iy_SSE2, jy_SSE);
- dz_SSE2 = _mm_sub_ps(iz_SSE2, jz_SSE);
- dx_SSE3 = _mm_sub_ps(ix_SSE3, jx_SSE);
- dy_SSE3 = _mm_sub_ps(iy_SSE3, jy_SSE);
- dz_SSE3 = _mm_sub_ps(iz_SSE3, jz_SSE);
-
- /* rsq = dx*dx+dy*dy+dz*dz */
- rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
- rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
- rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
- rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
-
- /* Combine masks */
- jmask_SSE0 = _mm_and_ps(jmask_SSE0, imask_SSE0);
- jmask_SSE1 = _mm_and_ps(jmask_SSE1, imask_SSE1);
- jmask_SSE2 = _mm_and_ps(jmask_SSE2, imask_SSE2);
- jmask_SSE3 = _mm_and_ps(jmask_SSE3, imask_SSE3);
-
- /* Calculate 1/r and 1/r2 */
- rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0);
- rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1);
- rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2);
- rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3);
-
- /* Apply mask */
- rinv_SSE0 = _mm_and_ps(rinv_SSE0, jmask_SSE0);
- rinv_SSE1 = _mm_and_ps(rinv_SSE1, jmask_SSE1);
- rinv_SSE2 = _mm_and_ps(rinv_SSE2, jmask_SSE2);
- rinv_SSE3 = _mm_and_ps(rinv_SSE3, jmask_SSE3);
-
- irsq_SSE0 = _mm_mul_ps(rinv_SSE0, rinv_SSE0);
- irsq_SSE1 = _mm_mul_ps(rinv_SSE1, rinv_SSE1);
- irsq_SSE2 = _mm_mul_ps(rinv_SSE2, rinv_SSE2);
- irsq_SSE3 = _mm_mul_ps(rinv_SSE3, rinv_SSE3);
- idr4_SSE0 = _mm_mul_ps(irsq_SSE0, irsq_SSE0);
- idr4_SSE1 = _mm_mul_ps(irsq_SSE1, irsq_SSE1);
- idr4_SSE2 = _mm_mul_ps(irsq_SSE2, irsq_SSE2);
- idr4_SSE3 = _mm_mul_ps(irsq_SSE3, irsq_SSE3);
- idr6_SSE0 = _mm_mul_ps(idr4_SSE0, irsq_SSE0);
- idr6_SSE1 = _mm_mul_ps(idr4_SSE1, irsq_SSE1);
- idr6_SSE2 = _mm_mul_ps(idr4_SSE2, irsq_SSE2);
- idr6_SSE3 = _mm_mul_ps(idr4_SSE3, irsq_SSE3);
-
- raj_SSE = _mm_load_ps(gb_radius+j);
- vaj_SSE = _mm_load_ps(vsolv+j);
-
- rvdw_SSE0 = _mm_add_ps(rai_SSE0, raj_SSE);
- rvdw_SSE1 = _mm_add_ps(rai_SSE1, raj_SSE);
- rvdw_SSE2 = _mm_add_ps(rai_SSE2, raj_SSE);
- rvdw_SSE3 = _mm_add_ps(rai_SSE3, raj_SSE);
-
- ratio_SSE0 = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0, rvdw_SSE0)));
- ratio_SSE1 = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1, rvdw_SSE1)));
- ratio_SSE2 = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2, rvdw_SSE2)));
- ratio_SSE3 = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3, rvdw_SSE3)));
-
- ratio_SSE0 = _mm_min_ps(ratio_SSE0, still_p5inv_SSE);
- ratio_SSE1 = _mm_min_ps(ratio_SSE1, still_p5inv_SSE);
- ratio_SSE2 = _mm_min_ps(ratio_SSE2, still_p5inv_SSE);
- ratio_SSE3 = _mm_min_ps(ratio_SSE3, still_p5inv_SSE);
- theta_SSE0 = _mm_mul_ps(ratio_SSE0, still_pip5_SSE);
- theta_SSE1 = _mm_mul_ps(ratio_SSE1, still_pip5_SSE);
- theta_SSE2 = _mm_mul_ps(ratio_SSE2, still_pip5_SSE);
- theta_SSE3 = _mm_mul_ps(ratio_SSE3, still_pip5_SSE);
- gmx_mm_sincos_ps(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
- gmx_mm_sincos_ps(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
- gmx_mm_sincos_ps(theta_SSE2, &sinq_SSE2, &cosq_SSE2);
- gmx_mm_sincos_ps(theta_SSE3, &sinq_SSE3, &cosq_SSE3);
- term_SSE0 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE0));
- term_SSE1 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE1));
- term_SSE2 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE2));
- term_SSE3 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE3));
- ccf_SSE0 = _mm_mul_ps(term_SSE0, term_SSE0);
- ccf_SSE1 = _mm_mul_ps(term_SSE1, term_SSE1);
- ccf_SSE2 = _mm_mul_ps(term_SSE2, term_SSE2);
- ccf_SSE3 = _mm_mul_ps(term_SSE3, term_SSE3);
- dccf_SSE0 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE0),
- _mm_mul_ps(sinq_SSE0, theta_SSE0));
- dccf_SSE1 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE1),
- _mm_mul_ps(sinq_SSE1, theta_SSE1));
- dccf_SSE2 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE2),
- _mm_mul_ps(sinq_SSE2, theta_SSE2));
- dccf_SSE3 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE3),
- _mm_mul_ps(sinq_SSE3, theta_SSE3));
-
- prod_SSE = _mm_mul_ps(still_p4_SSE, vaj_SSE);
- icf4_SSE0 = _mm_mul_ps(ccf_SSE0, idr4_SSE0);
- icf4_SSE1 = _mm_mul_ps(ccf_SSE1, idr4_SSE1);
- icf4_SSE2 = _mm_mul_ps(ccf_SSE2, idr4_SSE2);
- icf4_SSE3 = _mm_mul_ps(ccf_SSE3, idr4_SSE3);
- icf6_SSE0 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
- icf6_SSE1 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);
- icf6_SSE2 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE2), dccf_SSE2), idr6_SSE2);
- icf6_SSE3 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE3), dccf_SSE3), idr6_SSE3);
-
- _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
- gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0, icf4_SSE0),
- _mm_mul_ps(prod_ai_SSE1, icf4_SSE1),
- _mm_mul_ps(prod_ai_SSE2, icf4_SSE2),
- _mm_mul_ps(prod_ai_SSE3, icf4_SSE3))));
-
- gpi_SSE0 = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE, icf4_SSE0));
- gpi_SSE1 = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE, icf4_SSE1));
- gpi_SSE2 = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE, icf4_SSE2));
- gpi_SSE3 = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE, icf4_SSE3));
-
- /* Save ai->aj and aj->ai chain rule terms */
- _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE0));
- dadx += 4;
- _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE1));
- dadx += 4;
- _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE2));
- dadx += 4;
- _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE3));
- dadx += 4;
-
- _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE0, icf6_SSE0));
- dadx += 4;
- _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE1, icf6_SSE1));
- dadx += 4;
- _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE2, icf6_SSE2));
- dadx += 4;
- _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE3, icf6_SSE3));
- dadx += 4;
- }
-
- /* Main part, no exclusions */
- for (j = nj1; j < nj2; j += UNROLLJ)
- {
- /* load j atom coordinates */
- jx_SSE = _mm_load_ps(x_align+j);
- jy_SSE = _mm_load_ps(y_align+j);
- jz_SSE = _mm_load_ps(z_align+j);
-
- /* Calculate distance */
- dx_SSE0 = _mm_sub_ps(ix_SSE0, jx_SSE);
- dy_SSE0 = _mm_sub_ps(iy_SSE0, jy_SSE);
- dz_SSE0 = _mm_sub_ps(iz_SSE0, jz_SSE);
- dx_SSE1 = _mm_sub_ps(ix_SSE1, jx_SSE);
- dy_SSE1 = _mm_sub_ps(iy_SSE1, jy_SSE);
- dz_SSE1 = _mm_sub_ps(iz_SSE1, jz_SSE);
- dx_SSE2 = _mm_sub_ps(ix_SSE2, jx_SSE);
- dy_SSE2 = _mm_sub_ps(iy_SSE2, jy_SSE);
- dz_SSE2 = _mm_sub_ps(iz_SSE2, jz_SSE);
- dx_SSE3 = _mm_sub_ps(ix_SSE3, jx_SSE);
- dy_SSE3 = _mm_sub_ps(iy_SSE3, jy_SSE);
- dz_SSE3 = _mm_sub_ps(iz_SSE3, jz_SSE);
-
- /* rsq = dx*dx+dy*dy+dz*dz */
- rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
- rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
- rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
- rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
-
- /* Calculate 1/r and 1/r2 */
- rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0);
- rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1);
- rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2);
- rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3);
-
- /* Apply mask */
- rinv_SSE0 = _mm_and_ps(rinv_SSE0, imask_SSE0);
- rinv_SSE1 = _mm_and_ps(rinv_SSE1, imask_SSE1);
- rinv_SSE2 = _mm_and_ps(rinv_SSE2, imask_SSE2);
- rinv_SSE3 = _mm_and_ps(rinv_SSE3, imask_SSE3);
-
- irsq_SSE0 = _mm_mul_ps(rinv_SSE0, rinv_SSE0);
- irsq_SSE1 = _mm_mul_ps(rinv_SSE1, rinv_SSE1);
- irsq_SSE2 = _mm_mul_ps(rinv_SSE2, rinv_SSE2);
- irsq_SSE3 = _mm_mul_ps(rinv_SSE3, rinv_SSE3);
- idr4_SSE0 = _mm_mul_ps(irsq_SSE0, irsq_SSE0);
- idr4_SSE1 = _mm_mul_ps(irsq_SSE1, irsq_SSE1);
- idr4_SSE2 = _mm_mul_ps(irsq_SSE2, irsq_SSE2);
- idr4_SSE3 = _mm_mul_ps(irsq_SSE3, irsq_SSE3);
- idr6_SSE0 = _mm_mul_ps(idr4_SSE0, irsq_SSE0);
- idr6_SSE1 = _mm_mul_ps(idr4_SSE1, irsq_SSE1);
- idr6_SSE2 = _mm_mul_ps(idr4_SSE2, irsq_SSE2);
- idr6_SSE3 = _mm_mul_ps(idr4_SSE3, irsq_SSE3);
-
- raj_SSE = _mm_load_ps(gb_radius+j);
-
- rvdw_SSE0 = _mm_add_ps(rai_SSE0, raj_SSE);
- rvdw_SSE1 = _mm_add_ps(rai_SSE1, raj_SSE);
- rvdw_SSE2 = _mm_add_ps(rai_SSE2, raj_SSE);
- rvdw_SSE3 = _mm_add_ps(rai_SSE3, raj_SSE);
- vaj_SSE = _mm_load_ps(vsolv+j);
-
- ratio_SSE0 = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0, rvdw_SSE0)));
- ratio_SSE1 = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1, rvdw_SSE1)));
- ratio_SSE2 = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2, rvdw_SSE2)));
- ratio_SSE3 = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3, rvdw_SSE3)));
-
- ratio_SSE0 = _mm_min_ps(ratio_SSE0, still_p5inv_SSE);
- ratio_SSE1 = _mm_min_ps(ratio_SSE1, still_p5inv_SSE);
- ratio_SSE2 = _mm_min_ps(ratio_SSE2, still_p5inv_SSE);
- ratio_SSE3 = _mm_min_ps(ratio_SSE3, still_p5inv_SSE);
- theta_SSE0 = _mm_mul_ps(ratio_SSE0, still_pip5_SSE);
- theta_SSE1 = _mm_mul_ps(ratio_SSE1, still_pip5_SSE);
- theta_SSE2 = _mm_mul_ps(ratio_SSE2, still_pip5_SSE);
- theta_SSE3 = _mm_mul_ps(ratio_SSE3, still_pip5_SSE);
- gmx_mm_sincos_ps(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
- gmx_mm_sincos_ps(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
- gmx_mm_sincos_ps(theta_SSE2, &sinq_SSE2, &cosq_SSE2);
- gmx_mm_sincos_ps(theta_SSE3, &sinq_SSE3, &cosq_SSE3);
- term_SSE0 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE0));
- term_SSE1 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE1));
- term_SSE2 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE2));
- term_SSE3 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE3));
- ccf_SSE0 = _mm_mul_ps(term_SSE0, term_SSE0);
- ccf_SSE1 = _mm_mul_ps(term_SSE1, term_SSE1);
- ccf_SSE2 = _mm_mul_ps(term_SSE2, term_SSE2);
- ccf_SSE3 = _mm_mul_ps(term_SSE3, term_SSE3);
- dccf_SSE0 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE0),
- _mm_mul_ps(sinq_SSE0, theta_SSE0));
- dccf_SSE1 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE1),
- _mm_mul_ps(sinq_SSE1, theta_SSE1));
- dccf_SSE2 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE2),
- _mm_mul_ps(sinq_SSE2, theta_SSE2));
- dccf_SSE3 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE3),
- _mm_mul_ps(sinq_SSE3, theta_SSE3));
-
-            prod_SSE      = _mm_mul_ps(still_p4_SSE, vaj_SSE);
- icf4_SSE0 = _mm_mul_ps(ccf_SSE0, idr4_SSE0);
- icf4_SSE1 = _mm_mul_ps(ccf_SSE1, idr4_SSE1);
- icf4_SSE2 = _mm_mul_ps(ccf_SSE2, idr4_SSE2);
- icf4_SSE3 = _mm_mul_ps(ccf_SSE3, idr4_SSE3);
- icf6_SSE0 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
- icf6_SSE1 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);
- icf6_SSE2 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE2), dccf_SSE2), idr6_SSE2);
- icf6_SSE3 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE3), dccf_SSE3), idr6_SSE3);
-
- _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
- gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0, icf4_SSE0),
- _mm_mul_ps(prod_ai_SSE1, icf4_SSE1),
- _mm_mul_ps(prod_ai_SSE2, icf4_SSE2),
- _mm_mul_ps(prod_ai_SSE3, icf4_SSE3))));
-
- gpi_SSE0 = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE, icf4_SSE0));
- gpi_SSE1 = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE, icf4_SSE1));
- gpi_SSE2 = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE, icf4_SSE2));
- gpi_SSE3 = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE, icf4_SSE3));
-
- /* Save ai->aj and aj->ai chain rule terms */
- _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE0));
- dadx += 4;
- _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE1));
- dadx += 4;
- _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE2));
- dadx += 4;
- _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE3));
- dadx += 4;
-
- _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE0, icf6_SSE0));
- dadx += 4;
- _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE1, icf6_SSE1));
- dadx += 4;
- _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE2, icf6_SSE2));
- dadx += 4;
- _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE3, icf6_SSE3));
- dadx += 4;
- }
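-        /* The j-range is split in three: the prologue above applies
-         * per-pair exclusion masks, the main stretch just finished needs
-         * only the i-atom validity mask, and the epilogue below masks the
-         * tail of the cyclic all-vs-all range.
-         */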
- /* Epilogue part, including exclusion mask */
- for (j = nj2; j < nj3; j += UNROLLJ)
- {
- jmask_SSE0 = _mm_load_ps((real *)emask0);
- jmask_SSE1 = _mm_load_ps((real *)emask1);
- jmask_SSE2 = _mm_load_ps((real *)emask2);
- jmask_SSE3 = _mm_load_ps((real *)emask3);
- emask0 += UNROLLJ;
- emask1 += UNROLLJ;
- emask2 += UNROLLJ;
- emask3 += UNROLLJ;
-
- /* load j atom coordinates */
- jx_SSE = _mm_load_ps(x_align+j);
- jy_SSE = _mm_load_ps(y_align+j);
- jz_SSE = _mm_load_ps(z_align+j);
-
- /* Calculate distance */
- dx_SSE0 = _mm_sub_ps(ix_SSE0, jx_SSE);
- dy_SSE0 = _mm_sub_ps(iy_SSE0, jy_SSE);
- dz_SSE0 = _mm_sub_ps(iz_SSE0, jz_SSE);
- dx_SSE1 = _mm_sub_ps(ix_SSE1, jx_SSE);
- dy_SSE1 = _mm_sub_ps(iy_SSE1, jy_SSE);
- dz_SSE1 = _mm_sub_ps(iz_SSE1, jz_SSE);
- dx_SSE2 = _mm_sub_ps(ix_SSE2, jx_SSE);
- dy_SSE2 = _mm_sub_ps(iy_SSE2, jy_SSE);
- dz_SSE2 = _mm_sub_ps(iz_SSE2, jz_SSE);
- dx_SSE3 = _mm_sub_ps(ix_SSE3, jx_SSE);
- dy_SSE3 = _mm_sub_ps(iy_SSE3, jy_SSE);
- dz_SSE3 = _mm_sub_ps(iz_SSE3, jz_SSE);
-
- /* rsq = dx*dx+dy*dy+dz*dz */
- rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
- rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
- rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
- rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
-
- /* Combine masks */
- jmask_SSE0 = _mm_and_ps(jmask_SSE0, imask_SSE0);
- jmask_SSE1 = _mm_and_ps(jmask_SSE1, imask_SSE1);
- jmask_SSE2 = _mm_and_ps(jmask_SSE2, imask_SSE2);
- jmask_SSE3 = _mm_and_ps(jmask_SSE3, imask_SSE3);
-
- /* Calculate 1/r and 1/r2 */
- rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0);
- rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1);
- rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2);
- rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3);
-
- /* Apply mask */
- rinv_SSE0 = _mm_and_ps(rinv_SSE0, jmask_SSE0);
- rinv_SSE1 = _mm_and_ps(rinv_SSE1, jmask_SSE1);
- rinv_SSE2 = _mm_and_ps(rinv_SSE2, jmask_SSE2);
- rinv_SSE3 = _mm_and_ps(rinv_SSE3, jmask_SSE3);
-
- irsq_SSE0 = _mm_mul_ps(rinv_SSE0, rinv_SSE0);
- irsq_SSE1 = _mm_mul_ps(rinv_SSE1, rinv_SSE1);
- irsq_SSE2 = _mm_mul_ps(rinv_SSE2, rinv_SSE2);
- irsq_SSE3 = _mm_mul_ps(rinv_SSE3, rinv_SSE3);
- idr4_SSE0 = _mm_mul_ps(irsq_SSE0, irsq_SSE0);
- idr4_SSE1 = _mm_mul_ps(irsq_SSE1, irsq_SSE1);
- idr4_SSE2 = _mm_mul_ps(irsq_SSE2, irsq_SSE2);
- idr4_SSE3 = _mm_mul_ps(irsq_SSE3, irsq_SSE3);
- idr6_SSE0 = _mm_mul_ps(idr4_SSE0, irsq_SSE0);
- idr6_SSE1 = _mm_mul_ps(idr4_SSE1, irsq_SSE1);
- idr6_SSE2 = _mm_mul_ps(idr4_SSE2, irsq_SSE2);
- idr6_SSE3 = _mm_mul_ps(idr4_SSE3, irsq_SSE3);
-
- raj_SSE = _mm_load_ps(gb_radius+j);
- vaj_SSE = _mm_load_ps(vsolv+j);
-
- rvdw_SSE0 = _mm_add_ps(rai_SSE0, raj_SSE);
- rvdw_SSE1 = _mm_add_ps(rai_SSE1, raj_SSE);
- rvdw_SSE2 = _mm_add_ps(rai_SSE2, raj_SSE);
- rvdw_SSE3 = _mm_add_ps(rai_SSE3, raj_SSE);
-
- ratio_SSE0 = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0, rvdw_SSE0)));
- ratio_SSE1 = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1, rvdw_SSE1)));
- ratio_SSE2 = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2, rvdw_SSE2)));
- ratio_SSE3 = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3, rvdw_SSE3)));
-
- ratio_SSE0 = _mm_min_ps(ratio_SSE0, still_p5inv_SSE);
- ratio_SSE1 = _mm_min_ps(ratio_SSE1, still_p5inv_SSE);
- ratio_SSE2 = _mm_min_ps(ratio_SSE2, still_p5inv_SSE);
- ratio_SSE3 = _mm_min_ps(ratio_SSE3, still_p5inv_SSE);
- theta_SSE0 = _mm_mul_ps(ratio_SSE0, still_pip5_SSE);
- theta_SSE1 = _mm_mul_ps(ratio_SSE1, still_pip5_SSE);
- theta_SSE2 = _mm_mul_ps(ratio_SSE2, still_pip5_SSE);
- theta_SSE3 = _mm_mul_ps(ratio_SSE3, still_pip5_SSE);
- gmx_mm_sincos_ps(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
- gmx_mm_sincos_ps(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
- gmx_mm_sincos_ps(theta_SSE2, &sinq_SSE2, &cosq_SSE2);
- gmx_mm_sincos_ps(theta_SSE3, &sinq_SSE3, &cosq_SSE3);
- term_SSE0 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE0));
- term_SSE1 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE1));
- term_SSE2 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE2));
- term_SSE3 = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE3));
- ccf_SSE0 = _mm_mul_ps(term_SSE0, term_SSE0);
- ccf_SSE1 = _mm_mul_ps(term_SSE1, term_SSE1);
- ccf_SSE2 = _mm_mul_ps(term_SSE2, term_SSE2);
- ccf_SSE3 = _mm_mul_ps(term_SSE3, term_SSE3);
- dccf_SSE0 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE0),
- _mm_mul_ps(sinq_SSE0, theta_SSE0));
- dccf_SSE1 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE1),
- _mm_mul_ps(sinq_SSE1, theta_SSE1));
- dccf_SSE2 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE2),
- _mm_mul_ps(sinq_SSE2, theta_SSE2));
- dccf_SSE3 = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE3),
- _mm_mul_ps(sinq_SSE3, theta_SSE3));
-
- prod_SSE = _mm_mul_ps(still_p4_SSE, vaj_SSE);
- icf4_SSE0 = _mm_mul_ps(ccf_SSE0, idr4_SSE0);
- icf4_SSE1 = _mm_mul_ps(ccf_SSE1, idr4_SSE1);
- icf4_SSE2 = _mm_mul_ps(ccf_SSE2, idr4_SSE2);
- icf4_SSE3 = _mm_mul_ps(ccf_SSE3, idr4_SSE3);
- icf6_SSE0 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
- icf6_SSE1 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);
- icf6_SSE2 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE2), dccf_SSE2), idr6_SSE2);
- icf6_SSE3 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE3), dccf_SSE3), idr6_SSE3);
-
- _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
- gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0, icf4_SSE0),
- _mm_mul_ps(prod_ai_SSE1, icf4_SSE1),
- _mm_mul_ps(prod_ai_SSE2, icf4_SSE2),
- _mm_mul_ps(prod_ai_SSE3, icf4_SSE3))));
-
- gpi_SSE0 = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE, icf4_SSE0));
- gpi_SSE1 = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE, icf4_SSE1));
- gpi_SSE2 = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE, icf4_SSE2));
- gpi_SSE3 = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE, icf4_SSE3));
-
- /* Save ai->aj and aj->ai chain rule terms */
- _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE0));
- dadx += 4;
- _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE1));
- dadx += 4;
- _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE2));
- dadx += 4;
- _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE3));
- dadx += 4;
-
- _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE0, icf6_SSE0));
- dadx += 4;
- _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE1, icf6_SSE1));
- dadx += 4;
- _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE2, icf6_SSE2));
- dadx += 4;
- _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE3, icf6_SSE3));
- dadx += 4;
- }
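-        /* Reduce the four per-i accumulators: after the 4x4 transpose,
-         * lane k of each register holds partial sums for atom i+k, so
-         * three adds leave the four atom totals in gpi_SSE0 for a single
-         * store.
-         */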
- _MM_TRANSPOSE4_PS(gpi_SSE0, gpi_SSE1, gpi_SSE2, gpi_SSE3);
- gpi_SSE0 = _mm_add_ps(gpi_SSE0, gpi_SSE1);
- gpi_SSE2 = _mm_add_ps(gpi_SSE2, gpi_SSE3);
- gpi_SSE0 = _mm_add_ps(gpi_SSE0, gpi_SSE2);
- _mm_store_ps(work+i, _mm_add_ps(gpi_SSE0, _mm_load_ps(work+i)));
- }
-
-    /* In case we have written anything beyond natoms, fold it back.
-     * Never mind that we leave stale data above natoms; it will not
-     * be accessed later in the routine.
-     * In principle this should be a move rather than a sum, but this
-     * way we don't have to worry about even/odd offsets...
-     */
- for (i = natoms; i < ni1+1+natoms/2; i++)
- {
- work[i-natoms] += work[i];
- }
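-    /* Summing works as well as moving here because work[] above natoms
-     * was cleared before the i-loop, so padded slots that were never
-     * written contribute zero.
-     */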
-
- /* Parallel summations would go here if ever implemented with DD */
-
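-    /* The Still radius follows from the accumulated polarization sum:
-     * R_i = (ONE_4PI_EPS0/2)/(gpol_i + sum_j), evaluated below as
-     * factor/|gpi|; invsqrta caches 1/sqrt(R_i) for the GB kernels.
-     */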
- factor = 0.5 * ONE_4PI_EPS0;
- /* Calculate the radii - should we do all atoms, or just our local ones? */
- for (i = 0; i < natoms; i++)
- {
- if (born->use[i] != 0)
- {
- gpi = born->gpol[i]+work[i];
- gpi2 = gpi * gpi;
- born->bRad[i] = factor*gmx_invsqrt(gpi2);
- fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
- }
- }
-
- return 0;
-}
-
-
-
-int
-genborn_allvsall_calc_hct_obc_radii_sse2_single(t_forcerec * fr,
- t_mdatoms * mdatoms,
- gmx_genborn_t * born,
- int gb_algorithm,
- gmx_localtop_t * top,
- real * x,
- t_commrec * cr,
- void * paadata)
-{
- gmx_allvsallgb2_data_t *aadata;
- int natoms;
- int ni0, ni1;
- int nj0, nj1, nj2, nj3;
- int i, j, k, n;
- int * mask;
- int * pmask0;
- int * pmask1;
- int * pmask2;
- int * pmask3;
- int * emask0;
- int * emask1;
- int * emask2;
- int * emask3;
- real * gb_radius;
- real * vsolv;
- real * work;
- real tmpsum[4];
- real * x_align;
- real * y_align;
- real * z_align;
- int * jindex;
- real * dadx;
- real * obc_param;
- real rad, min_rad;
- real rai, rai_inv, rai_inv2, sum_ai, sum_ai2, sum_ai3, tsum, tchain;
-
- __m128 ix_SSE0, iy_SSE0, iz_SSE0;
- __m128 ix_SSE1, iy_SSE1, iz_SSE1;
- __m128 ix_SSE2, iy_SSE2, iz_SSE2;
- __m128 ix_SSE3, iy_SSE3, iz_SSE3;
- __m128 gpi_SSE0, rai_SSE0, prod_ai_SSE0;
- __m128 gpi_SSE1, rai_SSE1, prod_ai_SSE1;
- __m128 gpi_SSE2, rai_SSE2, prod_ai_SSE2;
- __m128 gpi_SSE3, rai_SSE3, prod_ai_SSE3;
- __m128 imask_SSE0, jmask_SSE0;
- __m128 imask_SSE1, jmask_SSE1;
- __m128 imask_SSE2, jmask_SSE2;
- __m128 imask_SSE3, jmask_SSE3;
- __m128 jx_SSE, jy_SSE, jz_SSE;
- __m128 dx_SSE0, dy_SSE0, dz_SSE0;
- __m128 dx_SSE1, dy_SSE1, dz_SSE1;
- __m128 dx_SSE2, dy_SSE2, dz_SSE2;
- __m128 dx_SSE3, dy_SSE3, dz_SSE3;
- __m128 rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0;
- __m128 rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1;
- __m128 rsq_SSE2, rinv_SSE2, irsq_SSE2, idr4_SSE2, idr6_SSE2;
- __m128 rsq_SSE3, rinv_SSE3, irsq_SSE3, idr4_SSE3, idr6_SSE3;
- __m128 raj_SSE, raj_inv_SSE, sk_aj_SSE, sk2_aj_SSE;
- __m128 ccf_SSE0, dccf_SSE0, prod_SSE0;
- __m128 ccf_SSE1, dccf_SSE1, prod_SSE1;
- __m128 ccf_SSE2, dccf_SSE2, prod_SSE2;
- __m128 ccf_SSE3, dccf_SSE3, prod_SSE3;
- __m128 icf4_SSE0, icf6_SSE0;
- __m128 icf4_SSE1, icf6_SSE1;
- __m128 icf4_SSE2, icf6_SSE2;
- __m128 icf4_SSE3, icf6_SSE3;
- __m128 oneeighth_SSE, onefourth_SSE, half_SSE, one_SSE, two_SSE, four_SSE;
- __m128 still_p4_SSE, still_p5inv_SSE, still_pip5_SSE;
- __m128 rai_inv_SSE0;
- __m128 rai_inv_SSE1;
- __m128 rai_inv_SSE2;
- __m128 rai_inv_SSE3;
- __m128 sk_ai_SSE0, sk2_ai_SSE0, sum_ai_SSE0;
- __m128 sk_ai_SSE1, sk2_ai_SSE1, sum_ai_SSE1;
- __m128 sk_ai_SSE2, sk2_ai_SSE2, sum_ai_SSE2;
- __m128 sk_ai_SSE3, sk2_ai_SSE3, sum_ai_SSE3;
- __m128 lij_inv_SSE0, sk2_rinv_SSE0;
- __m128 lij_inv_SSE1, sk2_rinv_SSE1;
- __m128 lij_inv_SSE2, sk2_rinv_SSE2;
- __m128 lij_inv_SSE3, sk2_rinv_SSE3;
- __m128 dr_SSE0;
- __m128 dr_SSE1;
- __m128 dr_SSE2;
- __m128 dr_SSE3;
- __m128 t1_SSE0, t2_SSE0, t3_SSE0, t4_SSE0;
- __m128 t1_SSE1, t2_SSE1, t3_SSE1, t4_SSE1;
- __m128 t1_SSE2, t2_SSE2, t3_SSE2, t4_SSE2;
- __m128 t1_SSE3, t2_SSE3, t3_SSE3, t4_SSE3;
- __m128 obc_mask1_SSE0, obc_mask2_SSE0, obc_mask3_SSE0;
- __m128 obc_mask1_SSE1, obc_mask2_SSE1, obc_mask3_SSE1;
- __m128 obc_mask1_SSE2, obc_mask2_SSE2, obc_mask3_SSE2;
- __m128 obc_mask1_SSE3, obc_mask2_SSE3, obc_mask3_SSE3;
- __m128 uij_SSE0, uij2_SSE0, uij3_SSE0;
- __m128 uij_SSE1, uij2_SSE1, uij3_SSE1;
- __m128 uij_SSE2, uij2_SSE2, uij3_SSE2;
- __m128 uij_SSE3, uij2_SSE3, uij3_SSE3;
- __m128 lij_SSE0, lij2_SSE0, lij3_SSE0;
- __m128 lij_SSE1, lij2_SSE1, lij3_SSE1;
- __m128 lij_SSE2, lij2_SSE2, lij3_SSE2;
- __m128 lij_SSE3, lij2_SSE3, lij3_SSE3;
- __m128 dlij_SSE0, diff2_SSE0, logterm_SSE0;
- __m128 dlij_SSE1, diff2_SSE1, logterm_SSE1;
- __m128 dlij_SSE2, diff2_SSE2, logterm_SSE2;
- __m128 dlij_SSE3, diff2_SSE3, logterm_SSE3;
- __m128 doffset_SSE;
-
- natoms = mdatoms->nr;
- ni0 = 0;
- ni1 = mdatoms->homenr;
-
- n = 0;
-
- aadata = *((gmx_allvsallgb2_data_t **)paadata);
-
- if (aadata == NULL)
- {
- genborn_allvsall_setup(&aadata, top, born, mdatoms, born->gb_doffset,
- egbOBC, TRUE, TRUE, TRUE);
- *((gmx_allvsallgb2_data_t **)paadata) = aadata;
- }
-
- x_align = aadata->x_align;
- y_align = aadata->y_align;
- z_align = aadata->z_align;
-
- gb_radius = aadata->gb_radius;
- work = aadata->work;
- jindex = aadata->jindex_gb;
- dadx = fr->dadx;
- obc_param = aadata->workparam;
-
- oneeighth_SSE = _mm_set1_ps(0.125);
- onefourth_SSE = _mm_set1_ps(0.25);
- half_SSE = _mm_set1_ps(0.5);
- one_SSE = _mm_set1_ps(1.0);
- two_SSE = _mm_set1_ps(2.0);
- four_SSE = _mm_set1_ps(4.0);
- doffset_SSE = _mm_set1_ps(born->gb_doffset);
-
- for (i = 0; i < natoms; i++)
- {
- x_align[i] = x[3*i];
- y_align[i] = x[3*i+1];
- z_align[i] = x[3*i+2];
- }
-
-    /* Replicate the first natoms/2+1 coordinates past the end, so the
-     * j-loops can wrap beyond natoms without bounds checks
-     */
- for (i = 0; i < natoms/2+1; i++)
- {
- x_align[natoms+i] = x_align[i];
- y_align[natoms+i] = y_align[i];
- z_align[natoms+i] = z_align[i];
- }
-
- for (i = 0; i < natoms+natoms/2+1; i++)
- {
- work[i] = 0;
- }
-
- for (i = ni0; i < ni1; i += UNROLLI)
- {
- /* We assume shifts are NOT used for all-vs-all interactions */
-
- /* Load i atom data */
- ix_SSE0 = _mm_load1_ps(x_align+i);
- iy_SSE0 = _mm_load1_ps(y_align+i);
- iz_SSE0 = _mm_load1_ps(z_align+i);
- ix_SSE1 = _mm_load1_ps(x_align+i+1);
- iy_SSE1 = _mm_load1_ps(y_align+i+1);
- iz_SSE1 = _mm_load1_ps(z_align+i+1);
- ix_SSE2 = _mm_load1_ps(x_align+i+2);
- iy_SSE2 = _mm_load1_ps(y_align+i+2);
- iz_SSE2 = _mm_load1_ps(z_align+i+2);
- ix_SSE3 = _mm_load1_ps(x_align+i+3);
- iy_SSE3 = _mm_load1_ps(y_align+i+3);
- iz_SSE3 = _mm_load1_ps(z_align+i+3);
-
- rai_SSE0 = _mm_load1_ps(gb_radius+i);
- rai_SSE1 = _mm_load1_ps(gb_radius+i+1);
- rai_SSE2 = _mm_load1_ps(gb_radius+i+2);
- rai_SSE3 = _mm_load1_ps(gb_radius+i+3);
- rai_inv_SSE0 = gmx_mm_inv_ps(rai_SSE0);
- rai_inv_SSE1 = gmx_mm_inv_ps(rai_SSE1);
- rai_inv_SSE2 = gmx_mm_inv_ps(rai_SSE2);
- rai_inv_SSE3 = gmx_mm_inv_ps(rai_SSE3);
-
- sk_ai_SSE0 = _mm_load1_ps(obc_param+i);
- sk_ai_SSE1 = _mm_load1_ps(obc_param+i+1);
- sk_ai_SSE2 = _mm_load1_ps(obc_param+i+2);
- sk_ai_SSE3 = _mm_load1_ps(obc_param+i+3);
- sk2_ai_SSE0 = _mm_mul_ps(sk_ai_SSE0, sk_ai_SSE0);
- sk2_ai_SSE1 = _mm_mul_ps(sk_ai_SSE1, sk_ai_SSE1);
- sk2_ai_SSE2 = _mm_mul_ps(sk_ai_SSE2, sk_ai_SSE2);
- sk2_ai_SSE3 = _mm_mul_ps(sk_ai_SSE3, sk_ai_SSE3);
-
- sum_ai_SSE0 = _mm_setzero_ps();
- sum_ai_SSE1 = _mm_setzero_ps();
- sum_ai_SSE2 = _mm_setzero_ps();
- sum_ai_SSE3 = _mm_setzero_ps();
-
- /* Load limits for loop over neighbors */
- nj0 = jindex[4*i];
- nj1 = jindex[4*i+1];
- nj2 = jindex[4*i+2];
- nj3 = jindex[4*i+3];
-
- pmask0 = aadata->prologue_mask_gb[i];
- pmask1 = aadata->prologue_mask_gb[i+1];
- pmask2 = aadata->prologue_mask_gb[i+2];
- pmask3 = aadata->prologue_mask_gb[i+3];
- emask0 = aadata->epilogue_mask[i];
- emask1 = aadata->epilogue_mask[i+1];
- emask2 = aadata->epilogue_mask[i+2];
- emask3 = aadata->epilogue_mask[i+3];
-
- imask_SSE0 = _mm_load1_ps((real *)(aadata->imask+i));
- imask_SSE1 = _mm_load1_ps((real *)(aadata->imask+i+1));
- imask_SSE2 = _mm_load1_ps((real *)(aadata->imask+i+2));
- imask_SSE3 = _mm_load1_ps((real *)(aadata->imask+i+3));
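-        /* The masks are stored as 0x0/0xffffffff integers; loading them
-         * through a real* cast is fine since only the bit pattern matters,
-         * as they are applied with bitwise _mm_and_ps.
-         */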
-
- /* Prologue part, including exclusion mask */
- for (j = nj0; j < nj1; j += UNROLLJ)
- {
- jmask_SSE0 = _mm_load_ps((real *)pmask0);
- jmask_SSE1 = _mm_load_ps((real *)pmask1);
- jmask_SSE2 = _mm_load_ps((real *)pmask2);
- jmask_SSE3 = _mm_load_ps((real *)pmask3);
- pmask0 += UNROLLJ;
- pmask1 += UNROLLJ;
- pmask2 += UNROLLJ;
- pmask3 += UNROLLJ;
-
- /* load j atom coordinates */
- jx_SSE = _mm_load_ps(x_align+j);
- jy_SSE = _mm_load_ps(y_align+j);
- jz_SSE = _mm_load_ps(z_align+j);
-
- /* Calculate distance */
- dx_SSE0 = _mm_sub_ps(ix_SSE0, jx_SSE);
- dy_SSE0 = _mm_sub_ps(iy_SSE0, jy_SSE);
- dz_SSE0 = _mm_sub_ps(iz_SSE0, jz_SSE);
- dx_SSE1 = _mm_sub_ps(ix_SSE1, jx_SSE);
- dy_SSE1 = _mm_sub_ps(iy_SSE1, jy_SSE);
- dz_SSE1 = _mm_sub_ps(iz_SSE1, jz_SSE);
- dx_SSE2 = _mm_sub_ps(ix_SSE2, jx_SSE);
- dy_SSE2 = _mm_sub_ps(iy_SSE2, jy_SSE);
- dz_SSE2 = _mm_sub_ps(iz_SSE2, jz_SSE);
- dx_SSE3 = _mm_sub_ps(ix_SSE3, jx_SSE);
- dy_SSE3 = _mm_sub_ps(iy_SSE3, jy_SSE);
- dz_SSE3 = _mm_sub_ps(iz_SSE3, jz_SSE);
-
- /* rsq = dx*dx+dy*dy+dz*dz */
- rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
- rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
- rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
- rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
-
- /* Combine masks */
- jmask_SSE0 = _mm_and_ps(jmask_SSE0, imask_SSE0);
- jmask_SSE1 = _mm_and_ps(jmask_SSE1, imask_SSE1);
- jmask_SSE2 = _mm_and_ps(jmask_SSE2, imask_SSE2);
- jmask_SSE3 = _mm_and_ps(jmask_SSE3, imask_SSE3);
-
-            /* Calculate 1/r */
- rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0);
- rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1);
- rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2);
- rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3);
-
- /* Apply mask */
- rinv_SSE0 = _mm_and_ps(rinv_SSE0, jmask_SSE0);
- rinv_SSE1 = _mm_and_ps(rinv_SSE1, jmask_SSE1);
- rinv_SSE2 = _mm_and_ps(rinv_SSE2, jmask_SSE2);
- rinv_SSE3 = _mm_and_ps(rinv_SSE3, jmask_SSE3);
-
- dr_SSE0 = _mm_mul_ps(rsq_SSE0, rinv_SSE0);
- dr_SSE1 = _mm_mul_ps(rsq_SSE1, rinv_SSE1);
- dr_SSE2 = _mm_mul_ps(rsq_SSE2, rinv_SSE2);
- dr_SSE3 = _mm_mul_ps(rsq_SSE3, rinv_SSE3);
-
- sk_aj_SSE = _mm_load_ps(obc_param+j);
- raj_SSE = _mm_load_ps(gb_radius+j);
- raj_inv_SSE = gmx_mm_inv_ps(raj_SSE);
-
- /* Evaluate influence of atom aj -> ai */
- t1_SSE0 = _mm_add_ps(dr_SSE0, sk_aj_SSE);
- t1_SSE1 = _mm_add_ps(dr_SSE1, sk_aj_SSE);
- t1_SSE2 = _mm_add_ps(dr_SSE2, sk_aj_SSE);
- t1_SSE3 = _mm_add_ps(dr_SSE3, sk_aj_SSE);
- t2_SSE0 = _mm_sub_ps(dr_SSE0, sk_aj_SSE);
- t2_SSE1 = _mm_sub_ps(dr_SSE1, sk_aj_SSE);
- t2_SSE2 = _mm_sub_ps(dr_SSE2, sk_aj_SSE);
- t2_SSE3 = _mm_sub_ps(dr_SSE3, sk_aj_SSE);
- t3_SSE0 = _mm_sub_ps(sk_aj_SSE, dr_SSE0);
- t3_SSE1 = _mm_sub_ps(sk_aj_SSE, dr_SSE1);
- t3_SSE2 = _mm_sub_ps(sk_aj_SSE, dr_SSE2);
- t3_SSE3 = _mm_sub_ps(sk_aj_SSE, dr_SSE3);
-
- obc_mask1_SSE0 = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
- obc_mask1_SSE1 = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
- obc_mask1_SSE2 = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
- obc_mask1_SSE3 = _mm_cmplt_ps(rai_SSE3, t1_SSE3);
- obc_mask2_SSE0 = _mm_cmplt_ps(rai_SSE0, t2_SSE0);
- obc_mask2_SSE1 = _mm_cmplt_ps(rai_SSE1, t2_SSE1);
- obc_mask2_SSE2 = _mm_cmplt_ps(rai_SSE2, t2_SSE2);
- obc_mask2_SSE3 = _mm_cmplt_ps(rai_SSE3, t2_SSE3);
- obc_mask3_SSE0 = _mm_cmplt_ps(rai_SSE0, t3_SSE0);
- obc_mask3_SSE1 = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
- obc_mask3_SSE2 = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
- obc_mask3_SSE3 = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
- obc_mask1_SSE0 = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0);
- obc_mask1_SSE1 = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1);
- obc_mask1_SSE2 = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2);
- obc_mask1_SSE3 = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3);
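-            /* The three comparisons set up the HCT/OBC integration limits:
-             * obc_mask1 (rai < r+sk_aj): atom j descreens i at all;
-             * obc_mask2 (rai < r-sk_aj): i lies fully outside the
-             * descreening sphere, so the lower limit is 1/(r-sk_aj)
-             * rather than 1/rai;
-             * obc_mask3 (rai < sk_aj-r): i is engulfed, adding the
-             * 2*(1/rai - lij) correction further down.
-             */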
-
- uij_SSE0 = gmx_mm_inv_ps(t1_SSE0);
- uij_SSE1 = gmx_mm_inv_ps(t1_SSE1);
- uij_SSE2 = gmx_mm_inv_ps(t1_SSE2);
- uij_SSE3 = gmx_mm_inv_ps(t1_SSE3);
- lij_SSE0 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
- _mm_andnot_ps(obc_mask2_SSE0, rai_inv_SSE0));
- lij_SSE1 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
- _mm_andnot_ps(obc_mask2_SSE1, rai_inv_SSE1));
- lij_SSE2 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
- _mm_andnot_ps(obc_mask2_SSE2, rai_inv_SSE2));
- lij_SSE3 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
- _mm_andnot_ps(obc_mask2_SSE3, rai_inv_SSE3));
- dlij_SSE0 = _mm_and_ps(one_SSE, obc_mask2_SSE0);
- dlij_SSE1 = _mm_and_ps(one_SSE, obc_mask2_SSE1);
- dlij_SSE2 = _mm_and_ps(one_SSE, obc_mask2_SSE2);
- dlij_SSE3 = _mm_and_ps(one_SSE, obc_mask2_SSE3);
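-            /* SSE2 lacks a blend instruction, so the selection above uses
-             * the (mask & a) | (andnot(mask) & b) idiom; dlij is the
-             * constant 1.0 masked to zero wherever the 1/(r-sk) branch was
-             * not taken.
-             */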
-
- uij2_SSE0 = _mm_mul_ps(uij_SSE0, uij_SSE0);
- uij2_SSE1 = _mm_mul_ps(uij_SSE1, uij_SSE1);
- uij2_SSE2 = _mm_mul_ps(uij_SSE2, uij_SSE2);
- uij2_SSE3 = _mm_mul_ps(uij_SSE3, uij_SSE3);
- uij3_SSE0 = _mm_mul_ps(uij2_SSE0, uij_SSE0);
- uij3_SSE1 = _mm_mul_ps(uij2_SSE1, uij_SSE1);
- uij3_SSE2 = _mm_mul_ps(uij2_SSE2, uij_SSE2);
- uij3_SSE3 = _mm_mul_ps(uij2_SSE3, uij_SSE3);
- lij2_SSE0 = _mm_mul_ps(lij_SSE0, lij_SSE0);
- lij2_SSE1 = _mm_mul_ps(lij_SSE1, lij_SSE1);
- lij2_SSE2 = _mm_mul_ps(lij_SSE2, lij_SSE2);
- lij2_SSE3 = _mm_mul_ps(lij_SSE3, lij_SSE3);
- lij3_SSE0 = _mm_mul_ps(lij2_SSE0, lij_SSE0);
- lij3_SSE1 = _mm_mul_ps(lij2_SSE1, lij_SSE1);
- lij3_SSE2 = _mm_mul_ps(lij2_SSE2, lij_SSE2);
- lij3_SSE3 = _mm_mul_ps(lij2_SSE3, lij_SSE3);
-
- diff2_SSE0 = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
- diff2_SSE1 = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
- diff2_SSE2 = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
- diff2_SSE3 = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
- lij_inv_SSE0 = gmx_mm_invsqrt_ps(lij2_SSE0);
- lij_inv_SSE1 = gmx_mm_invsqrt_ps(lij2_SSE1);
- lij_inv_SSE2 = gmx_mm_invsqrt_ps(lij2_SSE2);
- lij_inv_SSE3 = gmx_mm_invsqrt_ps(lij2_SSE3);
- sk2_aj_SSE = _mm_mul_ps(sk_aj_SSE, sk_aj_SSE);
- sk2_rinv_SSE0 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE0);
- sk2_rinv_SSE1 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE1);
- sk2_rinv_SSE2 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE2);
- sk2_rinv_SSE3 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE3);
- prod_SSE0 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
- prod_SSE1 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
- prod_SSE2 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
- prod_SSE3 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
-
- logterm_SSE0 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
- logterm_SSE1 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
- logterm_SSE2 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
- logterm_SSE3 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
-
- t1_SSE0 = _mm_sub_ps(lij_SSE0, uij_SSE0);
- t1_SSE1 = _mm_sub_ps(lij_SSE1, uij_SSE1);
- t1_SSE2 = _mm_sub_ps(lij_SSE2, uij_SSE2);
- t1_SSE3 = _mm_sub_ps(lij_SSE3, uij_SSE3);
- t2_SSE0 = _mm_mul_ps(diff2_SSE0,
- _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
- prod_SSE0));
- t2_SSE1 = _mm_mul_ps(diff2_SSE1,
- _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
- prod_SSE1));
- t2_SSE2 = _mm_mul_ps(diff2_SSE2,
- _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
- prod_SSE2));
- t2_SSE3 = _mm_mul_ps(diff2_SSE3,
- _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
- prod_SSE3));
-
- t3_SSE0 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
- t3_SSE1 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
- t3_SSE2 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
- t3_SSE3 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
- t1_SSE0 = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
- t1_SSE1 = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
- t1_SSE2 = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
- t1_SSE3 = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
- t4_SSE0 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE0, lij_SSE0));
- t4_SSE1 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE1, lij_SSE1));
- t4_SSE2 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE2, lij_SSE2));
- t4_SSE3 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE3, lij_SSE3));
- t4_SSE0 = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
- t4_SSE1 = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
- t4_SSE2 = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
- t4_SSE3 = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
- t1_SSE0 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
- t1_SSE1 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
- t1_SSE2 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
- t1_SSE3 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
-
- sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
- sum_ai_SSE1 = _mm_add_ps(sum_ai_SSE1, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
- sum_ai_SSE2 = _mm_add_ps(sum_ai_SSE2, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
- sum_ai_SSE3 = _mm_add_ps(sum_ai_SSE3, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
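-            /* t1 is the closed-form HCT pair integral accumulated into
-             * sum_ai:
-             *   I = 0.5*( (L-U) + (U^2-L^2)*(r/4 - sk^2/(4r))
-             *             + ln(U/L)/(2r) + engulfed correction ),
-             * with U = 1/(r+sk_aj) and L as selected above; obc_mask1
-             * zeroes non-overlapping pairs before the add.
-             */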
-
- t1_SSE0 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
- _mm_mul_ps(prod_SSE0, lij3_SSE0));
- t1_SSE1 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
- _mm_mul_ps(prod_SSE1, lij3_SSE1));
- t1_SSE2 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
- _mm_mul_ps(prod_SSE2, lij3_SSE2));
- t1_SSE3 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
- _mm_mul_ps(prod_SSE3, lij3_SSE3));
- t1_SSE0 = _mm_sub_ps(t1_SSE0,
- _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
- _mm_mul_ps(lij3_SSE0, dr_SSE0))));
- t1_SSE1 = _mm_sub_ps(t1_SSE1,
- _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
- _mm_mul_ps(lij3_SSE1, dr_SSE1))));
- t1_SSE2 = _mm_sub_ps(t1_SSE2,
- _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
- _mm_mul_ps(lij3_SSE2, dr_SSE2))));
- t1_SSE3 = _mm_sub_ps(t1_SSE3,
- _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
- _mm_mul_ps(lij3_SSE3, dr_SSE3))));
-
- t2_SSE0 = _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
- _mm_mul_ps(uij3_SSE0, dr_SSE0)));
- t2_SSE1 = _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
- _mm_mul_ps(uij3_SSE1, dr_SSE1)));
- t2_SSE2 = _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
- _mm_mul_ps(uij3_SSE2, dr_SSE2)));
- t2_SSE3 = _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
- _mm_mul_ps(uij3_SSE3, dr_SSE3)));
- t2_SSE0 = _mm_sub_ps(t2_SSE0,
- _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
- _mm_mul_ps(prod_SSE0, uij3_SSE0)));
- t2_SSE1 = _mm_sub_ps(t2_SSE1,
- _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
- _mm_mul_ps(prod_SSE1, uij3_SSE1)));
- t2_SSE2 = _mm_sub_ps(t2_SSE2,
- _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
- _mm_mul_ps(prod_SSE2, uij3_SSE2)));
- t2_SSE3 = _mm_sub_ps(t2_SSE3,
- _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
- _mm_mul_ps(prod_SSE3, uij3_SSE3)));
- t3_SSE0 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
- _mm_mul_ps(rinv_SSE0, rinv_SSE0));
- t3_SSE1 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
- _mm_mul_ps(rinv_SSE1, rinv_SSE1));
- t3_SSE2 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
- _mm_mul_ps(rinv_SSE2, rinv_SSE2));
- t3_SSE3 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
- _mm_mul_ps(rinv_SSE3, rinv_SSE3));
- t3_SSE0 = _mm_sub_ps(t3_SSE0,
- _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
- _mm_add_ps(one_SSE,
- _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
- t3_SSE1 = _mm_sub_ps(t3_SSE1,
- _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
- _mm_add_ps(one_SSE,
- _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
- t3_SSE2 = _mm_sub_ps(t3_SSE2,
- _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
- _mm_add_ps(one_SSE,
- _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
- t3_SSE3 = _mm_sub_ps(t3_SSE3,
- _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
- _mm_add_ps(one_SSE,
- _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
-
- t1_SSE0 = _mm_mul_ps(rinv_SSE0,
- _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
- _mm_add_ps(t2_SSE0, t3_SSE0)));
- t1_SSE1 = _mm_mul_ps(rinv_SSE1,
- _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
- _mm_add_ps(t2_SSE1, t3_SSE1)));
- t1_SSE2 = _mm_mul_ps(rinv_SSE2,
- _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
- _mm_add_ps(t2_SSE2, t3_SSE2)));
- t1_SSE3 = _mm_mul_ps(rinv_SSE3,
- _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
- _mm_add_ps(t2_SSE3, t3_SSE3)));
-
- _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
- dadx += 4;
- _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
- dadx += 4;
- _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
- dadx += 4;
- _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
- dadx += 4;
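-            /* t1 now holds the r-derivative factor of the pair integral;
-             * it is streamed to dadx, from which the matching chain-rule
-             * kernel presumably rebuilds the dR/dx force contributions.
-             */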
-
- /* Evaluate influence of atom ai -> aj */
- t1_SSE0 = _mm_add_ps(dr_SSE0, sk_ai_SSE0);
- t1_SSE1 = _mm_add_ps(dr_SSE1, sk_ai_SSE1);
- t1_SSE2 = _mm_add_ps(dr_SSE2, sk_ai_SSE2);
- t1_SSE3 = _mm_add_ps(dr_SSE3, sk_ai_SSE3);
- t2_SSE0 = _mm_sub_ps(dr_SSE0, sk_ai_SSE0);
- t2_SSE1 = _mm_sub_ps(dr_SSE1, sk_ai_SSE1);
- t2_SSE2 = _mm_sub_ps(dr_SSE2, sk_ai_SSE2);
- t2_SSE3 = _mm_sub_ps(dr_SSE3, sk_ai_SSE3);
- t3_SSE0 = _mm_sub_ps(sk_ai_SSE0, dr_SSE0);
- t3_SSE1 = _mm_sub_ps(sk_ai_SSE1, dr_SSE1);
- t3_SSE2 = _mm_sub_ps(sk_ai_SSE2, dr_SSE2);
- t3_SSE3 = _mm_sub_ps(sk_ai_SSE3, dr_SSE3);
-
- obc_mask1_SSE0 = _mm_cmplt_ps(raj_SSE, t1_SSE0);
- obc_mask1_SSE1 = _mm_cmplt_ps(raj_SSE, t1_SSE1);
- obc_mask1_SSE2 = _mm_cmplt_ps(raj_SSE, t1_SSE2);
- obc_mask1_SSE3 = _mm_cmplt_ps(raj_SSE, t1_SSE3);
- obc_mask2_SSE0 = _mm_cmplt_ps(raj_SSE, t2_SSE0);
- obc_mask2_SSE1 = _mm_cmplt_ps(raj_SSE, t2_SSE1);
- obc_mask2_SSE2 = _mm_cmplt_ps(raj_SSE, t2_SSE2);
- obc_mask2_SSE3 = _mm_cmplt_ps(raj_SSE, t2_SSE3);
- obc_mask3_SSE0 = _mm_cmplt_ps(raj_SSE, t3_SSE0);
- obc_mask3_SSE1 = _mm_cmplt_ps(raj_SSE, t3_SSE1);
- obc_mask3_SSE2 = _mm_cmplt_ps(raj_SSE, t3_SSE2);
- obc_mask3_SSE3 = _mm_cmplt_ps(raj_SSE, t3_SSE3);
- obc_mask1_SSE0 = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0);
- obc_mask1_SSE1 = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1);
- obc_mask1_SSE2 = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2);
- obc_mask1_SSE3 = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3);
-
- uij_SSE0 = gmx_mm_inv_ps(t1_SSE0);
- uij_SSE1 = gmx_mm_inv_ps(t1_SSE1);
- uij_SSE2 = gmx_mm_inv_ps(t1_SSE2);
- uij_SSE3 = gmx_mm_inv_ps(t1_SSE3);
- lij_SSE0 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
- _mm_andnot_ps(obc_mask2_SSE0, raj_inv_SSE));
- lij_SSE1 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
- _mm_andnot_ps(obc_mask2_SSE1, raj_inv_SSE));
- lij_SSE2 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
- _mm_andnot_ps(obc_mask2_SSE2, raj_inv_SSE));
- lij_SSE3 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
- _mm_andnot_ps(obc_mask2_SSE3, raj_inv_SSE));
- dlij_SSE0 = _mm_and_ps(one_SSE, obc_mask2_SSE0);
- dlij_SSE1 = _mm_and_ps(one_SSE, obc_mask2_SSE1);
- dlij_SSE2 = _mm_and_ps(one_SSE, obc_mask2_SSE2);
- dlij_SSE3 = _mm_and_ps(one_SSE, obc_mask2_SSE3);
-
- uij2_SSE0 = _mm_mul_ps(uij_SSE0, uij_SSE0);
- uij2_SSE1 = _mm_mul_ps(uij_SSE1, uij_SSE1);
- uij2_SSE2 = _mm_mul_ps(uij_SSE2, uij_SSE2);
- uij2_SSE3 = _mm_mul_ps(uij_SSE3, uij_SSE3);
- uij3_SSE0 = _mm_mul_ps(uij2_SSE0, uij_SSE0);
- uij3_SSE1 = _mm_mul_ps(uij2_SSE1, uij_SSE1);
- uij3_SSE2 = _mm_mul_ps(uij2_SSE2, uij_SSE2);
- uij3_SSE3 = _mm_mul_ps(uij2_SSE3, uij_SSE3);
- lij2_SSE0 = _mm_mul_ps(lij_SSE0, lij_SSE0);
- lij2_SSE1 = _mm_mul_ps(lij_SSE1, lij_SSE1);
- lij2_SSE2 = _mm_mul_ps(lij_SSE2, lij_SSE2);
- lij2_SSE3 = _mm_mul_ps(lij_SSE3, lij_SSE3);
- lij3_SSE0 = _mm_mul_ps(lij2_SSE0, lij_SSE0);
- lij3_SSE1 = _mm_mul_ps(lij2_SSE1, lij_SSE1);
- lij3_SSE2 = _mm_mul_ps(lij2_SSE2, lij_SSE2);
- lij3_SSE3 = _mm_mul_ps(lij2_SSE3, lij_SSE3);
-
- diff2_SSE0 = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
- diff2_SSE1 = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
- diff2_SSE2 = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
- diff2_SSE3 = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
- lij_inv_SSE0 = gmx_mm_invsqrt_ps(lij2_SSE0);
- lij_inv_SSE1 = gmx_mm_invsqrt_ps(lij2_SSE1);
- lij_inv_SSE2 = gmx_mm_invsqrt_ps(lij2_SSE2);
- lij_inv_SSE3 = gmx_mm_invsqrt_ps(lij2_SSE3);
- sk2_rinv_SSE0 = _mm_mul_ps(sk2_ai_SSE0, rinv_SSE0);
- sk2_rinv_SSE1 = _mm_mul_ps(sk2_ai_SSE1, rinv_SSE1);
- sk2_rinv_SSE2 = _mm_mul_ps(sk2_ai_SSE2, rinv_SSE2);
- sk2_rinv_SSE3 = _mm_mul_ps(sk2_ai_SSE3, rinv_SSE3);
- prod_SSE0 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
- prod_SSE1 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
- prod_SSE2 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
- prod_SSE3 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
-
- logterm_SSE0 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
- logterm_SSE1 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
- logterm_SSE2 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
- logterm_SSE3 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
- t1_SSE0 = _mm_sub_ps(lij_SSE0, uij_SSE0);
- t1_SSE1 = _mm_sub_ps(lij_SSE1, uij_SSE1);
- t1_SSE2 = _mm_sub_ps(lij_SSE2, uij_SSE2);
- t1_SSE3 = _mm_sub_ps(lij_SSE3, uij_SSE3);
- t2_SSE0 = _mm_mul_ps(diff2_SSE0,
- _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
- prod_SSE0));
- t2_SSE1 = _mm_mul_ps(diff2_SSE1,
- _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
- prod_SSE1));
- t2_SSE2 = _mm_mul_ps(diff2_SSE2,
- _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
- prod_SSE2));
- t2_SSE3 = _mm_mul_ps(diff2_SSE3,
- _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
- prod_SSE3));
- t3_SSE0 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
- t3_SSE1 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
- t3_SSE2 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
- t3_SSE3 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
- t1_SSE0 = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
- t1_SSE1 = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
- t1_SSE2 = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
- t1_SSE3 = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
- t4_SSE0 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE0));
- t4_SSE1 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE1));
- t4_SSE2 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE2));
- t4_SSE3 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE3));
- t4_SSE0 = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
- t4_SSE1 = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
- t4_SSE2 = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
- t4_SSE3 = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
- t1_SSE0 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
- t1_SSE1 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
- t1_SSE2 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
- t1_SSE3 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
-
- _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
- gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0, obc_mask1_SSE0),
- _mm_and_ps(t1_SSE1, obc_mask1_SSE1),
- _mm_and_ps(t1_SSE2, obc_mask1_SSE2),
- _mm_and_ps(t1_SSE3, obc_mask1_SSE3))));
-
- t1_SSE0 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
- _mm_mul_ps(prod_SSE0, lij3_SSE0));
- t1_SSE1 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
- _mm_mul_ps(prod_SSE1, lij3_SSE1));
- t1_SSE2 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
- _mm_mul_ps(prod_SSE2, lij3_SSE2));
- t1_SSE3 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
- _mm_mul_ps(prod_SSE3, lij3_SSE3));
- t1_SSE0 = _mm_sub_ps(t1_SSE0,
- _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
- _mm_mul_ps(lij3_SSE0, dr_SSE0))));
- t1_SSE1 = _mm_sub_ps(t1_SSE1,
- _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
- _mm_mul_ps(lij3_SSE1, dr_SSE1))));
- t1_SSE2 = _mm_sub_ps(t1_SSE2,
- _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
- _mm_mul_ps(lij3_SSE2, dr_SSE2))));
- t1_SSE3 = _mm_sub_ps(t1_SSE3,
- _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
- _mm_mul_ps(lij3_SSE3, dr_SSE3))));
- t2_SSE0 = _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
- _mm_mul_ps(uij3_SSE0, dr_SSE0)));
- t2_SSE1 = _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
- _mm_mul_ps(uij3_SSE1, dr_SSE1)));
- t2_SSE2 = _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
- _mm_mul_ps(uij3_SSE2, dr_SSE2)));
- t2_SSE3 = _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
- _mm_mul_ps(uij3_SSE3, dr_SSE3)));
- t2_SSE0 = _mm_sub_ps(t2_SSE0,
- _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
- _mm_mul_ps(prod_SSE0, uij3_SSE0)));
- t2_SSE1 = _mm_sub_ps(t2_SSE1,
- _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
- _mm_mul_ps(prod_SSE1, uij3_SSE1)));
- t2_SSE2 = _mm_sub_ps(t2_SSE2,
- _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
- _mm_mul_ps(prod_SSE2, uij3_SSE2)));
- t2_SSE3 = _mm_sub_ps(t2_SSE3,
- _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
- _mm_mul_ps(prod_SSE3, uij3_SSE3)));
-
- t3_SSE0 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
- _mm_mul_ps(rinv_SSE0, rinv_SSE0));
- t3_SSE1 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
- _mm_mul_ps(rinv_SSE1, rinv_SSE1));
- t3_SSE2 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
- _mm_mul_ps(rinv_SSE2, rinv_SSE2));
- t3_SSE3 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
- _mm_mul_ps(rinv_SSE3, rinv_SSE3));
-
- t3_SSE0 = _mm_sub_ps(t3_SSE0,
- _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
- _mm_add_ps(one_SSE,
- _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
- t3_SSE1 = _mm_sub_ps(t3_SSE1,
- _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
- _mm_add_ps(one_SSE,
- _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
- t3_SSE2 = _mm_sub_ps(t3_SSE2,
- _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
- _mm_add_ps(one_SSE,
- _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
- t3_SSE3 = _mm_sub_ps(t3_SSE3,
- _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
- _mm_add_ps(one_SSE,
- _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
-
- t1_SSE0 = _mm_mul_ps(rinv_SSE0,
- _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
- _mm_add_ps(t2_SSE0, t3_SSE0)));
- t1_SSE1 = _mm_mul_ps(rinv_SSE1,
- _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
- _mm_add_ps(t2_SSE1, t3_SSE1)));
- t1_SSE2 = _mm_mul_ps(rinv_SSE2,
- _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
- _mm_add_ps(t2_SSE2, t3_SSE2)));
- t1_SSE3 = _mm_mul_ps(rinv_SSE3,
- _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
- _mm_add_ps(t2_SSE3, t3_SSE3)));
-
- _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
- dadx += 4;
- _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
- dadx += 4;
- _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
- dadx += 4;
- _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
- dadx += 4;
-
- }
-
- /* Main part, no exclusions */
- for (j = nj1; j < nj2; j += UNROLLJ)
- {
- /* load j atom coordinates */
- jx_SSE = _mm_load_ps(x_align+j);
- jy_SSE = _mm_load_ps(y_align+j);
- jz_SSE = _mm_load_ps(z_align+j);
-
- /* Calculate distance */
- dx_SSE0 = _mm_sub_ps(ix_SSE0, jx_SSE);
- dy_SSE0 = _mm_sub_ps(iy_SSE0, jy_SSE);
- dz_SSE0 = _mm_sub_ps(iz_SSE0, jz_SSE);
- dx_SSE1 = _mm_sub_ps(ix_SSE1, jx_SSE);
- dy_SSE1 = _mm_sub_ps(iy_SSE1, jy_SSE);
- dz_SSE1 = _mm_sub_ps(iz_SSE1, jz_SSE);
- dx_SSE2 = _mm_sub_ps(ix_SSE2, jx_SSE);
- dy_SSE2 = _mm_sub_ps(iy_SSE2, jy_SSE);
- dz_SSE2 = _mm_sub_ps(iz_SSE2, jz_SSE);
- dx_SSE3 = _mm_sub_ps(ix_SSE3, jx_SSE);
- dy_SSE3 = _mm_sub_ps(iy_SSE3, jy_SSE);
- dz_SSE3 = _mm_sub_ps(iz_SSE3, jz_SSE);
-
- /* rsq = dx*dx+dy*dy+dz*dz */
- rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
- rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
- rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
- rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
-
-            /* Calculate 1/r */
- rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0);
- rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1);
- rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2);
- rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3);
-
- /* Apply mask */
- rinv_SSE0 = _mm_and_ps(rinv_SSE0, imask_SSE0);
- rinv_SSE1 = _mm_and_ps(rinv_SSE1, imask_SSE1);
- rinv_SSE2 = _mm_and_ps(rinv_SSE2, imask_SSE2);
- rinv_SSE3 = _mm_and_ps(rinv_SSE3, imask_SSE3);
-
- dr_SSE0 = _mm_mul_ps(rsq_SSE0, rinv_SSE0);
- dr_SSE1 = _mm_mul_ps(rsq_SSE1, rinv_SSE1);
- dr_SSE2 = _mm_mul_ps(rsq_SSE2, rinv_SSE2);
- dr_SSE3 = _mm_mul_ps(rsq_SSE3, rinv_SSE3);
-
- sk_aj_SSE = _mm_load_ps(obc_param+j);
- raj_SSE = _mm_load_ps(gb_radius+j);
-
- raj_inv_SSE = gmx_mm_inv_ps(raj_SSE);
-
- /* Evaluate influence of atom aj -> ai */
- t1_SSE0 = _mm_add_ps(dr_SSE0, sk_aj_SSE);
- t1_SSE1 = _mm_add_ps(dr_SSE1, sk_aj_SSE);
- t1_SSE2 = _mm_add_ps(dr_SSE2, sk_aj_SSE);
- t1_SSE3 = _mm_add_ps(dr_SSE3, sk_aj_SSE);
- t2_SSE0 = _mm_sub_ps(dr_SSE0, sk_aj_SSE);
- t2_SSE1 = _mm_sub_ps(dr_SSE1, sk_aj_SSE);
- t2_SSE2 = _mm_sub_ps(dr_SSE2, sk_aj_SSE);
- t2_SSE3 = _mm_sub_ps(dr_SSE3, sk_aj_SSE);
- t3_SSE0 = _mm_sub_ps(sk_aj_SSE, dr_SSE0);
- t3_SSE1 = _mm_sub_ps(sk_aj_SSE, dr_SSE1);
- t3_SSE2 = _mm_sub_ps(sk_aj_SSE, dr_SSE2);
- t3_SSE3 = _mm_sub_ps(sk_aj_SSE, dr_SSE3);
-
- obc_mask1_SSE0 = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
- obc_mask1_SSE1 = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
- obc_mask1_SSE2 = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
- obc_mask1_SSE3 = _mm_cmplt_ps(rai_SSE3, t1_SSE3);
- obc_mask2_SSE0 = _mm_cmplt_ps(rai_SSE0, t2_SSE0);
- obc_mask2_SSE1 = _mm_cmplt_ps(rai_SSE1, t2_SSE1);
- obc_mask2_SSE2 = _mm_cmplt_ps(rai_SSE2, t2_SSE2);
- obc_mask2_SSE3 = _mm_cmplt_ps(rai_SSE3, t2_SSE3);
- obc_mask3_SSE0 = _mm_cmplt_ps(rai_SSE0, t3_SSE0);
- obc_mask3_SSE1 = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
- obc_mask3_SSE2 = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
- obc_mask3_SSE3 = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
- obc_mask1_SSE0 = _mm_and_ps(obc_mask1_SSE0, imask_SSE0);
- obc_mask1_SSE1 = _mm_and_ps(obc_mask1_SSE1, imask_SSE1);
- obc_mask1_SSE2 = _mm_and_ps(obc_mask1_SSE2, imask_SSE2);
- obc_mask1_SSE3 = _mm_and_ps(obc_mask1_SSE3, imask_SSE3);
-
- uij_SSE0 = gmx_mm_inv_ps(t1_SSE0);
- uij_SSE1 = gmx_mm_inv_ps(t1_SSE1);
- uij_SSE2 = gmx_mm_inv_ps(t1_SSE2);
- uij_SSE3 = gmx_mm_inv_ps(t1_SSE3);
- lij_SSE0 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
- _mm_andnot_ps(obc_mask2_SSE0, rai_inv_SSE0));
- lij_SSE1 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
- _mm_andnot_ps(obc_mask2_SSE1, rai_inv_SSE1));
- lij_SSE2 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
- _mm_andnot_ps(obc_mask2_SSE2, rai_inv_SSE2));
- lij_SSE3 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
- _mm_andnot_ps(obc_mask2_SSE3, rai_inv_SSE3));
- dlij_SSE0 = _mm_and_ps(one_SSE, obc_mask2_SSE0);
- dlij_SSE1 = _mm_and_ps(one_SSE, obc_mask2_SSE1);
- dlij_SSE2 = _mm_and_ps(one_SSE, obc_mask2_SSE2);
- dlij_SSE3 = _mm_and_ps(one_SSE, obc_mask2_SSE3);
-
- uij2_SSE0 = _mm_mul_ps(uij_SSE0, uij_SSE0);
- uij2_SSE1 = _mm_mul_ps(uij_SSE1, uij_SSE1);
- uij2_SSE2 = _mm_mul_ps(uij_SSE2, uij_SSE2);
- uij2_SSE3 = _mm_mul_ps(uij_SSE3, uij_SSE3);
- uij3_SSE0 = _mm_mul_ps(uij2_SSE0, uij_SSE0);
- uij3_SSE1 = _mm_mul_ps(uij2_SSE1, uij_SSE1);
- uij3_SSE2 = _mm_mul_ps(uij2_SSE2, uij_SSE2);
- uij3_SSE3 = _mm_mul_ps(uij2_SSE3, uij_SSE3);
- lij2_SSE0 = _mm_mul_ps(lij_SSE0, lij_SSE0);
- lij2_SSE1 = _mm_mul_ps(lij_SSE1, lij_SSE1);
- lij2_SSE2 = _mm_mul_ps(lij_SSE2, lij_SSE2);
- lij2_SSE3 = _mm_mul_ps(lij_SSE3, lij_SSE3);
- lij3_SSE0 = _mm_mul_ps(lij2_SSE0, lij_SSE0);
- lij3_SSE1 = _mm_mul_ps(lij2_SSE1, lij_SSE1);
- lij3_SSE2 = _mm_mul_ps(lij2_SSE2, lij_SSE2);
- lij3_SSE3 = _mm_mul_ps(lij2_SSE3, lij_SSE3);
-
- diff2_SSE0 = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
- diff2_SSE1 = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
- diff2_SSE2 = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
- diff2_SSE3 = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
- lij_inv_SSE0 = gmx_mm_invsqrt_ps(lij2_SSE0);
- lij_inv_SSE1 = gmx_mm_invsqrt_ps(lij2_SSE1);
- lij_inv_SSE2 = gmx_mm_invsqrt_ps(lij2_SSE2);
- lij_inv_SSE3 = gmx_mm_invsqrt_ps(lij2_SSE3);
- sk2_aj_SSE = _mm_mul_ps(sk_aj_SSE, sk_aj_SSE);
- sk2_rinv_SSE0 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE0);
- sk2_rinv_SSE1 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE1);
- sk2_rinv_SSE2 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE2);
- sk2_rinv_SSE3 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE3);
- prod_SSE0 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
- prod_SSE1 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
- prod_SSE2 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
- prod_SSE3 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
-
- logterm_SSE0 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
- logterm_SSE1 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
- logterm_SSE2 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
- logterm_SSE3 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
-
- t1_SSE0 = _mm_sub_ps(lij_SSE0, uij_SSE0);
- t1_SSE1 = _mm_sub_ps(lij_SSE1, uij_SSE1);
- t1_SSE2 = _mm_sub_ps(lij_SSE2, uij_SSE2);
- t1_SSE3 = _mm_sub_ps(lij_SSE3, uij_SSE3);
- t2_SSE0 = _mm_mul_ps(diff2_SSE0,
- _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
- prod_SSE0));
- t2_SSE1 = _mm_mul_ps(diff2_SSE1,
- _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
- prod_SSE1));
- t2_SSE2 = _mm_mul_ps(diff2_SSE2,
- _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
- prod_SSE2));
- t2_SSE3 = _mm_mul_ps(diff2_SSE3,
- _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
- prod_SSE3));
-
- t3_SSE0 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
- t3_SSE1 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
- t3_SSE2 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
- t3_SSE3 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
- t1_SSE0 = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
- t1_SSE1 = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
- t1_SSE2 = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
- t1_SSE3 = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
- t4_SSE0 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE0, lij_SSE0));
- t4_SSE1 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE1, lij_SSE1));
- t4_SSE2 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE2, lij_SSE2));
- t4_SSE3 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE3, lij_SSE3));
- t4_SSE0 = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
- t4_SSE1 = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
- t4_SSE2 = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
- t4_SSE3 = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
- t1_SSE0 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
- t1_SSE1 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
- t1_SSE2 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
- t1_SSE3 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
-
- sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
- sum_ai_SSE1 = _mm_add_ps(sum_ai_SSE1, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
- sum_ai_SSE2 = _mm_add_ps(sum_ai_SSE2, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
- sum_ai_SSE3 = _mm_add_ps(sum_ai_SSE3, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
-
- t1_SSE0 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
- _mm_mul_ps(prod_SSE0, lij3_SSE0));
- t1_SSE1 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
- _mm_mul_ps(prod_SSE1, lij3_SSE1));
- t1_SSE2 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
- _mm_mul_ps(prod_SSE2, lij3_SSE2));
- t1_SSE3 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
- _mm_mul_ps(prod_SSE3, lij3_SSE3));
- t1_SSE0 = _mm_sub_ps(t1_SSE0,
- _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
- _mm_mul_ps(lij3_SSE0, dr_SSE0))));
- t1_SSE1 = _mm_sub_ps(t1_SSE1,
- _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
- _mm_mul_ps(lij3_SSE1, dr_SSE1))));
- t1_SSE2 = _mm_sub_ps(t1_SSE2,
- _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
- _mm_mul_ps(lij3_SSE2, dr_SSE2))));
- t1_SSE3 = _mm_sub_ps(t1_SSE3,
- _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
- _mm_mul_ps(lij3_SSE3, dr_SSE3))));
-
- t2_SSE0 = _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
- _mm_mul_ps(uij3_SSE0, dr_SSE0)));
- t2_SSE1 = _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
- _mm_mul_ps(uij3_SSE1, dr_SSE1)));
- t2_SSE2 = _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
- _mm_mul_ps(uij3_SSE2, dr_SSE2)));
- t2_SSE3 = _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
- _mm_mul_ps(uij3_SSE3, dr_SSE3)));
- t2_SSE0 = _mm_sub_ps(t2_SSE0,
- _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
- _mm_mul_ps(prod_SSE0, uij3_SSE0)));
- t2_SSE1 = _mm_sub_ps(t2_SSE1,
- _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
- _mm_mul_ps(prod_SSE1, uij3_SSE1)));
- t2_SSE2 = _mm_sub_ps(t2_SSE2,
- _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
- _mm_mul_ps(prod_SSE2, uij3_SSE2)));
- t2_SSE3 = _mm_sub_ps(t2_SSE3,
- _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
- _mm_mul_ps(prod_SSE3, uij3_SSE3)));
- t3_SSE0 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
- _mm_mul_ps(rinv_SSE0, rinv_SSE0));
- t3_SSE1 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
- _mm_mul_ps(rinv_SSE1, rinv_SSE1));
- t3_SSE2 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
- _mm_mul_ps(rinv_SSE2, rinv_SSE2));
- t3_SSE3 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
- _mm_mul_ps(rinv_SSE3, rinv_SSE3));
- t3_SSE0 = _mm_sub_ps(t3_SSE0,
- _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
- _mm_add_ps(one_SSE,
- _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
- t3_SSE1 = _mm_sub_ps(t3_SSE1,
- _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
- _mm_add_ps(one_SSE,
- _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
- t3_SSE2 = _mm_sub_ps(t3_SSE2,
- _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
- _mm_add_ps(one_SSE,
- _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
- t3_SSE3 = _mm_sub_ps(t3_SSE3,
- _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
- _mm_add_ps(one_SSE,
- _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
-
- t1_SSE0 = _mm_mul_ps(rinv_SSE0,
- _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
- _mm_add_ps(t2_SSE0, t3_SSE0)));
- t1_SSE1 = _mm_mul_ps(rinv_SSE1,
- _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
- _mm_add_ps(t2_SSE1, t3_SSE1)));
- t1_SSE2 = _mm_mul_ps(rinv_SSE2,
- _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
- _mm_add_ps(t2_SSE2, t3_SSE2)));
- t1_SSE3 = _mm_mul_ps(rinv_SSE3,
- _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
- _mm_add_ps(t2_SSE3, t3_SSE3)));
-
- _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
- dadx += 4;
- _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
- dadx += 4;
- _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
- dadx += 4;
- _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
- dadx += 4;
-
- /* Evaluate influence of atom ai -> aj */
- t1_SSE0 = _mm_add_ps(dr_SSE0, sk_ai_SSE0);
- t1_SSE1 = _mm_add_ps(dr_SSE1, sk_ai_SSE1);
- t1_SSE2 = _mm_add_ps(dr_SSE2, sk_ai_SSE2);
- t1_SSE3 = _mm_add_ps(dr_SSE3, sk_ai_SSE3);
- t2_SSE0 = _mm_sub_ps(dr_SSE0, sk_ai_SSE0);
- t2_SSE1 = _mm_sub_ps(dr_SSE1, sk_ai_SSE1);
- t2_SSE2 = _mm_sub_ps(dr_SSE2, sk_ai_SSE2);
- t2_SSE3 = _mm_sub_ps(dr_SSE3, sk_ai_SSE3);
- t3_SSE0 = _mm_sub_ps(sk_ai_SSE0, dr_SSE0);
- t3_SSE1 = _mm_sub_ps(sk_ai_SSE1, dr_SSE1);
- t3_SSE2 = _mm_sub_ps(sk_ai_SSE2, dr_SSE2);
- t3_SSE3 = _mm_sub_ps(sk_ai_SSE3, dr_SSE3);
-
- obc_mask1_SSE0 = _mm_cmplt_ps(raj_SSE, t1_SSE0);
- obc_mask1_SSE1 = _mm_cmplt_ps(raj_SSE, t1_SSE1);
- obc_mask1_SSE2 = _mm_cmplt_ps(raj_SSE, t1_SSE2);
- obc_mask1_SSE3 = _mm_cmplt_ps(raj_SSE, t1_SSE3);
- obc_mask2_SSE0 = _mm_cmplt_ps(raj_SSE, t2_SSE0);
- obc_mask2_SSE1 = _mm_cmplt_ps(raj_SSE, t2_SSE1);
- obc_mask2_SSE2 = _mm_cmplt_ps(raj_SSE, t2_SSE2);
- obc_mask2_SSE3 = _mm_cmplt_ps(raj_SSE, t2_SSE3);
- obc_mask3_SSE0 = _mm_cmplt_ps(raj_SSE, t3_SSE0);
- obc_mask3_SSE1 = _mm_cmplt_ps(raj_SSE, t3_SSE1);
- obc_mask3_SSE2 = _mm_cmplt_ps(raj_SSE, t3_SSE2);
- obc_mask3_SSE3 = _mm_cmplt_ps(raj_SSE, t3_SSE3);
- obc_mask1_SSE0 = _mm_and_ps(obc_mask1_SSE0, imask_SSE0);
- obc_mask1_SSE1 = _mm_and_ps(obc_mask1_SSE1, imask_SSE1);
- obc_mask1_SSE2 = _mm_and_ps(obc_mask1_SSE2, imask_SSE2);
- obc_mask1_SSE3 = _mm_and_ps(obc_mask1_SSE3, imask_SSE3);
-
- uij_SSE0 = gmx_mm_inv_ps(t1_SSE0);
- uij_SSE1 = gmx_mm_inv_ps(t1_SSE1);
- uij_SSE2 = gmx_mm_inv_ps(t1_SSE2);
- uij_SSE3 = gmx_mm_inv_ps(t1_SSE3);
- lij_SSE0 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
- _mm_andnot_ps(obc_mask2_SSE0, raj_inv_SSE));
- lij_SSE1 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
- _mm_andnot_ps(obc_mask2_SSE1, raj_inv_SSE));
- lij_SSE2 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
- _mm_andnot_ps(obc_mask2_SSE2, raj_inv_SSE));
- lij_SSE3 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
- _mm_andnot_ps(obc_mask2_SSE3, raj_inv_SSE));
- dlij_SSE0 = _mm_and_ps(one_SSE, obc_mask2_SSE0);
- dlij_SSE1 = _mm_and_ps(one_SSE, obc_mask2_SSE1);
- dlij_SSE2 = _mm_and_ps(one_SSE, obc_mask2_SSE2);
- dlij_SSE3 = _mm_and_ps(one_SSE, obc_mask2_SSE3);
-
- uij2_SSE0 = _mm_mul_ps(uij_SSE0, uij_SSE0);
- uij2_SSE1 = _mm_mul_ps(uij_SSE1, uij_SSE1);
- uij2_SSE2 = _mm_mul_ps(uij_SSE2, uij_SSE2);
- uij2_SSE3 = _mm_mul_ps(uij_SSE3, uij_SSE3);
- uij3_SSE0 = _mm_mul_ps(uij2_SSE0, uij_SSE0);
- uij3_SSE1 = _mm_mul_ps(uij2_SSE1, uij_SSE1);
- uij3_SSE2 = _mm_mul_ps(uij2_SSE2, uij_SSE2);
- uij3_SSE3 = _mm_mul_ps(uij2_SSE3, uij_SSE3);
- lij2_SSE0 = _mm_mul_ps(lij_SSE0, lij_SSE0);
- lij2_SSE1 = _mm_mul_ps(lij_SSE1, lij_SSE1);
- lij2_SSE2 = _mm_mul_ps(lij_SSE2, lij_SSE2);
- lij2_SSE3 = _mm_mul_ps(lij_SSE3, lij_SSE3);
- lij3_SSE0 = _mm_mul_ps(lij2_SSE0, lij_SSE0);
- lij3_SSE1 = _mm_mul_ps(lij2_SSE1, lij_SSE1);
- lij3_SSE2 = _mm_mul_ps(lij2_SSE2, lij_SSE2);
- lij3_SSE3 = _mm_mul_ps(lij2_SSE3, lij_SSE3);
-
- diff2_SSE0 = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
- diff2_SSE1 = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
- diff2_SSE2 = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
- diff2_SSE3 = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
- lij_inv_SSE0 = gmx_mm_invsqrt_ps(lij2_SSE0);
- lij_inv_SSE1 = gmx_mm_invsqrt_ps(lij2_SSE1);
- lij_inv_SSE2 = gmx_mm_invsqrt_ps(lij2_SSE2);
- lij_inv_SSE3 = gmx_mm_invsqrt_ps(lij2_SSE3);
- sk2_rinv_SSE0 = _mm_mul_ps(sk2_ai_SSE0, rinv_SSE0);
- sk2_rinv_SSE1 = _mm_mul_ps(sk2_ai_SSE1, rinv_SSE1);
- sk2_rinv_SSE2 = _mm_mul_ps(sk2_ai_SSE2, rinv_SSE2);
- sk2_rinv_SSE3 = _mm_mul_ps(sk2_ai_SSE3, rinv_SSE3);
- prod_SSE0 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
- prod_SSE1 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
- prod_SSE2 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
- prod_SSE3 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
-
- logterm_SSE0 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
- logterm_SSE1 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
- logterm_SSE2 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
- logterm_SSE3 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
- t1_SSE0 = _mm_sub_ps(lij_SSE0, uij_SSE0);
- t1_SSE1 = _mm_sub_ps(lij_SSE1, uij_SSE1);
- t1_SSE2 = _mm_sub_ps(lij_SSE2, uij_SSE2);
- t1_SSE3 = _mm_sub_ps(lij_SSE3, uij_SSE3);
- t2_SSE0 = _mm_mul_ps(diff2_SSE0,
- _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
- prod_SSE0));
- t2_SSE1 = _mm_mul_ps(diff2_SSE1,
- _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
- prod_SSE1));
- t2_SSE2 = _mm_mul_ps(diff2_SSE2,
- _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
- prod_SSE2));
- t2_SSE3 = _mm_mul_ps(diff2_SSE3,
- _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
- prod_SSE3));
- t3_SSE0 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
- t3_SSE1 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
- t3_SSE2 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
- t3_SSE3 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
- t1_SSE0 = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
- t1_SSE1 = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
- t1_SSE2 = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
- t1_SSE3 = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
- t4_SSE0 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE0));
- t4_SSE1 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE1));
- t4_SSE2 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE2));
- t4_SSE3 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE3));
- t4_SSE0 = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
- t4_SSE1 = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
- t4_SSE2 = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
- t4_SSE3 = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
- t1_SSE0 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
- t1_SSE1 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
- t1_SSE2 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
- t1_SSE3 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
-
- _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
- gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0, obc_mask1_SSE0),
- _mm_and_ps(t1_SSE1, obc_mask1_SSE1),
- _mm_and_ps(t1_SSE2, obc_mask1_SSE2),
- _mm_and_ps(t1_SSE3, obc_mask1_SSE3))));
-
- t1_SSE0 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
- _mm_mul_ps(prod_SSE0, lij3_SSE0));
- t1_SSE1 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
- _mm_mul_ps(prod_SSE1, lij3_SSE1));
- t1_SSE2 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
- _mm_mul_ps(prod_SSE2, lij3_SSE2));
- t1_SSE3 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
- _mm_mul_ps(prod_SSE3, lij3_SSE3));
- t1_SSE0 = _mm_sub_ps(t1_SSE0,
- _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
- _mm_mul_ps(lij3_SSE0, dr_SSE0))));
- t1_SSE1 = _mm_sub_ps(t1_SSE1,
- _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
- _mm_mul_ps(lij3_SSE1, dr_SSE1))));
- t1_SSE2 = _mm_sub_ps(t1_SSE2,
- _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
- _mm_mul_ps(lij3_SSE2, dr_SSE2))));
- t1_SSE3 = _mm_sub_ps(t1_SSE3,
- _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
- _mm_mul_ps(lij3_SSE3, dr_SSE3))));
- t2_SSE0 = _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
- _mm_mul_ps(uij3_SSE0, dr_SSE0)));
- t2_SSE1 = _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
- _mm_mul_ps(uij3_SSE1, dr_SSE1)));
- t2_SSE2 = _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
- _mm_mul_ps(uij3_SSE2, dr_SSE2)));
- t2_SSE3 = _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
- _mm_mul_ps(uij3_SSE3, dr_SSE3)));
- t2_SSE0 = _mm_sub_ps(t2_SSE0,
- _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
- _mm_mul_ps(prod_SSE0, uij3_SSE0)));
- t2_SSE1 = _mm_sub_ps(t2_SSE1,
- _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
- _mm_mul_ps(prod_SSE1, uij3_SSE1)));
- t2_SSE2 = _mm_sub_ps(t2_SSE2,
- _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
- _mm_mul_ps(prod_SSE2, uij3_SSE2)));
- t2_SSE3 = _mm_sub_ps(t2_SSE3,
- _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
- _mm_mul_ps(prod_SSE3, uij3_SSE3)));
-
- t3_SSE0 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
- _mm_mul_ps(rinv_SSE0, rinv_SSE0));
- t3_SSE1 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
- _mm_mul_ps(rinv_SSE1, rinv_SSE1));
- t3_SSE2 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
- _mm_mul_ps(rinv_SSE2, rinv_SSE2));
- t3_SSE3 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
- _mm_mul_ps(rinv_SSE3, rinv_SSE3));
-
- t3_SSE0 = _mm_sub_ps(t3_SSE0,
- _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
- _mm_add_ps(one_SSE,
- _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
- t3_SSE1 = _mm_sub_ps(t3_SSE1,
- _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
- _mm_add_ps(one_SSE,
- _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
- t3_SSE2 = _mm_sub_ps(t3_SSE2,
- _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
- _mm_add_ps(one_SSE,
- _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
- t3_SSE3 = _mm_sub_ps(t3_SSE3,
- _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
- _mm_add_ps(one_SSE,
- _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
-
- t1_SSE0 = _mm_mul_ps(rinv_SSE0,
- _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
- _mm_add_ps(t2_SSE0, t3_SSE0)));
- t1_SSE1 = _mm_mul_ps(rinv_SSE1,
- _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
- _mm_add_ps(t2_SSE1, t3_SSE1)));
- t1_SSE2 = _mm_mul_ps(rinv_SSE2,
- _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
- _mm_add_ps(t2_SSE2, t3_SSE2)));
- t1_SSE3 = _mm_mul_ps(rinv_SSE3,
- _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
- _mm_add_ps(t2_SSE3, t3_SSE3)));
-
- _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
- dadx += 4;
- _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
- dadx += 4;
- _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
- dadx += 4;
- _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
- dadx += 4;
- }
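The lij blends above are the standard SSE2 select idiom, result = mask ? a : b, composed from AND/ANDNOT/OR because SSE2 has no blend instruction (SSE4.1's _mm_blendv_ps does the same in one operation). The idiom in isolation, as used throughout these kernels:

#include <xmmintrin.h>

/* SSE2 select: returns (mask & a) | (~mask & b), i.e. lanes of a where
 * the comparison mask is all-ones and lanes of b where it is all-zeros. */
static __m128
sse2_select_ps(__m128 mask, __m128 a, __m128 b)
{
    return _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b));
}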
-
- /* Epilogue part, including exclusion mask */
- for (j = nj2; j < nj3; j += UNROLLJ)
- {
- jmask_SSE0 = _mm_load_ps((real *)emask0);
- jmask_SSE1 = _mm_load_ps((real *)emask1);
- jmask_SSE2 = _mm_load_ps((real *)emask2);
- jmask_SSE3 = _mm_load_ps((real *)emask3);
- emask0 += UNROLLJ;
- emask1 += UNROLLJ;
- emask2 += UNROLLJ;
- emask3 += UNROLLJ;
-
- /* load j atom coordinates */
- jx_SSE = _mm_load_ps(x_align+j);
- jy_SSE = _mm_load_ps(y_align+j);
- jz_SSE = _mm_load_ps(z_align+j);
-
- /* Calculate distance */
- dx_SSE0 = _mm_sub_ps(ix_SSE0, jx_SSE);
- dy_SSE0 = _mm_sub_ps(iy_SSE0, jy_SSE);
- dz_SSE0 = _mm_sub_ps(iz_SSE0, jz_SSE);
- dx_SSE1 = _mm_sub_ps(ix_SSE1, jx_SSE);
- dy_SSE1 = _mm_sub_ps(iy_SSE1, jy_SSE);
- dz_SSE1 = _mm_sub_ps(iz_SSE1, jz_SSE);
- dx_SSE2 = _mm_sub_ps(ix_SSE2, jx_SSE);
- dy_SSE2 = _mm_sub_ps(iy_SSE2, jy_SSE);
- dz_SSE2 = _mm_sub_ps(iz_SSE2, jz_SSE);
- dx_SSE3 = _mm_sub_ps(ix_SSE3, jx_SSE);
- dy_SSE3 = _mm_sub_ps(iy_SSE3, jy_SSE);
- dz_SSE3 = _mm_sub_ps(iz_SSE3, jz_SSE);
-
- /* rsq = dx*dx+dy*dy+dz*dz */
- rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
- rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
- rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
- rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
-
- /* Combine masks */
- jmask_SSE0 = _mm_and_ps(jmask_SSE0, imask_SSE0);
- jmask_SSE1 = _mm_and_ps(jmask_SSE1, imask_SSE1);
- jmask_SSE2 = _mm_and_ps(jmask_SSE2, imask_SSE2);
- jmask_SSE3 = _mm_and_ps(jmask_SSE3, imask_SSE3);
-
- /* Calculate 1/r and 1/r2 */
- rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0);
- rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1);
- rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2);
- rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3);
-
- /* Apply mask */
- rinv_SSE0 = _mm_and_ps(rinv_SSE0, jmask_SSE0);
- rinv_SSE1 = _mm_and_ps(rinv_SSE1, jmask_SSE1);
- rinv_SSE2 = _mm_and_ps(rinv_SSE2, jmask_SSE2);
- rinv_SSE3 = _mm_and_ps(rinv_SSE3, jmask_SSE3);
-
- dr_SSE0 = _mm_mul_ps(rsq_SSE0, rinv_SSE0);
- dr_SSE1 = _mm_mul_ps(rsq_SSE1, rinv_SSE1);
- dr_SSE2 = _mm_mul_ps(rsq_SSE2, rinv_SSE2);
- dr_SSE3 = _mm_mul_ps(rsq_SSE3, rinv_SSE3);
-
- sk_aj_SSE = _mm_load_ps(obc_param+j);
- raj_SSE = _mm_load_ps(gb_radius+j);
-
- raj_inv_SSE = gmx_mm_inv_ps(raj_SSE);
-
- /* Evaluate influence of atom aj -> ai */
- t1_SSE0 = _mm_add_ps(dr_SSE0, sk_aj_SSE);
- t1_SSE1 = _mm_add_ps(dr_SSE1, sk_aj_SSE);
- t1_SSE2 = _mm_add_ps(dr_SSE2, sk_aj_SSE);
- t1_SSE3 = _mm_add_ps(dr_SSE3, sk_aj_SSE);
- t2_SSE0 = _mm_sub_ps(dr_SSE0, sk_aj_SSE);
- t2_SSE1 = _mm_sub_ps(dr_SSE1, sk_aj_SSE);
- t2_SSE2 = _mm_sub_ps(dr_SSE2, sk_aj_SSE);
- t2_SSE3 = _mm_sub_ps(dr_SSE3, sk_aj_SSE);
- t3_SSE0 = _mm_sub_ps(sk_aj_SSE, dr_SSE0);
- t3_SSE1 = _mm_sub_ps(sk_aj_SSE, dr_SSE1);
- t3_SSE2 = _mm_sub_ps(sk_aj_SSE, dr_SSE2);
- t3_SSE3 = _mm_sub_ps(sk_aj_SSE, dr_SSE3);
-
- obc_mask1_SSE0 = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
- obc_mask1_SSE1 = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
- obc_mask1_SSE2 = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
- obc_mask1_SSE3 = _mm_cmplt_ps(rai_SSE3, t1_SSE3);
- obc_mask2_SSE0 = _mm_cmplt_ps(rai_SSE0, t2_SSE0);
- obc_mask2_SSE1 = _mm_cmplt_ps(rai_SSE1, t2_SSE1);
- obc_mask2_SSE2 = _mm_cmplt_ps(rai_SSE2, t2_SSE2);
- obc_mask2_SSE3 = _mm_cmplt_ps(rai_SSE3, t2_SSE3);
- obc_mask3_SSE0 = _mm_cmplt_ps(rai_SSE0, t3_SSE0);
- obc_mask3_SSE1 = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
- obc_mask3_SSE2 = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
- obc_mask3_SSE3 = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
- obc_mask1_SSE0 = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0);
- obc_mask1_SSE1 = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1);
- obc_mask1_SSE2 = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2);
- obc_mask1_SSE3 = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3);
-
- uij_SSE0 = gmx_mm_inv_ps(t1_SSE0);
- uij_SSE1 = gmx_mm_inv_ps(t1_SSE1);
- uij_SSE2 = gmx_mm_inv_ps(t1_SSE2);
- uij_SSE3 = gmx_mm_inv_ps(t1_SSE3);
- lij_SSE0 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
- _mm_andnot_ps(obc_mask2_SSE0, rai_inv_SSE0));
- lij_SSE1 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
- _mm_andnot_ps(obc_mask2_SSE1, rai_inv_SSE1));
- lij_SSE2 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
- _mm_andnot_ps(obc_mask2_SSE2, rai_inv_SSE2));
- lij_SSE3 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
- _mm_andnot_ps(obc_mask2_SSE3, rai_inv_SSE3));
- dlij_SSE0 = _mm_and_ps(one_SSE, obc_mask2_SSE0);
- dlij_SSE1 = _mm_and_ps(one_SSE, obc_mask2_SSE1);
- dlij_SSE2 = _mm_and_ps(one_SSE, obc_mask2_SSE2);
- dlij_SSE3 = _mm_and_ps(one_SSE, obc_mask2_SSE3);
-
- uij2_SSE0 = _mm_mul_ps(uij_SSE0, uij_SSE0);
- uij2_SSE1 = _mm_mul_ps(uij_SSE1, uij_SSE1);
- uij2_SSE2 = _mm_mul_ps(uij_SSE2, uij_SSE2);
- uij2_SSE3 = _mm_mul_ps(uij_SSE3, uij_SSE3);
- uij3_SSE0 = _mm_mul_ps(uij2_SSE0, uij_SSE0);
- uij3_SSE1 = _mm_mul_ps(uij2_SSE1, uij_SSE1);
- uij3_SSE2 = _mm_mul_ps(uij2_SSE2, uij_SSE2);
- uij3_SSE3 = _mm_mul_ps(uij2_SSE3, uij_SSE3);
- lij2_SSE0 = _mm_mul_ps(lij_SSE0, lij_SSE0);
- lij2_SSE1 = _mm_mul_ps(lij_SSE1, lij_SSE1);
- lij2_SSE2 = _mm_mul_ps(lij_SSE2, lij_SSE2);
- lij2_SSE3 = _mm_mul_ps(lij_SSE3, lij_SSE3);
- lij3_SSE0 = _mm_mul_ps(lij2_SSE0, lij_SSE0);
- lij3_SSE1 = _mm_mul_ps(lij2_SSE1, lij_SSE1);
- lij3_SSE2 = _mm_mul_ps(lij2_SSE2, lij_SSE2);
- lij3_SSE3 = _mm_mul_ps(lij2_SSE3, lij_SSE3);
-
- diff2_SSE0 = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
- diff2_SSE1 = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
- diff2_SSE2 = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
- diff2_SSE3 = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
- lij_inv_SSE0 = gmx_mm_invsqrt_ps(lij2_SSE0);
- lij_inv_SSE1 = gmx_mm_invsqrt_ps(lij2_SSE1);
- lij_inv_SSE2 = gmx_mm_invsqrt_ps(lij2_SSE2);
- lij_inv_SSE3 = gmx_mm_invsqrt_ps(lij2_SSE3);
- sk2_aj_SSE = _mm_mul_ps(sk_aj_SSE, sk_aj_SSE);
- sk2_rinv_SSE0 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE0);
- sk2_rinv_SSE1 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE1);
- sk2_rinv_SSE2 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE2);
- sk2_rinv_SSE3 = _mm_mul_ps(sk2_aj_SSE, rinv_SSE3);
- prod_SSE0 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
- prod_SSE1 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
- prod_SSE2 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
- prod_SSE3 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
-
- logterm_SSE0 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
- logterm_SSE1 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
- logterm_SSE2 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
- logterm_SSE3 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
-
- t1_SSE0 = _mm_sub_ps(lij_SSE0, uij_SSE0);
- t1_SSE1 = _mm_sub_ps(lij_SSE1, uij_SSE1);
- t1_SSE2 = _mm_sub_ps(lij_SSE2, uij_SSE2);
- t1_SSE3 = _mm_sub_ps(lij_SSE3, uij_SSE3);
- t2_SSE0 = _mm_mul_ps(diff2_SSE0,
- _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
- prod_SSE0));
- t2_SSE1 = _mm_mul_ps(diff2_SSE1,
- _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
- prod_SSE1));
- t2_SSE2 = _mm_mul_ps(diff2_SSE2,
- _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
- prod_SSE2));
- t2_SSE3 = _mm_mul_ps(diff2_SSE3,
- _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
- prod_SSE3));
-
- t3_SSE0 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
- t3_SSE1 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
- t3_SSE2 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
- t3_SSE3 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
- t1_SSE0 = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
- t1_SSE1 = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
- t1_SSE2 = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
- t1_SSE3 = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
- t4_SSE0 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE0, lij_SSE0));
- t4_SSE1 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE1, lij_SSE1));
- t4_SSE2 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE2, lij_SSE2));
- t4_SSE3 = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE3, lij_SSE3));
- t4_SSE0 = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
- t4_SSE1 = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
- t4_SSE2 = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
- t4_SSE3 = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
- t1_SSE0 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
- t1_SSE1 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
- t1_SSE2 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
- t1_SSE3 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
-
- sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
- sum_ai_SSE1 = _mm_add_ps(sum_ai_SSE1, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
- sum_ai_SSE2 = _mm_add_ps(sum_ai_SSE2, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
- sum_ai_SSE3 = _mm_add_ps(sum_ai_SSE3, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
-
- t1_SSE0 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
- _mm_mul_ps(prod_SSE0, lij3_SSE0));
- t1_SSE1 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
- _mm_mul_ps(prod_SSE1, lij3_SSE1));
- t1_SSE2 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
- _mm_mul_ps(prod_SSE2, lij3_SSE2));
- t1_SSE3 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
- _mm_mul_ps(prod_SSE3, lij3_SSE3));
- t1_SSE0 = _mm_sub_ps(t1_SSE0,
- _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
- _mm_mul_ps(lij3_SSE0, dr_SSE0))));
- t1_SSE1 = _mm_sub_ps(t1_SSE1,
- _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
- _mm_mul_ps(lij3_SSE1, dr_SSE1))));
- t1_SSE2 = _mm_sub_ps(t1_SSE2,
- _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
- _mm_mul_ps(lij3_SSE2, dr_SSE2))));
- t1_SSE3 = _mm_sub_ps(t1_SSE3,
- _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
- _mm_mul_ps(lij3_SSE3, dr_SSE3))));
-
- t2_SSE0 = _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
- _mm_mul_ps(uij3_SSE0, dr_SSE0)));
- t2_SSE1 = _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
- _mm_mul_ps(uij3_SSE1, dr_SSE1)));
- t2_SSE2 = _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
- _mm_mul_ps(uij3_SSE2, dr_SSE2)));
- t2_SSE3 = _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
- _mm_mul_ps(uij3_SSE3, dr_SSE3)));
- t2_SSE0 = _mm_sub_ps(t2_SSE0,
- _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
- _mm_mul_ps(prod_SSE0, uij3_SSE0)));
- t2_SSE1 = _mm_sub_ps(t2_SSE1,
- _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
- _mm_mul_ps(prod_SSE1, uij3_SSE1)));
- t2_SSE2 = _mm_sub_ps(t2_SSE2,
- _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
- _mm_mul_ps(prod_SSE2, uij3_SSE2)));
- t2_SSE3 = _mm_sub_ps(t2_SSE3,
- _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
- _mm_mul_ps(prod_SSE3, uij3_SSE3)));
- t3_SSE0 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
- _mm_mul_ps(rinv_SSE0, rinv_SSE0));
- t3_SSE1 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
- _mm_mul_ps(rinv_SSE1, rinv_SSE1));
- t3_SSE2 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
- _mm_mul_ps(rinv_SSE2, rinv_SSE2));
- t3_SSE3 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
- _mm_mul_ps(rinv_SSE3, rinv_SSE3));
- t3_SSE0 = _mm_sub_ps(t3_SSE0,
- _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
- _mm_add_ps(one_SSE,
- _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
- t3_SSE1 = _mm_sub_ps(t3_SSE1,
- _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
- _mm_add_ps(one_SSE,
- _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
- t3_SSE2 = _mm_sub_ps(t3_SSE2,
- _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
- _mm_add_ps(one_SSE,
- _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
- t3_SSE3 = _mm_sub_ps(t3_SSE3,
- _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
- _mm_add_ps(one_SSE,
- _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
-
- t1_SSE0 = _mm_mul_ps(rinv_SSE0,
- _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
- _mm_add_ps(t2_SSE0, t3_SSE0)));
- t1_SSE1 = _mm_mul_ps(rinv_SSE1,
- _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
- _mm_add_ps(t2_SSE1, t3_SSE1)));
- t1_SSE2 = _mm_mul_ps(rinv_SSE2,
- _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
- _mm_add_ps(t2_SSE2, t3_SSE2)));
- t1_SSE3 = _mm_mul_ps(rinv_SSE3,
- _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
- _mm_add_ps(t2_SSE3, t3_SSE3)));
-
- _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
- dadx += 4;
- _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
- dadx += 4;
- _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
- dadx += 4;
- _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
- dadx += 4;
-
- /* Evaluate influence of atom ai -> aj */
- t1_SSE0 = _mm_add_ps(dr_SSE0, sk_ai_SSE0);
- t1_SSE1 = _mm_add_ps(dr_SSE1, sk_ai_SSE1);
- t1_SSE2 = _mm_add_ps(dr_SSE2, sk_ai_SSE2);
- t1_SSE3 = _mm_add_ps(dr_SSE3, sk_ai_SSE3);
- t2_SSE0 = _mm_sub_ps(dr_SSE0, sk_ai_SSE0);
- t2_SSE1 = _mm_sub_ps(dr_SSE1, sk_ai_SSE1);
- t2_SSE2 = _mm_sub_ps(dr_SSE2, sk_ai_SSE2);
- t2_SSE3 = _mm_sub_ps(dr_SSE3, sk_ai_SSE3);
- t3_SSE0 = _mm_sub_ps(sk_ai_SSE0, dr_SSE0);
- t3_SSE1 = _mm_sub_ps(sk_ai_SSE1, dr_SSE1);
- t3_SSE2 = _mm_sub_ps(sk_ai_SSE2, dr_SSE2);
- t3_SSE3 = _mm_sub_ps(sk_ai_SSE3, dr_SSE3);
-
- obc_mask1_SSE0 = _mm_cmplt_ps(raj_SSE, t1_SSE0);
- obc_mask1_SSE1 = _mm_cmplt_ps(raj_SSE, t1_SSE1);
- obc_mask1_SSE2 = _mm_cmplt_ps(raj_SSE, t1_SSE2);
- obc_mask1_SSE3 = _mm_cmplt_ps(raj_SSE, t1_SSE3);
- obc_mask2_SSE0 = _mm_cmplt_ps(raj_SSE, t2_SSE0);
- obc_mask2_SSE1 = _mm_cmplt_ps(raj_SSE, t2_SSE1);
- obc_mask2_SSE2 = _mm_cmplt_ps(raj_SSE, t2_SSE2);
- obc_mask2_SSE3 = _mm_cmplt_ps(raj_SSE, t2_SSE3);
- obc_mask3_SSE0 = _mm_cmplt_ps(raj_SSE, t3_SSE0);
- obc_mask3_SSE1 = _mm_cmplt_ps(raj_SSE, t3_SSE1);
- obc_mask3_SSE2 = _mm_cmplt_ps(raj_SSE, t3_SSE2);
- obc_mask3_SSE3 = _mm_cmplt_ps(raj_SSE, t3_SSE3);
- obc_mask1_SSE0 = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0);
- obc_mask1_SSE1 = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1);
- obc_mask1_SSE2 = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2);
- obc_mask1_SSE3 = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3);
-
- uij_SSE0 = gmx_mm_inv_ps(t1_SSE0);
- uij_SSE1 = gmx_mm_inv_ps(t1_SSE1);
- uij_SSE2 = gmx_mm_inv_ps(t1_SSE2);
- uij_SSE3 = gmx_mm_inv_ps(t1_SSE3);
- lij_SSE0 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
- _mm_andnot_ps(obc_mask2_SSE0, raj_inv_SSE));
- lij_SSE1 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
- _mm_andnot_ps(obc_mask2_SSE1, raj_inv_SSE));
- lij_SSE2 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
- _mm_andnot_ps(obc_mask2_SSE2, raj_inv_SSE));
- lij_SSE3 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
- _mm_andnot_ps(obc_mask2_SSE3, raj_inv_SSE));
- dlij_SSE0 = _mm_and_ps(one_SSE, obc_mask2_SSE0);
- dlij_SSE1 = _mm_and_ps(one_SSE, obc_mask2_SSE1);
- dlij_SSE2 = _mm_and_ps(one_SSE, obc_mask2_SSE2);
- dlij_SSE3 = _mm_and_ps(one_SSE, obc_mask2_SSE3);
-
- uij2_SSE0 = _mm_mul_ps(uij_SSE0, uij_SSE0);
- uij2_SSE1 = _mm_mul_ps(uij_SSE1, uij_SSE1);
- uij2_SSE2 = _mm_mul_ps(uij_SSE2, uij_SSE2);
- uij2_SSE3 = _mm_mul_ps(uij_SSE3, uij_SSE3);
- uij3_SSE0 = _mm_mul_ps(uij2_SSE0, uij_SSE0);
- uij3_SSE1 = _mm_mul_ps(uij2_SSE1, uij_SSE1);
- uij3_SSE2 = _mm_mul_ps(uij2_SSE2, uij_SSE2);
- uij3_SSE3 = _mm_mul_ps(uij2_SSE3, uij_SSE3);
- lij2_SSE0 = _mm_mul_ps(lij_SSE0, lij_SSE0);
- lij2_SSE1 = _mm_mul_ps(lij_SSE1, lij_SSE1);
- lij2_SSE2 = _mm_mul_ps(lij_SSE2, lij_SSE2);
- lij2_SSE3 = _mm_mul_ps(lij_SSE3, lij_SSE3);
- lij3_SSE0 = _mm_mul_ps(lij2_SSE0, lij_SSE0);
- lij3_SSE1 = _mm_mul_ps(lij2_SSE1, lij_SSE1);
- lij3_SSE2 = _mm_mul_ps(lij2_SSE2, lij_SSE2);
- lij3_SSE3 = _mm_mul_ps(lij2_SSE3, lij_SSE3);
-
- diff2_SSE0 = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
- diff2_SSE1 = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
- diff2_SSE2 = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
- diff2_SSE3 = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
- lij_inv_SSE0 = gmx_mm_invsqrt_ps(lij2_SSE0);
- lij_inv_SSE1 = gmx_mm_invsqrt_ps(lij2_SSE1);
- lij_inv_SSE2 = gmx_mm_invsqrt_ps(lij2_SSE2);
- lij_inv_SSE3 = gmx_mm_invsqrt_ps(lij2_SSE3);
- sk2_rinv_SSE0 = _mm_mul_ps(sk2_ai_SSE0, rinv_SSE0);
- sk2_rinv_SSE1 = _mm_mul_ps(sk2_ai_SSE1, rinv_SSE1);
- sk2_rinv_SSE2 = _mm_mul_ps(sk2_ai_SSE2, rinv_SSE2);
- sk2_rinv_SSE3 = _mm_mul_ps(sk2_ai_SSE3, rinv_SSE3);
- prod_SSE0 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
- prod_SSE1 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
- prod_SSE2 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
- prod_SSE3 = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
-
- logterm_SSE0 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
- logterm_SSE1 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
- logterm_SSE2 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
- logterm_SSE3 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
- t1_SSE0 = _mm_sub_ps(lij_SSE0, uij_SSE0);
- t1_SSE1 = _mm_sub_ps(lij_SSE1, uij_SSE1);
- t1_SSE2 = _mm_sub_ps(lij_SSE2, uij_SSE2);
- t1_SSE3 = _mm_sub_ps(lij_SSE3, uij_SSE3);
- t2_SSE0 = _mm_mul_ps(diff2_SSE0,
- _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
- prod_SSE0));
- t2_SSE1 = _mm_mul_ps(diff2_SSE1,
- _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
- prod_SSE1));
- t2_SSE2 = _mm_mul_ps(diff2_SSE2,
- _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
- prod_SSE2));
- t2_SSE3 = _mm_mul_ps(diff2_SSE3,
- _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
- prod_SSE3));
- t3_SSE0 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
- t3_SSE1 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
- t3_SSE2 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
- t3_SSE3 = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
- t1_SSE0 = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
- t1_SSE1 = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
- t1_SSE2 = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
- t1_SSE3 = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
- t4_SSE0 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE0));
- t4_SSE1 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE1));
- t4_SSE2 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE2));
- t4_SSE3 = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE3));
- t4_SSE0 = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
- t4_SSE1 = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
- t4_SSE2 = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
- t4_SSE3 = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
- t1_SSE0 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
- t1_SSE1 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
- t1_SSE2 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
- t1_SSE3 = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
-
- _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
- gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0, obc_mask1_SSE0),
- _mm_and_ps(t1_SSE1, obc_mask1_SSE1),
- _mm_and_ps(t1_SSE2, obc_mask1_SSE2),
- _mm_and_ps(t1_SSE3, obc_mask1_SSE3))));
-
- t1_SSE0 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
- _mm_mul_ps(prod_SSE0, lij3_SSE0));
- t1_SSE1 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
- _mm_mul_ps(prod_SSE1, lij3_SSE1));
- t1_SSE2 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
- _mm_mul_ps(prod_SSE2, lij3_SSE2));
- t1_SSE3 = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
- _mm_mul_ps(prod_SSE3, lij3_SSE3));
- t1_SSE0 = _mm_sub_ps(t1_SSE0,
- _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
- _mm_mul_ps(lij3_SSE0, dr_SSE0))));
- t1_SSE1 = _mm_sub_ps(t1_SSE1,
- _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
- _mm_mul_ps(lij3_SSE1, dr_SSE1))));
- t1_SSE2 = _mm_sub_ps(t1_SSE2,
- _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
- _mm_mul_ps(lij3_SSE2, dr_SSE2))));
- t1_SSE3 = _mm_sub_ps(t1_SSE3,
- _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
- _mm_mul_ps(lij3_SSE3, dr_SSE3))));
- t2_SSE0 = _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
- _mm_mul_ps(uij3_SSE0, dr_SSE0)));
- t2_SSE1 = _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
- _mm_mul_ps(uij3_SSE1, dr_SSE1)));
- t2_SSE2 = _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
- _mm_mul_ps(uij3_SSE2, dr_SSE2)));
- t2_SSE3 = _mm_mul_ps(onefourth_SSE,
- _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
- _mm_mul_ps(uij3_SSE3, dr_SSE3)));
- t2_SSE0 = _mm_sub_ps(t2_SSE0,
- _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
- _mm_mul_ps(prod_SSE0, uij3_SSE0)));
- t2_SSE1 = _mm_sub_ps(t2_SSE1,
- _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
- _mm_mul_ps(prod_SSE1, uij3_SSE1)));
- t2_SSE2 = _mm_sub_ps(t2_SSE2,
- _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
- _mm_mul_ps(prod_SSE2, uij3_SSE2)));
- t2_SSE3 = _mm_sub_ps(t2_SSE3,
- _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
- _mm_mul_ps(prod_SSE3, uij3_SSE3)));
-
- t3_SSE0 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
- _mm_mul_ps(rinv_SSE0, rinv_SSE0));
- t3_SSE1 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
- _mm_mul_ps(rinv_SSE1, rinv_SSE1));
- t3_SSE2 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
- _mm_mul_ps(rinv_SSE2, rinv_SSE2));
- t3_SSE3 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
- _mm_mul_ps(rinv_SSE3, rinv_SSE3));
-
- t3_SSE0 = _mm_sub_ps(t3_SSE0,
- _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
- _mm_add_ps(one_SSE,
- _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
- t3_SSE1 = _mm_sub_ps(t3_SSE1,
- _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
- _mm_add_ps(one_SSE,
- _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
- t3_SSE2 = _mm_sub_ps(t3_SSE2,
- _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
- _mm_add_ps(one_SSE,
- _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
- t3_SSE3 = _mm_sub_ps(t3_SSE3,
- _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
- _mm_add_ps(one_SSE,
- _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
-
-
- t1_SSE0 = _mm_mul_ps(rinv_SSE0,
- _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
- _mm_add_ps(t2_SSE0, t3_SSE0)));
- t1_SSE1 = _mm_mul_ps(rinv_SSE1,
- _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
- _mm_add_ps(t2_SSE1, t3_SSE1)));
- t1_SSE2 = _mm_mul_ps(rinv_SSE2,
- _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
- _mm_add_ps(t2_SSE2, t3_SSE2)));
- t1_SSE3 = _mm_mul_ps(rinv_SSE3,
- _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
- _mm_add_ps(t2_SSE3, t3_SSE3)));
-
- _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
- dadx += 4;
- _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
- dadx += 4;
- _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
- dadx += 4;
- _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
- dadx += 4;
- }
- _MM_TRANSPOSE4_PS(sum_ai_SSE0, sum_ai_SSE1, sum_ai_SSE2, sum_ai_SSE3);
- sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0, sum_ai_SSE1);
- sum_ai_SSE2 = _mm_add_ps(sum_ai_SSE2, sum_ai_SSE3);
- sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0, sum_ai_SSE2);
- _mm_store_ps(work+i, _mm_add_ps(sum_ai_SSE0, _mm_load_ps(work+i)));
- }
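The transpose-and-add sequence just above is the standard horizontal reduction for four independent accumulators: after _MM_TRANSPOSE4_PS, lane n of each register belongs to accumulator n, so three vertical adds produce all four horizontal sums at once. The same pattern closes the chain-rule kernel further down. A standalone sketch:

#include <xmmintrin.h>

/* Reduce four accumulators so that lane n of the result holds the
 * horizontal sum of accumulator n (the macro clobbers its arguments,
 * which is fine for by-value locals). */
static __m128
hsum4_ps(__m128 a0, __m128 a1, __m128 a2, __m128 a3)
{
    _MM_TRANSPOSE4_PS(a0, a1, a2, a3);
    a0 = _mm_add_ps(a0, a1);
    a2 = _mm_add_ps(a2, a3);
    return _mm_add_ps(a0, a2);
}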
-
-
- for (i = 0; i < natoms/2+1; i++)
- {
- work[i] += work[natoms+i];
- }
-
- /* Parallel summations would go here if ever implemented with DD */
-
- if (gb_algorithm == egbHCT)
- {
- /* HCT */
- for (i = 0; i < natoms; i++)
- {
- if (born->use[i] != 0)
- {
- rai = top->atomtypes.gb_radius[mdatoms->typeA[i]]-born->gb_doffset;
- sum_ai = 1.0/rai - work[i];
- min_rad = rai + born->gb_doffset;
- rad = 1.0/sum_ai;
-
- born->bRad[i] = rad > min_rad ? rad : min_rad;
- fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
- }
- }
-
- }
- else
- {
- /* OBC */
-
- /* Calculate the radii */
- for (i = 0; i < natoms; i++)
- {
-
- if (born->use[i] != 0)
- {
- rai = top->atomtypes.gb_radius[mdatoms->typeA[i]];
- rai_inv2 = 1.0/rai;
- rai = rai-born->gb_doffset;
- rai_inv = 1.0/rai;
- sum_ai = rai * work[i];
- sum_ai2 = sum_ai * sum_ai;
- sum_ai3 = sum_ai2 * sum_ai;
-
- tsum = tanh(born->obc_alpha*sum_ai-born->obc_beta*sum_ai2+born->obc_gamma*sum_ai3);
- born->bRad[i] = rai_inv - tsum*rai_inv2;
- born->bRad[i] = 1.0 / born->bRad[i];
-
- fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
-
- tchain = rai * (born->obc_alpha-2*born->obc_beta*sum_ai+3*born->obc_gamma*sum_ai2);
- born->drobc[i] = (1.0-tsum*tsum)*tchain*rai_inv2;
- }
- }
- }
-
- return 0;
-}
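For orientation in the 4x4-unrolled intrinsics above: each lane evaluates one HCT/OBC descreening pair term, with the three distance regimes encoded by obc_mask1-3. A scalar reconstruction of a single aj -> ai term follows (variable names mirror the SSE temporaries; this is a sketch derived from the intrinsics, not the literal removed code):

#include <math.h>

/* One HCT/OBC pair term: returns the contribution to sum_ai and stores
 * the derivative factor that the chain-rule kernel later multiplies in.
 * dr is the interatomic distance, sk_aj the descreening parameter of aj,
 * rai/rai_inv the dielectric-offset radius of ai and its inverse. */
static double
hct_obc_pair(double dr, double sk_aj, double rai, double rai_inv,
             double *dadx)
{
    double rinv, uij, lij, dlij, uij2, uij3, lij2, lij3;
    double diff2, sk2_rinv, prod, logterm, t1, t2, t3, t4, sum;

    if (rai >= dr + sk_aj)     /* obc_mask1 false: aj cannot descreen ai */
    {
        *dadx = 0.0;
        return 0.0;
    }
    rinv     = 1.0/dr;
    uij      = 1.0/(dr + sk_aj);
    lij      = (rai < dr - sk_aj) ? 1.0/(dr - sk_aj) : rai_inv; /* obc_mask2 */
    dlij     = (rai < dr - sk_aj) ? 1.0 : 0.0;
    uij2     = uij*uij;
    uij3     = uij2*uij;
    lij2     = lij*lij;
    lij3     = lij2*lij;
    diff2    = uij2 - lij2;
    sk2_rinv = sk_aj*sk_aj*rinv;
    prod     = 0.25*sk2_rinv;
    logterm  = log(uij/lij);

    t1  = lij - uij;
    t2  = diff2*(0.25*dr - prod);
    t3  = 0.5*rinv*logterm;
    t4  = (rai < sk_aj - dr) ? 2.0*(rai_inv - lij) : 0.0;       /* obc_mask3 */
    sum = 0.5*(t1 + t2 + t3 + t4);

    /* derivative factor, stored as dadx in the kernels above */
    t1    = 0.5*lij2 + prod*lij3 - 0.25*(lij*rinv + lij3*dr);
    t2    = 0.25*(uij*rinv + uij3*dr) - (0.5*uij2 + prod*uij3);
    t3    = 0.25*logterm*rinv*rinv - 0.125*diff2*(1.0 + sk2_rinv*rinv);
    *dadx = rinv*(dlij*t1 + t2 + t3);

    return sum;
}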
-
-
-int
-genborn_allvsall_calc_chainrule_sse2_single(t_forcerec * fr,
- t_mdatoms * mdatoms,
- gmx_genborn_t * born,
- real * x,
- real * f,
- int gb_algorithm,
- void * paadata)
-{
- gmx_allvsallgb2_data_t *aadata;
- int natoms;
- int ni0, ni1;
- int nj0, nj1, nj2, nj3;
- int i, j, k, n;
- int idx;
- int * mask;
- int * pmask0;
- int * emask0;
- int * jindex;
-
- real ix, iy, iz;
- real fix, fiy, fiz;
- real jx, jy, jz;
- real dx, dy, dz;
- real tx, ty, tz;
- real rbai, rbaj, fgb, fgb_ai, rbi;
- real * rb;
- real * dadx;
- real * x_align;
- real * y_align;
- real * z_align;
- real * fx_align;
- real * fy_align;
- real * fz_align;
- real tmpsum[4];
-
- __m128 jmask_SSE0, jmask_SSE1, jmask_SSE2, jmask_SSE3;
- __m128 ix_SSE0, iy_SSE0, iz_SSE0;
- __m128 ix_SSE1, iy_SSE1, iz_SSE1;
- __m128 ix_SSE2, iy_SSE2, iz_SSE2;
- __m128 ix_SSE3, iy_SSE3, iz_SSE3;
- __m128 fix_SSE0, fiy_SSE0, fiz_SSE0;
- __m128 fix_SSE1, fiy_SSE1, fiz_SSE1;
- __m128 fix_SSE2, fiy_SSE2, fiz_SSE2;
- __m128 fix_SSE3, fiy_SSE3, fiz_SSE3;
- __m128 rbai_SSE0, rbai_SSE1, rbai_SSE2, rbai_SSE3;
- __m128 imask_SSE0, imask_SSE1, imask_SSE2, imask_SSE3;
- __m128 jx_SSE, jy_SSE, jz_SSE, rbaj_SSE;
- __m128 dx_SSE0, dy_SSE0, dz_SSE0;
- __m128 dx_SSE1, dy_SSE1, dz_SSE1;
- __m128 dx_SSE2, dy_SSE2, dz_SSE2;
- __m128 dx_SSE3, dy_SSE3, dz_SSE3;
- __m128 fgb_SSE0, fgb_ai_SSE0;
- __m128 fgb_SSE1, fgb_ai_SSE1;
- __m128 fgb_SSE2, fgb_ai_SSE2;
- __m128 fgb_SSE3, fgb_ai_SSE3;
- __m128 tx_SSE0, ty_SSE0, tz_SSE0;
- __m128 tx_SSE1, ty_SSE1, tz_SSE1;
- __m128 tx_SSE2, ty_SSE2, tz_SSE2;
- __m128 tx_SSE3, ty_SSE3, tz_SSE3;
- __m128 t1, t2;
-
- natoms = mdatoms->nr;
- ni0 = 0;
- ni1 = mdatoms->homenr;
- dadx = fr->dadx;
-
- aadata = (gmx_allvsallgb2_data_t *)paadata;
-
- x_align = aadata->x_align;
- y_align = aadata->y_align;
- z_align = aadata->z_align;
- fx_align = aadata->fx_align;
- fy_align = aadata->fy_align;
- fz_align = aadata->fz_align;
-
- jindex = aadata->jindex_gb;
-
- n = 0;
- rb = aadata->work;
-
-    /* Precompute the algorithm-specific Born-radius prefactor used by the chain rule */
- if (gb_algorithm == egbSTILL)
- {
- for (i = 0; i < natoms; i++)
- {
- rbi = born->bRad[i];
- rb[i] = (2 * rbi * rbi * fr->dvda[i])/ONE_4PI_EPS0;
- }
- }
- else if (gb_algorithm == egbHCT)
- {
- for (i = 0; i < natoms; i++)
- {
- rbi = born->bRad[i];
- rb[i] = rbi * rbi * fr->dvda[i];
- }
- }
- else if (gb_algorithm == egbOBC)
- {
- for (idx = 0; idx < natoms; idx++)
- {
- rbi = born->bRad[idx];
- rb[idx] = rbi * rbi * born->drobc[idx] * fr->dvda[idx];
- }
- }
-
- for (i = 0; i < 2*natoms; i++)
- {
- fx_align[i] = 0;
- fy_align[i] = 0;
- fz_align[i] = 0;
- }
-
-
- for (i = 0; i < natoms; i++)
- {
- rb[i+natoms] = rb[i];
- }
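This copy completes the mirrored-array scheme used by all the all-vs-all kernels: j indices run cyclically past natoms, so every per-atom input lives twice in a 2*natoms array (as rb does here), and the 2*natoms output arrays are folded back onto the real atoms at the end of the routine. The fold in isolation (hypothetical helper name, same pattern as the final loop of this function):

/* Fold the mirrored upper half of a 2*natoms accumulator back onto
 * the first natoms entries. */
static void
fold_mirrored(real *acc, int natoms)
{
    int i;

    for (i = 0; i < natoms; i++)
    {
        acc[i] += acc[natoms + i];
    }
}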
-
- for (i = ni0; i < ni1; i += UNROLLI)
- {
- /* We assume shifts are NOT used for all-vs-all interactions */
-
- /* Load i atom data */
- ix_SSE0 = _mm_load1_ps(x_align+i);
- iy_SSE0 = _mm_load1_ps(y_align+i);
- iz_SSE0 = _mm_load1_ps(z_align+i);
- ix_SSE1 = _mm_load1_ps(x_align+i+1);
- iy_SSE1 = _mm_load1_ps(y_align+i+1);
- iz_SSE1 = _mm_load1_ps(z_align+i+1);
- ix_SSE2 = _mm_load1_ps(x_align+i+2);
- iy_SSE2 = _mm_load1_ps(y_align+i+2);
- iz_SSE2 = _mm_load1_ps(z_align+i+2);
- ix_SSE3 = _mm_load1_ps(x_align+i+3);
- iy_SSE3 = _mm_load1_ps(y_align+i+3);
- iz_SSE3 = _mm_load1_ps(z_align+i+3);
-
- fix_SSE0 = _mm_setzero_ps();
- fiy_SSE0 = _mm_setzero_ps();
- fiz_SSE0 = _mm_setzero_ps();
- fix_SSE1 = _mm_setzero_ps();
- fiy_SSE1 = _mm_setzero_ps();
- fiz_SSE1 = _mm_setzero_ps();
- fix_SSE2 = _mm_setzero_ps();
- fiy_SSE2 = _mm_setzero_ps();
- fiz_SSE2 = _mm_setzero_ps();
- fix_SSE3 = _mm_setzero_ps();
- fiy_SSE3 = _mm_setzero_ps();
- fiz_SSE3 = _mm_setzero_ps();
-
- rbai_SSE0 = _mm_load1_ps(rb+i);
- rbai_SSE1 = _mm_load1_ps(rb+i+1);
- rbai_SSE2 = _mm_load1_ps(rb+i+2);
- rbai_SSE3 = _mm_load1_ps(rb+i+3);
-
- /* Load limits for loop over neighbors */
- nj0 = jindex[4*i];
- nj3 = jindex[4*i+3];
-
- /* No masks necessary, since the stored chain rule derivatives will be zero in those cases! */
- for (j = nj0; j < nj3; j += UNROLLJ)
- {
- /* load j atom coordinates */
- jx_SSE = _mm_load_ps(x_align+j);
- jy_SSE = _mm_load_ps(y_align+j);
- jz_SSE = _mm_load_ps(z_align+j);
-
- /* Calculate distance */
- dx_SSE0 = _mm_sub_ps(ix_SSE0, jx_SSE);
- dy_SSE0 = _mm_sub_ps(iy_SSE0, jy_SSE);
- dz_SSE0 = _mm_sub_ps(iz_SSE0, jz_SSE);
- dx_SSE1 = _mm_sub_ps(ix_SSE1, jx_SSE);
- dy_SSE1 = _mm_sub_ps(iy_SSE1, jy_SSE);
- dz_SSE1 = _mm_sub_ps(iz_SSE1, jz_SSE);
- dx_SSE2 = _mm_sub_ps(ix_SSE2, jx_SSE);
- dy_SSE2 = _mm_sub_ps(iy_SSE2, jy_SSE);
- dz_SSE2 = _mm_sub_ps(iz_SSE2, jz_SSE);
- dx_SSE3 = _mm_sub_ps(ix_SSE3, jx_SSE);
- dy_SSE3 = _mm_sub_ps(iy_SSE3, jy_SSE);
- dz_SSE3 = _mm_sub_ps(iz_SSE3, jz_SSE);
-
- rbaj_SSE = _mm_load_ps(rb+j);
-
- fgb_SSE0 = _mm_mul_ps(rbai_SSE0, _mm_load_ps(dadx));
- dadx += 4;
- fgb_SSE1 = _mm_mul_ps(rbai_SSE1, _mm_load_ps(dadx));
- dadx += 4;
- fgb_SSE2 = _mm_mul_ps(rbai_SSE2, _mm_load_ps(dadx));
- dadx += 4;
- fgb_SSE3 = _mm_mul_ps(rbai_SSE3, _mm_load_ps(dadx));
- dadx += 4;
-
- fgb_ai_SSE0 = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx));
- dadx += 4;
- fgb_ai_SSE1 = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx));
- dadx += 4;
- fgb_ai_SSE2 = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx));
- dadx += 4;
- fgb_ai_SSE3 = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx));
- dadx += 4;
-
- /* Total force between ai and aj is the sum of ai->aj and aj->ai */
- fgb_SSE0 = _mm_add_ps(fgb_SSE0, fgb_ai_SSE0);
- fgb_SSE1 = _mm_add_ps(fgb_SSE1, fgb_ai_SSE1);
- fgb_SSE2 = _mm_add_ps(fgb_SSE2, fgb_ai_SSE2);
- fgb_SSE3 = _mm_add_ps(fgb_SSE3, fgb_ai_SSE3);
-
- /* Calculate temporary vectorial force */
- tx_SSE0 = _mm_mul_ps(fgb_SSE0, dx_SSE0);
- ty_SSE0 = _mm_mul_ps(fgb_SSE0, dy_SSE0);
- tz_SSE0 = _mm_mul_ps(fgb_SSE0, dz_SSE0);
- tx_SSE1 = _mm_mul_ps(fgb_SSE1, dx_SSE1);
- ty_SSE1 = _mm_mul_ps(fgb_SSE1, dy_SSE1);
- tz_SSE1 = _mm_mul_ps(fgb_SSE1, dz_SSE1);
- tx_SSE2 = _mm_mul_ps(fgb_SSE2, dx_SSE2);
- ty_SSE2 = _mm_mul_ps(fgb_SSE2, dy_SSE2);
- tz_SSE2 = _mm_mul_ps(fgb_SSE2, dz_SSE2);
- tx_SSE3 = _mm_mul_ps(fgb_SSE3, dx_SSE3);
- ty_SSE3 = _mm_mul_ps(fgb_SSE3, dy_SSE3);
- tz_SSE3 = _mm_mul_ps(fgb_SSE3, dz_SSE3);
-
- /* Increment i atom force */
- fix_SSE0 = _mm_add_ps(fix_SSE0, tx_SSE0);
- fiy_SSE0 = _mm_add_ps(fiy_SSE0, ty_SSE0);
- fiz_SSE0 = _mm_add_ps(fiz_SSE0, tz_SSE0);
- fix_SSE1 = _mm_add_ps(fix_SSE1, tx_SSE1);
- fiy_SSE1 = _mm_add_ps(fiy_SSE1, ty_SSE1);
- fiz_SSE1 = _mm_add_ps(fiz_SSE1, tz_SSE1);
- fix_SSE2 = _mm_add_ps(fix_SSE2, tx_SSE2);
- fiy_SSE2 = _mm_add_ps(fiy_SSE2, ty_SSE2);
- fiz_SSE2 = _mm_add_ps(fiz_SSE2, tz_SSE2);
- fix_SSE3 = _mm_add_ps(fix_SSE3, tx_SSE3);
- fiy_SSE3 = _mm_add_ps(fiy_SSE3, ty_SSE3);
- fiz_SSE3 = _mm_add_ps(fiz_SSE3, tz_SSE3);
-
- /* Decrement j atom force */
- _mm_store_ps(fx_align+j,
- _mm_sub_ps( _mm_load_ps(fx_align+j), gmx_mm_sum4_ps(tx_SSE0, tx_SSE1, tx_SSE2, tx_SSE3) ));
- _mm_store_ps(fy_align+j,
- _mm_sub_ps( _mm_load_ps(fy_align+j), gmx_mm_sum4_ps(ty_SSE0, ty_SSE1, ty_SSE2, ty_SSE3) ));
- _mm_store_ps(fz_align+j,
- _mm_sub_ps( _mm_load_ps(fz_align+j), gmx_mm_sum4_ps(tz_SSE0, tz_SSE1, tz_SSE2, tz_SSE3) ));
- }
-        /* Add i forces back to the aligned force arrays (no shift forces for all-vs-all) */
- _MM_TRANSPOSE4_PS(fix_SSE0, fix_SSE1, fix_SSE2, fix_SSE3);
- fix_SSE0 = _mm_add_ps(fix_SSE0, fix_SSE1);
- fix_SSE2 = _mm_add_ps(fix_SSE2, fix_SSE3);
- fix_SSE0 = _mm_add_ps(fix_SSE0, fix_SSE2);
- _mm_store_ps(fx_align+i, _mm_add_ps(fix_SSE0, _mm_load_ps(fx_align+i)));
-
- _MM_TRANSPOSE4_PS(fiy_SSE0, fiy_SSE1, fiy_SSE2, fiy_SSE3);
- fiy_SSE0 = _mm_add_ps(fiy_SSE0, fiy_SSE1);
- fiy_SSE2 = _mm_add_ps(fiy_SSE2, fiy_SSE3);
- fiy_SSE0 = _mm_add_ps(fiy_SSE0, fiy_SSE2);
- _mm_store_ps(fy_align+i, _mm_add_ps(fiy_SSE0, _mm_load_ps(fy_align+i)));
-
- _MM_TRANSPOSE4_PS(fiz_SSE0, fiz_SSE1, fiz_SSE2, fiz_SSE3);
- fiz_SSE0 = _mm_add_ps(fiz_SSE0, fiz_SSE1);
- fiz_SSE2 = _mm_add_ps(fiz_SSE2, fiz_SSE3);
- fiz_SSE0 = _mm_add_ps(fiz_SSE0, fiz_SSE2);
- _mm_store_ps(fz_align+i, _mm_add_ps(fiz_SSE0, _mm_load_ps(fz_align+i)));
- }
-
- for (i = 0; i < natoms; i++)
- {
- f[3*i] += fx_align[i] + fx_align[natoms+i];
- f[3*i+1] += fy_align[i] + fy_align[natoms+i];
- f[3*i+2] += fz_align[i] + fz_align[natoms+i];
- }
-
- return 0;
-}
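Per pair, the vector loop above is just the scalar chain rule below: the two dadx factors stored by the radii kernel (one for ai -> aj, one for aj -> ai) are weighted by the precomputed rb terms and turned into a central force along the connecting vector. No masking is needed because excluded or out-of-range pairs stored zero derivatives. A scalar sketch with hypothetical names:

/* One chain-rule pair update. dadx_ij and dadx_ji are the stored
 * derivative factors for the two directions of the pair (i, j). */
static void
chainrule_pair(const real *rb, int i, int j,
               real dadx_ij, real dadx_ji,
               real dx, real dy, real dz,
               real *fi, real *fj)
{
    real fgb = rb[i]*dadx_ij + rb[j]*dadx_ji;

    fi[0] += fgb*dx;    fj[0] -= fgb*dx;
    fi[1] += fgb*dy;    fj[1] -= fgb*dy;
    fi[2] += fgb*dz;    fj[2] -= fgb*dz;
}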
-
-#else
-/* dummy variable when not using SSE */
-int genborn_allvsall_sse2_single_dummy;
-
-
-#endif
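The dummy variable above is not decoration: ISO C requires a translation unit to contain at least one external declaration, so a file whose entire body is preprocessed away needs a placeholder to stay conforming. The guard pattern in miniature:

/* Keep a conditionally compiled source file from becoming an empty
 * (non-conforming) translation unit when the feature is unavailable. */
#ifdef FEATURE_AVAILABLE
/* ... real code ... */
#else
int feature_dummy;      /* any external declaration suffices */
#endif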
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2009, The GROMACS Development Team.
- * Copyright (c) 2010,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#ifndef _GENBORN_ALLVSALL_SSE2_SINGLE_H
-#define _GENBORN_ALLVSALL_SSE2_SINGLE_H
-
-#include "gromacs/legacyheaders/typedefs.h"
-#include "gromacs/legacyheaders/types/simple.h"
-
-int
-genborn_allvsall_calc_still_radii_sse2_single(t_forcerec * fr,
- t_mdatoms * mdatoms,
- gmx_genborn_t * born,
- gmx_localtop_t * top,
- real * x,
- t_commrec * cr,
- void * work);
-
-int
-genborn_allvsall_calc_hct_obc_radii_sse2_single(t_forcerec * fr,
- t_mdatoms * mdatoms,
- gmx_genborn_t * born,
- int gb_algorithm,
- gmx_localtop_t * top,
- real * x,
- t_commrec * cr,
- void * work);
-
-int
-genborn_allvsall_calc_chainrule_sse2_single(t_forcerec * fr,
- t_mdatoms * mdatoms,
- gmx_genborn_t * born,
- real * x,
- real * f,
- int gb_algorithm,
- void * work);
-
-#endif
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2008, The GROMACS development team.
- * Copyright (c) 2013,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#include "gmxpre.h"
-
-#include <math.h>
-#include <string.h>
-
-#include "gromacs/domdec/domdec.h"
-#include "gromacs/fileio/pdbio.h"
-#include "gromacs/legacyheaders/genborn.h"
-#include "gromacs/legacyheaders/names.h"
-#include "gromacs/legacyheaders/network.h"
-#include "gromacs/legacyheaders/typedefs.h"
-#include "gromacs/math/units.h"
-#include "gromacs/math/vec.h"
-#include "gromacs/utility/fatalerror.h"
-#include "gromacs/utility/gmxmpi.h"
-#include "gromacs/utility/smalloc.h"
-
-/* Only compile this file if SSE2 intrinsics are available; the leading 0 below keeps it disabled */
-#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
-#include "genborn_sse2_double.h"
-
-#include <emmintrin.h>
-#include <gmx_sse2_double.h>
-
-int
-calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,
- int natoms, gmx_localtop_t *top,
- double *x, t_nblist *nl,
- gmx_genborn_t *born)
-{
- int i, k, n, ii, is3, ii3, nj0, nj1, offset;
- int jnrA, jnrB, j3A, j3B;
- int *mdtype;
- double shX, shY, shZ;
- int *jjnr;
- double *shiftvec;
-
- double gpi_ai, gpi2;
- double factor;
- double *gb_radius;
- double *vsolv;
- double *work;
- double *dadx;
-
- __m128d ix, iy, iz;
- __m128d jx, jy, jz;
- __m128d dx, dy, dz;
- __m128d tx, ty, tz;
- __m128d rsq, rinv, rinv2, rinv4, rinv6;
- __m128d ratio, gpi, rai, raj, vai, vaj, rvdw;
- __m128d ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
- __m128d mask, icf4, icf6, mask_cmp;
-
- const __m128d half = _mm_set1_pd(0.5);
- const __m128d three = _mm_set1_pd(3.0);
- const __m128d one = _mm_set1_pd(1.0);
- const __m128d two = _mm_set1_pd(2.0);
- const __m128d zero = _mm_set1_pd(0.0);
- const __m128d four = _mm_set1_pd(4.0);
-
- const __m128d still_p5inv = _mm_set1_pd(STILL_P5INV);
- const __m128d still_pip5 = _mm_set1_pd(STILL_PIP5);
- const __m128d still_p4 = _mm_set1_pd(STILL_P4);
-
- factor = 0.5 * ONE_4PI_EPS0;
-
- gb_radius = born->gb_radius;
- vsolv = born->vsolv;
- work = born->gpol_still_work;
- jjnr = nl->jjnr;
- shiftvec = fr->shift_vec[0];
- dadx = fr->dadx;
-
- jnrA = jnrB = 0;
- jx = _mm_setzero_pd();
- jy = _mm_setzero_pd();
- jz = _mm_setzero_pd();
-
- n = 0;
-
- for (i = 0; i < natoms; i++)
- {
- work[i] = 0;
- }
-
- for (i = 0; i < nl->nri; i++)
- {
- ii = nl->iinr[i];
- ii3 = ii*3;
- is3 = 3*nl->shift[i];
- shX = shiftvec[is3];
- shY = shiftvec[is3+1];
- shZ = shiftvec[is3+2];
- nj0 = nl->jindex[i];
- nj1 = nl->jindex[i+1];
-
- ix = _mm_set1_pd(shX+x[ii3+0]);
- iy = _mm_set1_pd(shY+x[ii3+1]);
- iz = _mm_set1_pd(shZ+x[ii3+2]);
-
-
- /* Polarization energy for atom ai */
- gpi = _mm_setzero_pd();
-
- rai = _mm_load1_pd(gb_radius+ii);
- prod_ai = _mm_set1_pd(STILL_P4*vsolv[ii]);
-
- for (k = nj0; k < nj1-1; k += 2)
- {
- jnrA = jjnr[k];
- jnrB = jjnr[k+1];
-
- j3A = 3*jnrA;
- j3B = 3*jnrB;
-
- GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
-
- GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
- GMX_MM_LOAD_2VALUES_PD(vsolv+jnrA, vsolv+jnrB, vaj);
-
- dx = _mm_sub_pd(ix, jx);
- dy = _mm_sub_pd(iy, jy);
- dz = _mm_sub_pd(iz, jz);
-
- rsq = gmx_mm_calc_rsq_pd(dx, dy, dz);
- rinv = gmx_mm_invsqrt_pd(rsq);
- rinv2 = _mm_mul_pd(rinv, rinv);
- rinv4 = _mm_mul_pd(rinv2, rinv2);
- rinv6 = _mm_mul_pd(rinv4, rinv2);
-
- rvdw = _mm_add_pd(rai, raj);
- ratio = _mm_mul_pd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));
-
- mask_cmp = _mm_cmple_pd(ratio, still_p5inv);
-
- /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
- if (0 == _mm_movemask_pd(mask_cmp) )
- {
- /* if ratio>still_p5inv for ALL elements */
- ccf = one;
- dccf = _mm_setzero_pd();
- }
- else
- {
- ratio = _mm_min_pd(ratio, still_p5inv);
- theta = _mm_mul_pd(ratio, still_pip5);
- gmx_mm_sincos_pd(theta, &sinq, &cosq);
- term = _mm_mul_pd(half, _mm_sub_pd(one, cosq));
- ccf = _mm_mul_pd(term, term);
- dccf = _mm_mul_pd(_mm_mul_pd(two, term),
- _mm_mul_pd(sinq, theta));
- }
-
- prod = _mm_mul_pd(still_p4, vaj);
- icf4 = _mm_mul_pd(ccf, rinv4);
- icf6 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four, ccf), dccf), rinv6);
-
- GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_mul_pd(prod_ai, icf4));
-
- gpi = _mm_add_pd(gpi, _mm_mul_pd(prod, icf4) );
-
- _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
- dadx += 2;
- _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
- dadx += 2;
- }
-
- if (k < nj1)
- {
- jnrA = jjnr[k];
-
- j3A = 3*jnrA;
-
- GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
-
- GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
- GMX_MM_LOAD_1VALUE_PD(vsolv+jnrA, vaj);
-
- dx = _mm_sub_sd(ix, jx);
- dy = _mm_sub_sd(iy, jy);
- dz = _mm_sub_sd(iz, jz);
-
- rsq = gmx_mm_calc_rsq_pd(dx, dy, dz);
- rinv = gmx_mm_invsqrt_pd(rsq);
- rinv2 = _mm_mul_sd(rinv, rinv);
- rinv4 = _mm_mul_sd(rinv2, rinv2);
- rinv6 = _mm_mul_sd(rinv4, rinv2);
-
- rvdw = _mm_add_sd(rai, raj);
- ratio = _mm_mul_sd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));
-
- mask_cmp = _mm_cmple_sd(ratio, still_p5inv);
-
- /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
- if (0 == _mm_movemask_pd(mask_cmp) )
- {
- /* if ratio>still_p5inv for ALL elements */
- ccf = one;
- dccf = _mm_setzero_pd();
- }
- else
- {
- ratio = _mm_min_sd(ratio, still_p5inv);
- theta = _mm_mul_sd(ratio, still_pip5);
- gmx_mm_sincos_pd(theta, &sinq, &cosq);
- term = _mm_mul_sd(half, _mm_sub_sd(one, cosq));
- ccf = _mm_mul_sd(term, term);
- dccf = _mm_mul_sd(_mm_mul_sd(two, term),
- _mm_mul_sd(sinq, theta));
- }
-
- prod = _mm_mul_sd(still_p4, vaj);
- icf4 = _mm_mul_sd(ccf, rinv4);
- icf6 = _mm_mul_sd( _mm_sub_sd( _mm_mul_sd(four, ccf), dccf), rinv6);
-
- GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_mul_sd(prod_ai, icf4));
-
- gpi = _mm_add_sd(gpi, _mm_mul_sd(prod, icf4) );
-
- _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
- dadx += 2;
- _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
- dadx += 2;
- }
- gmx_mm_update_1pot_pd(gpi, work+ii);
- }
-
- /* Sum up the polarization energy from other nodes */
- if (DOMAINDECOMP(cr))
- {
- dd_atom_sum_real(cr->dd, work);
- }
-
- /* Compute the radii */
- for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
- {
- if (born->use[i] != 0)
- {
-            gpi_ai = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai */
- gpi2 = gpi_ai * gpi_ai;
- born->bRad[i] = factor*gmx_invsqrt(gpi2);
- fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
- }
- }
-
- /* Extra (local) communication required for DD */
- if (DOMAINDECOMP(cr))
- {
- dd_atom_spread_real(cr->dd, born->bRad);
- dd_atom_spread_real(cr->dd, fr->invsqrta);
- }
-
- return 0;
-}
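A scalar rendering of the Still pair term above, with the STILL_* constants as defined in the GROMACS headers. It also shows why the _mm_movemask_pd() early-out is exact rather than approximate: clamping ratio to STILL_P5INV drives theta to STILL_P5INV*STILL_PIP5 = pi (with the GROMACS definitions of those constants), where ccf = 1 and dccf = 0, identical to the cheap branch. A reconstruction from the intrinsics, not the literal removed code:

#include <math.h>

/* One Still pair term: aj's contribution to the polarization sum of ai.
 * vaj is the solvation volume of aj, rvdw = rai + raj. */
static double
still_pair(double rsq, double rvdw, double vaj, double *dadx_factor)
{
    double rinv  = 1.0/sqrt(rsq);
    double rinv4 = rinv*rinv*rinv*rinv;
    double rinv6 = rinv4*rinv*rinv;
    double ratio = rsq/(rvdw*rvdw);
    double ccf, dccf, theta, term;

    if (ratio > STILL_P5INV)       /* the movemask fast path, scalar form */
    {
        ccf  = 1.0;
        dccf = 0.0;
    }
    else
    {
        theta = ratio*STILL_PIP5;
        term  = 0.5*(1.0 - cos(theta));
        ccf   = term*term;
        dccf  = 2.0*term*sin(theta)*theta;
    }
    *dadx_factor = STILL_P4*vaj*(4.0*ccf - dccf)*rinv6;  /* icf6 term */
    return STILL_P4*vaj*ccf*rinv4;                       /* icf4 term */
}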
-
-
-int
-calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
- double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
-{
- int i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
- int jnrA, jnrB;
- int j3A, j3B;
- double shX, shY, shZ;
- double rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
- double sum_ai2, sum_ai3, tsum, tchain, doffset;
- double *obc_param;
- double *gb_radius;
- double *work;
- int * jjnr;
- double *dadx;
- double *shiftvec;
- double min_rad, rad;
-
- __m128d ix, iy, iz, jx, jy, jz;
- __m128d dx, dy, dz, t1, t2, t3, t4;
- __m128d rsq, rinv, r;
- __m128d rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
- __m128d uij, lij2, uij2, lij3, uij3, diff2;
- __m128d lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum;
- __m128d sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
- __m128d dadx1, dadx2;
- __m128d logterm;
- __m128d mask;
- __m128d obc_mask1, obc_mask2, obc_mask3;
-
- __m128d oneeighth = _mm_set1_pd(0.125);
- __m128d onefourth = _mm_set1_pd(0.25);
-
- const __m128d half = _mm_set1_pd(0.5);
- const __m128d three = _mm_set1_pd(3.0);
- const __m128d one = _mm_set1_pd(1.0);
- const __m128d two = _mm_set1_pd(2.0);
- const __m128d zero = _mm_set1_pd(0.0);
- const __m128d neg = _mm_set1_pd(-1.0);
-
- /* Set the dielectric offset */
- doffset = born->gb_doffset;
- gb_radius = born->gb_radius;
- obc_param = born->param;
- work = born->gpol_hct_work;
- jjnr = nl->jjnr;
- dadx = fr->dadx;
- shiftvec = fr->shift_vec[0];
-
- jx = _mm_setzero_pd();
- jy = _mm_setzero_pd();
- jz = _mm_setzero_pd();
-
- jnrA = jnrB = 0;
-
- for (i = 0; i < born->nr; i++)
- {
- work[i] = 0;
- }
-
- for (i = 0; i < nl->nri; i++)
- {
- ii = nl->iinr[i];
- ii3 = ii*3;
- is3 = 3*nl->shift[i];
- shX = shiftvec[is3];
- shY = shiftvec[is3+1];
- shZ = shiftvec[is3+2];
- nj0 = nl->jindex[i];
- nj1 = nl->jindex[i+1];
-
- ix = _mm_set1_pd(shX+x[ii3+0]);
- iy = _mm_set1_pd(shY+x[ii3+1]);
- iz = _mm_set1_pd(shZ+x[ii3+2]);
-
- rai = _mm_load1_pd(gb_radius+ii);
- rai_inv = gmx_mm_inv_pd(rai);
-
- sum_ai = _mm_setzero_pd();
-
- sk_ai = _mm_load1_pd(born->param+ii);
- sk2_ai = _mm_mul_pd(sk_ai, sk_ai);
-
- for (k = nj0; k < nj1-1; k += 2)
- {
- jnrA = jjnr[k];
- jnrB = jjnr[k+1];
-
- j3A = 3*jnrA;
- j3B = 3*jnrB;
-
- GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
- GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
- GMX_MM_LOAD_2VALUES_PD(obc_param+jnrA, obc_param+jnrB, sk_aj);
-
- dx = _mm_sub_pd(ix, jx);
- dy = _mm_sub_pd(iy, jy);
- dz = _mm_sub_pd(iz, jz);
-
- rsq = gmx_mm_calc_rsq_pd(dx, dy, dz);
-
- rinv = gmx_mm_invsqrt_pd(rsq);
- r = _mm_mul_pd(rsq, rinv);
-
-            /* Compute raj_inv for the two j atoms */
- raj_inv = gmx_mm_inv_pd(raj);
-
- /* Evaluate influence of atom aj -> ai */
- t1 = _mm_add_pd(r, sk_aj);
- t2 = _mm_sub_pd(r, sk_aj);
- t3 = _mm_sub_pd(sk_aj, r);
- obc_mask1 = _mm_cmplt_pd(rai, t1);
- obc_mask2 = _mm_cmplt_pd(rai, t2);
- obc_mask3 = _mm_cmplt_pd(rai, t3);
-
- uij = gmx_mm_inv_pd(t1);
- lij = _mm_or_pd( _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
- _mm_andnot_pd(obc_mask2, rai_inv));
- dlij = _mm_and_pd(one, obc_mask2);
- uij2 = _mm_mul_pd(uij, uij);
- uij3 = _mm_mul_pd(uij2, uij);
- lij2 = _mm_mul_pd(lij, lij);
- lij3 = _mm_mul_pd(lij2, lij);
-
- diff2 = _mm_sub_pd(uij2, lij2);
- lij_inv = gmx_mm_invsqrt_pd(lij2);
- sk2_aj = _mm_mul_pd(sk_aj, sk_aj);
- sk2_rinv = _mm_mul_pd(sk2_aj, rinv);
- prod = _mm_mul_pd(onefourth, sk2_rinv);
-
- logterm = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));
-
- t1 = _mm_sub_pd(lij, uij);
- t2 = _mm_mul_pd(diff2,
- _mm_sub_pd(_mm_mul_pd(onefourth, r),
- prod));
- t3 = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
- t1 = _mm_add_pd(t1, _mm_add_pd(t2, t3));
- t4 = _mm_mul_pd(two, _mm_sub_pd(rai_inv, lij));
- t4 = _mm_and_pd(t4, obc_mask3);
- t1 = _mm_mul_pd(half, _mm_add_pd(t1, t4));
-
- sum_ai = _mm_add_pd(sum_ai, _mm_and_pd(t1, obc_mask1) );
-
- t1 = _mm_add_pd(_mm_mul_pd(half, lij2),
- _mm_mul_pd(prod, lij3));
- t1 = _mm_sub_pd(t1,
- _mm_mul_pd(onefourth,
- _mm_add_pd(_mm_mul_pd(lij, rinv),
- _mm_mul_pd(lij3, r))));
- t2 = _mm_mul_pd(onefourth,
- _mm_add_pd(_mm_mul_pd(uij, rinv),
- _mm_mul_pd(uij3, r)));
- t2 = _mm_sub_pd(t2,
- _mm_add_pd(_mm_mul_pd(half, uij2),
- _mm_mul_pd(prod, uij3)));
- t3 = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
- _mm_mul_pd(rinv, rinv));
- t3 = _mm_sub_pd(t3,
- _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
- _mm_add_pd(one,
- _mm_mul_pd(sk2_rinv, rinv))));
- t1 = _mm_mul_pd(rinv,
- _mm_add_pd(_mm_mul_pd(dlij, t1),
- _mm_add_pd(t2, t3)));
-
- dadx1 = _mm_and_pd(t1, obc_mask1);
-
- /* Evaluate influence of atom ai -> aj */
- t1 = _mm_add_pd(r, sk_ai);
- t2 = _mm_sub_pd(r, sk_ai);
- t3 = _mm_sub_pd(sk_ai, r);
- obc_mask1 = _mm_cmplt_pd(raj, t1);
- obc_mask2 = _mm_cmplt_pd(raj, t2);
- obc_mask3 = _mm_cmplt_pd(raj, t3);
-
- uij = gmx_mm_inv_pd(t1);
- lij = _mm_or_pd( _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
- _mm_andnot_pd(obc_mask2, raj_inv));
- dlij = _mm_and_pd(one, obc_mask2);
- uij2 = _mm_mul_pd(uij, uij);
- uij3 = _mm_mul_pd(uij2, uij);
- lij2 = _mm_mul_pd(lij, lij);
- lij3 = _mm_mul_pd(lij2, lij);
-
- diff2 = _mm_sub_pd(uij2, lij2);
- lij_inv = gmx_mm_invsqrt_pd(lij2);
- sk2_rinv = _mm_mul_pd(sk2_ai, rinv);
- prod = _mm_mul_pd(onefourth, sk2_rinv);
-
- logterm = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));
-
- t1 = _mm_sub_pd(lij, uij);
- t2 = _mm_mul_pd(diff2,
- _mm_sub_pd(_mm_mul_pd(onefourth, r),
- prod));
- t3 = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
- t1 = _mm_add_pd(t1, _mm_add_pd(t2, t3));
- t4 = _mm_mul_pd(two, _mm_sub_pd(raj_inv, lij));
- t4 = _mm_and_pd(t4, obc_mask3);
- t1 = _mm_mul_pd(half, _mm_add_pd(t1, t4));
-
- GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_and_pd(t1, obc_mask1));
-
- t1 = _mm_add_pd(_mm_mul_pd(half, lij2),
- _mm_mul_pd(prod, lij3));
- t1 = _mm_sub_pd(t1,
- _mm_mul_pd(onefourth,
- _mm_add_pd(_mm_mul_pd(lij, rinv),
- _mm_mul_pd(lij3, r))));
- t2 = _mm_mul_pd(onefourth,
- _mm_add_pd(_mm_mul_pd(uij, rinv),
- _mm_mul_pd(uij3, r)));
- t2 = _mm_sub_pd(t2,
- _mm_add_pd(_mm_mul_pd(half, uij2),
- _mm_mul_pd(prod, uij3)));
- t3 = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
- _mm_mul_pd(rinv, rinv));
- t3 = _mm_sub_pd(t3,
- _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
- _mm_add_pd(one,
- _mm_mul_pd(sk2_rinv, rinv))));
- t1 = _mm_mul_pd(rinv,
- _mm_add_pd(_mm_mul_pd(dlij, t1),
- _mm_add_pd(t2, t3)));
-
- dadx2 = _mm_and_pd(t1, obc_mask1);
-
- _mm_store_pd(dadx, dadx1);
- dadx += 2;
- _mm_store_pd(dadx, dadx2);
- dadx += 2;
- } /* end normal inner loop */
-
- if (k < nj1)
- {
- jnrA = jjnr[k];
-
- j3A = 3*jnrA;
-
- GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
- GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
- GMX_MM_LOAD_1VALUE_PD(obc_param+jnrA, sk_aj);
-
- dx = _mm_sub_sd(ix, jx);
- dy = _mm_sub_sd(iy, jy);
- dz = _mm_sub_sd(iz, jz);
-
- rsq = gmx_mm_calc_rsq_pd(dx, dy, dz);
-
- rinv = gmx_mm_invsqrt_pd(rsq);
- r = _mm_mul_sd(rsq, rinv);
-
-            /* Compute raj_inv for the remaining j atom */
- raj_inv = gmx_mm_inv_pd(raj);
-
- /* Evaluate influence of atom aj -> ai */
- t1 = _mm_add_sd(r, sk_aj);
- t2 = _mm_sub_sd(r, sk_aj);
- t3 = _mm_sub_sd(sk_aj, r);
- obc_mask1 = _mm_cmplt_sd(rai, t1);
- obc_mask2 = _mm_cmplt_sd(rai, t2);
- obc_mask3 = _mm_cmplt_sd(rai, t3);
-
- uij = gmx_mm_inv_pd(t1);
- lij = _mm_or_pd(_mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
- _mm_andnot_pd(obc_mask2, rai_inv));
- dlij = _mm_and_pd(one, obc_mask2);
- uij2 = _mm_mul_sd(uij, uij);
- uij3 = _mm_mul_sd(uij2, uij);
- lij2 = _mm_mul_sd(lij, lij);
- lij3 = _mm_mul_sd(lij2, lij);
-
- diff2 = _mm_sub_sd(uij2, lij2);
- lij_inv = gmx_mm_invsqrt_pd(lij2);
- sk2_aj = _mm_mul_sd(sk_aj, sk_aj);
- sk2_rinv = _mm_mul_sd(sk2_aj, rinv);
- prod = _mm_mul_sd(onefourth, sk2_rinv);
-
- logterm = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));
-
- t1 = _mm_sub_sd(lij, uij);
- t2 = _mm_mul_sd(diff2,
- _mm_sub_sd(_mm_mul_pd(onefourth, r),
- prod));
- t3 = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
- t1 = _mm_add_sd(t1, _mm_add_sd(t2, t3));
- t4 = _mm_mul_sd(two, _mm_sub_sd(rai_inv, lij));
- t4 = _mm_and_pd(t4, obc_mask3);
- t1 = _mm_mul_sd(half, _mm_add_sd(t1, t4));
-
- sum_ai = _mm_add_sd(sum_ai, _mm_and_pd(t1, obc_mask1) );
-
- t1 = _mm_add_sd(_mm_mul_sd(half, lij2),
- _mm_mul_sd(prod, lij3));
- t1 = _mm_sub_sd(t1,
- _mm_mul_sd(onefourth,
- _mm_add_sd(_mm_mul_sd(lij, rinv),
- _mm_mul_sd(lij3, r))));
- t2 = _mm_mul_sd(onefourth,
- _mm_add_sd(_mm_mul_sd(uij, rinv),
- _mm_mul_sd(uij3, r)));
- t2 = _mm_sub_sd(t2,
- _mm_add_sd(_mm_mul_sd(half, uij2),
- _mm_mul_sd(prod, uij3)));
- t3 = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
- _mm_mul_sd(rinv, rinv));
- t3 = _mm_sub_sd(t3,
- _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
- _mm_add_sd(one,
- _mm_mul_sd(sk2_rinv, rinv))));
- t1 = _mm_mul_sd(rinv,
- _mm_add_sd(_mm_mul_sd(dlij, t1),
-                                   _mm_add_sd(t2, t3)));
-
- dadx1 = _mm_and_pd(t1, obc_mask1);
-
- /* Evaluate influence of atom ai -> aj */
- t1 = _mm_add_sd(r, sk_ai);
- t2 = _mm_sub_sd(r, sk_ai);
- t3 = _mm_sub_sd(sk_ai, r);
- obc_mask1 = _mm_cmplt_sd(raj, t1);
- obc_mask2 = _mm_cmplt_sd(raj, t2);
- obc_mask3 = _mm_cmplt_sd(raj, t3);
-
- uij = gmx_mm_inv_pd(t1);
- lij = _mm_or_pd( _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
- _mm_andnot_pd(obc_mask2, raj_inv));
- dlij = _mm_and_pd(one, obc_mask2);
- uij2 = _mm_mul_sd(uij, uij);
- uij3 = _mm_mul_sd(uij2, uij);
- lij2 = _mm_mul_sd(lij, lij);
- lij3 = _mm_mul_sd(lij2, lij);
-
- diff2 = _mm_sub_sd(uij2, lij2);
- lij_inv = gmx_mm_invsqrt_pd(lij2);
- sk2_rinv = _mm_mul_sd(sk2_ai, rinv);
- prod = _mm_mul_sd(onefourth, sk2_rinv);
-
- logterm = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));
-
- t1 = _mm_sub_sd(lij, uij);
- t2 = _mm_mul_sd(diff2,
- _mm_sub_sd(_mm_mul_sd(onefourth, r),
- prod));
- t3 = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
- t1 = _mm_add_sd(t1, _mm_add_sd(t2, t3));
- t4 = _mm_mul_sd(two, _mm_sub_sd(raj_inv, lij));
- t4 = _mm_and_pd(t4, obc_mask3);
- t1 = _mm_mul_sd(half, _mm_add_sd(t1, t4));
-
- GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_and_pd(t1, obc_mask1));
-
- t1 = _mm_add_sd(_mm_mul_sd(half, lij2),
- _mm_mul_sd(prod, lij3));
- t1 = _mm_sub_sd(t1,
- _mm_mul_sd(onefourth,
- _mm_add_sd(_mm_mul_sd(lij, rinv),
- _mm_mul_sd(lij3, r))));
- t2 = _mm_mul_sd(onefourth,
- _mm_add_sd(_mm_mul_sd(uij, rinv),
- _mm_mul_sd(uij3, r)));
- t2 = _mm_sub_sd(t2,
- _mm_add_sd(_mm_mul_sd(half, uij2),
- _mm_mul_sd(prod, uij3)));
- t3 = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
- _mm_mul_sd(rinv, rinv));
- t3 = _mm_sub_sd(t3,
- _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
- _mm_add_sd(one,
- _mm_mul_sd(sk2_rinv, rinv))));
- t1 = _mm_mul_sd(rinv,
- _mm_add_sd(_mm_mul_sd(dlij, t1),
- _mm_add_sd(t2, t3)));
-
- dadx2 = _mm_and_pd(t1, obc_mask1);
-
- _mm_store_pd(dadx, dadx1);
- dadx += 2;
- _mm_store_pd(dadx, dadx2);
- dadx += 2;
- }
- gmx_mm_update_1pot_pd(sum_ai, work+ii);
-
- }
-
- /* Parallel summations */
- if (DOMAINDECOMP(cr))
- {
- dd_atom_sum_real(cr->dd, work);
- }
-
- if (gb_algorithm == egbHCT)
- {
- /* HCT */
- for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
- {
- if (born->use[i] != 0)
- {
- rr = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
- sum = 1.0/rr - work[i];
- min_rad = rr + doffset;
- rad = 1.0/sum;
-
- born->bRad[i] = rad > min_rad ? rad : min_rad;
- fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
- }
- }
-
- /* Extra communication required for DD */
- if (DOMAINDECOMP(cr))
- {
- dd_atom_spread_real(cr->dd, born->bRad);
- dd_atom_spread_real(cr->dd, fr->invsqrta);
- }
- }
- else
- {
- /* OBC */
- for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
- {
- if (born->use[i] != 0)
- {
- rr = top->atomtypes.gb_radius[md->typeA[i]];
- rr_inv2 = 1.0/rr;
- rr = rr-doffset;
- rr_inv = 1.0/rr;
- sum = rr * work[i];
- sum2 = sum * sum;
- sum3 = sum2 * sum;
-
- tsum = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
- born->bRad[i] = rr_inv - tsum*rr_inv2;
- born->bRad[i] = 1.0 / born->bRad[i];
-
- fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
-
- tchain = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
- born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
- }
- }
- /* Extra (local) communication required for DD */
- if (DOMAINDECOMP(cr))
- {
- dd_atom_spread_real(cr->dd, born->bRad);
- dd_atom_spread_real(cr->dd, fr->invsqrta);
- dd_atom_spread_real(cr->dd, born->drobc);
- }
- }
-
-
-
- return 0;
-}
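
For reference, a minimal scalar sketch of the HCT/OBC pairwise descreening
term that the t1..t4 SIMD arithmetic above evaluates; the helper name and
signature are illustrative, not part of the GROMACS API:

    #include <math.h>

    /* Contribution of atom j (descreening parameter sk) to the Born-radius
     * integral of atom i (radius rai) at distance r. The masked dlij factor
     * in the SIMD code enters only the derivative (dadx) expression. */
    static double hct_obc_pair_term(double r, double rai, double sk)
    {
        if (rai >= r + sk)                /* obc_mask1 false: no contribution */
        {
            return 0.0;
        }
        double uij     = 1.0/(r + sk);                            /* upper limit */
        double lij     = (rai < r - sk) ? 1.0/(r - sk) : 1.0/rai; /* lower limit */
        double diff2   = uij*uij - lij*lij;
        double prod    = 0.25*sk*sk/r;
        double logterm = log(uij/lij);

        double t = (lij - uij) + diff2*(0.25*r - prod) + 0.5*logterm/r;
        if (rai < sk - r)                 /* obc_mask3: i fully inside j's sphere */
        {
            t += 2.0*(1.0/rai - lij);
        }
        return 0.5*t;
    }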
-
-
-int
-calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda,
- double *x, double *f, double *fshift, double *shiftvec,
- int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
-{
- int i, k, n, ii, jnr, ii3, is3, nj0, nj1, n0, n1;
- int jnrA, jnrB;
- int j3A, j3B;
- int * jjnr;
-
- double rbi, shX, shY, shZ;
- double *rb;
-
- __m128d ix, iy, iz;
- __m128d jx, jy, jz;
- __m128d fix, fiy, fiz;
- __m128d dx, dy, dz;
- __m128d tx, ty, tz;
-
- __m128d rbai, rbaj, f_gb, f_gb_ai;
- __m128d xmm1, xmm2, xmm3;
-
- const __m128d two = _mm_set1_pd(2.0);
-
- rb = born->work;
-
- jjnr = nl->jjnr;
-
-    /* Loop to get the proper form for the Born radius term, SSE style */
- n0 = 0;
- n1 = natoms;
-
- if (gb_algorithm == egbSTILL)
- {
- for (i = n0; i < n1; i++)
- {
- rbi = born->bRad[i];
- rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
- }
- }
- else if (gb_algorithm == egbHCT)
- {
- for (i = n0; i < n1; i++)
- {
- rbi = born->bRad[i];
- rb[i] = rbi * rbi * dvda[i];
- }
- }
- else if (gb_algorithm == egbOBC)
- {
- for (i = n0; i < n1; i++)
- {
- rbi = born->bRad[i];
- rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
- }
- }
-
- jz = _mm_setzero_pd();
-
- n = j3A = j3B = 0;
-
- for (i = 0; i < nl->nri; i++)
- {
- ii = nl->iinr[i];
- ii3 = ii*3;
- is3 = 3*nl->shift[i];
- shX = shiftvec[is3];
- shY = shiftvec[is3+1];
- shZ = shiftvec[is3+2];
- nj0 = nl->jindex[i];
- nj1 = nl->jindex[i+1];
-
- ix = _mm_set1_pd(shX+x[ii3+0]);
- iy = _mm_set1_pd(shY+x[ii3+1]);
- iz = _mm_set1_pd(shZ+x[ii3+2]);
-
- rbai = _mm_load1_pd(rb+ii);
- fix = _mm_setzero_pd();
- fiy = _mm_setzero_pd();
- fiz = _mm_setzero_pd();
-
-
- for (k = nj0; k < nj1-1; k += 2)
- {
- jnrA = jjnr[k];
- jnrB = jjnr[k+1];
-
- j3A = 3*jnrA;
- j3B = 3*jnrB;
-
- GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
-
- dx = _mm_sub_pd(ix, jx);
- dy = _mm_sub_pd(iy, jy);
- dz = _mm_sub_pd(iz, jz);
-
- GMX_MM_LOAD_2VALUES_PD(rb+jnrA, rb+jnrB, rbaj);
-
- /* load chain rule terms for j1-4 */
- f_gb = _mm_load_pd(dadx);
- dadx += 2;
- f_gb_ai = _mm_load_pd(dadx);
- dadx += 2;
-
- /* calculate scalar force */
- f_gb = _mm_mul_pd(f_gb, rbai);
- f_gb_ai = _mm_mul_pd(f_gb_ai, rbaj);
- f_gb = _mm_add_pd(f_gb, f_gb_ai);
-
- tx = _mm_mul_pd(f_gb, dx);
- ty = _mm_mul_pd(f_gb, dy);
- tz = _mm_mul_pd(f_gb, dz);
-
- fix = _mm_add_pd(fix, tx);
- fiy = _mm_add_pd(fiy, ty);
- fiz = _mm_add_pd(fiz, tz);
-
- GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A, f+j3B, tx, ty, tz);
- }
-
-        /* Deal with the single remaining odd element, if any */
- if (k < nj1)
- {
- jnrA = jjnr[k];
- j3A = 3*jnrA;
-
- GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
-
- dx = _mm_sub_sd(ix, jx);
- dy = _mm_sub_sd(iy, jy);
- dz = _mm_sub_sd(iz, jz);
-
- GMX_MM_LOAD_1VALUE_PD(rb+jnrA, rbaj);
-
- /* load chain rule terms */
- f_gb = _mm_load_pd(dadx);
- dadx += 2;
- f_gb_ai = _mm_load_pd(dadx);
- dadx += 2;
-
- /* calculate scalar force */
- f_gb = _mm_mul_sd(f_gb, rbai);
- f_gb_ai = _mm_mul_sd(f_gb_ai, rbaj);
- f_gb = _mm_add_sd(f_gb, f_gb_ai);
-
- tx = _mm_mul_sd(f_gb, dx);
- ty = _mm_mul_sd(f_gb, dy);
- tz = _mm_mul_sd(f_gb, dz);
-
- fix = _mm_add_sd(fix, tx);
- fiy = _mm_add_sd(fiy, ty);
- fiz = _mm_add_sd(fiz, tz);
-
- GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A, tx, ty, tz);
- }
-
-        /* fix/fiy/fiz now contain two partial force terms, which should all be
-         * added to the i particle forces and shift forces.
-         */
- gmx_mm_update_iforce_1atom_pd(&fix, &fiy, &fiz, f+ii3, fshift+is3);
- }
-
- return 0;
-}
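
The chain-rule kernel above consumes the dadx stream in exactly the order the
radius kernels wrote it (per pair: first the j->i term, then the i->j term,
in SIMD-width blocks), so both passes must walk the neighborlist identically.
A scalar sketch of the per-pair force it computes, with illustrative names:

    /* rb[] holds the algorithm-dependent prefactor set up at the top of the
     * kernel; dadx_ij and dadx_ji are the stored radius derivatives. */
    static void gb_pair_force(const double dx[3],
                              double dadx_ij, double dadx_ji,
                              double rb_i, double rb_j,
                              double fi[3], double fj[3])
    {
        double fscal = dadx_ij*rb_i + dadx_ji*rb_j;
        for (int d = 0; d < 3; d++)
        {
            double t = fscal*dx[d];
            fi[d] += t;    /* accumulated on particle i (and shift force) */
            fj[d] -= t;    /* Newton's third law on particle j */
        }
    }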
-
-#else
-/* keep compiler happy */
-int genborn_sse2_dummy;
-
-#endif /* SSE2 intrinsics available */
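
(The dummy symbol above exists because ISO C requires a translation unit to
contain at least one declaration; with the guard evaluating to false, every
other line of the file compiles away.)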
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2008, The GROMACS development team.
- * Copyright (c) 2013,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#ifndef _genborn_sse2_double_h
-#define _genborn_sse2_double_h
-
-#include "gromacs/legacyheaders/typedefs.h"
-
-int
-calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr, int natoms, gmx_localtop_t *top,
- double *x, t_nblist *nl, gmx_genborn_t *born);
-
-int
-calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda, double *xd, double *f,
- double *fshift, double *shift_vec, int gb_algorithm,
- gmx_genborn_t *born, t_mdatoms *md);
-
-int
-calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec *fr, int natoms, gmx_localtop_t *top,
- double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm);
-
-#endif /* _genborn_sse2_double_h */
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2008, The GROMACS development team.
- * Copyright (c) 2013,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#include "gmxpre.h"
-
-#include <math.h>
-#include <string.h>
-
-#include "gromacs/domdec/domdec.h"
-#include "gromacs/fileio/pdbio.h"
-#include "gromacs/legacyheaders/genborn.h"
-#include "gromacs/legacyheaders/names.h"
-#include "gromacs/legacyheaders/network.h"
-#include "gromacs/legacyheaders/typedefs.h"
-#include "gromacs/math/units.h"
-#include "gromacs/math/vec.h"
-#include "gromacs/utility/fatalerror.h"
-#include "gromacs/utility/gmxmpi.h"
-#include "gromacs/utility/smalloc.h"
-
-
-/* Only compile this file if SSE intrinsics are available; the "0 &&" below disables it unconditionally */
-#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
-
-#include "genborn_sse2_single.h"
-
-#include <emmintrin.h>
-#include <gmx_sse2_single.h>
-
-
-int
-calc_gb_rad_still_sse2_single(t_commrec *cr, t_forcerec *fr,
- int natoms, gmx_localtop_t *top,
- float *x, t_nblist *nl,
- gmx_genborn_t *born)
-{
- int i, k, n, ii, is3, ii3, nj0, nj1, offset;
- int jnrA, jnrB, jnrC, jnrD, j3A, j3B, j3C, j3D;
- int jnrE, jnrF, jnrG, jnrH, j3E, j3F, j3G, j3H;
- int shift;
- int *mdtype;
- real shX, shY, shZ;
- int *jjnr;
- real *shiftvec;
-
- float gpi_ai, gpi2;
- float factor;
- float *gb_radius;
- float *vsolv;
- float *work;
- float *dadx;
-
- __m128 ix, iy, iz;
- __m128 jx, jy, jz;
- __m128 dx, dy, dz;
- __m128 tx, ty, tz;
- __m128 jxB, jyB, jzB;
- __m128 dxB, dyB, dzB;
- __m128 txB, tyB, tzB;
- __m128 rsq, rinv, rinv2, rinv4, rinv6;
- __m128 rsqB, rinvB, rinv2B, rinv4B, rinv6B;
- __m128 ratio, gpi, rai, raj, vai, vaj, rvdw;
- __m128 ratioB, rajB, vajB, rvdwB;
- __m128 ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
- __m128 ccfB, dccfB, thetaB, cosqB, termB, sinqB, resB, prodB;
- __m128 mask, icf4, icf6, mask_cmp;
- __m128 icf4B, icf6B, mask_cmpB;
-
- __m128 mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
- __m128 mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
- __m128 mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
-
- const __m128 half = _mm_set1_ps(0.5f);
- const __m128 three = _mm_set1_ps(3.0f);
- const __m128 one = _mm_set1_ps(1.0f);
- const __m128 two = _mm_set1_ps(2.0f);
- const __m128 zero = _mm_set1_ps(0.0f);
- const __m128 four = _mm_set1_ps(4.0f);
-
- const __m128 still_p5inv = _mm_set1_ps(STILL_P5INV);
- const __m128 still_pip5 = _mm_set1_ps(STILL_PIP5);
- const __m128 still_p4 = _mm_set1_ps(STILL_P4);
-
- factor = 0.5 * ONE_4PI_EPS0;
-
- gb_radius = born->gb_radius;
- vsolv = born->vsolv;
- work = born->gpol_still_work;
- jjnr = nl->jjnr;
- shiftvec = fr->shift_vec[0];
- dadx = fr->dadx;
-
- jnrA = jnrB = jnrC = jnrD = 0;
- jx = _mm_setzero_ps();
- jy = _mm_setzero_ps();
- jz = _mm_setzero_ps();
-
- n = 0;
-
- for (i = 0; i < natoms; i++)
- {
- work[i] = 0;
- }
-
- for (i = 0; i < nl->nri; i++)
- {
- ii = nl->iinr[i];
- ii3 = ii*3;
- is3 = 3*nl->shift[i];
- shX = shiftvec[is3];
- shY = shiftvec[is3+1];
- shZ = shiftvec[is3+2];
- nj0 = nl->jindex[i];
- nj1 = nl->jindex[i+1];
-
- ix = _mm_set1_ps(shX+x[ii3+0]);
- iy = _mm_set1_ps(shY+x[ii3+1]);
- iz = _mm_set1_ps(shZ+x[ii3+2]);
-
- offset = (nj1-nj0)%4;
-
- /* Polarization energy for atom ai */
- gpi = _mm_setzero_ps();
-
- rai = _mm_load1_ps(gb_radius+ii);
- prod_ai = _mm_set1_ps(STILL_P4*vsolv[ii]);
-
- for (k = nj0; k < nj1-4-offset; k += 8)
- {
- jnrA = jjnr[k];
- jnrB = jjnr[k+1];
- jnrC = jjnr[k+2];
- jnrD = jjnr[k+3];
- jnrE = jjnr[k+4];
- jnrF = jjnr[k+5];
- jnrG = jjnr[k+6];
- jnrH = jjnr[k+7];
-
- j3A = 3*jnrA;
- j3B = 3*jnrB;
- j3C = 3*jnrC;
- j3D = 3*jnrD;
- j3E = 3*jnrE;
- j3F = 3*jnrF;
- j3G = 3*jnrG;
- j3H = 3*jnrH;
-
- GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
- GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E, x+j3F, x+j3G, x+j3H, jxB, jyB, jzB);
-
- GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
- GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE, gb_radius+jnrF, gb_radius+jnrG, gb_radius+jnrH, rajB);
- GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vsolv+jnrD, vaj);
- GMX_MM_LOAD_4VALUES_PS(vsolv+jnrE, vsolv+jnrF, vsolv+jnrG, vsolv+jnrH, vajB);
-
- dx = _mm_sub_ps(ix, jx);
- dy = _mm_sub_ps(iy, jy);
- dz = _mm_sub_ps(iz, jz);
- dxB = _mm_sub_ps(ix, jxB);
- dyB = _mm_sub_ps(iy, jyB);
- dzB = _mm_sub_ps(iz, jzB);
-
- rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
- rsqB = gmx_mm_calc_rsq_ps(dxB, dyB, dzB);
- rinv = gmx_mm_invsqrt_ps(rsq);
- rinvB = gmx_mm_invsqrt_ps(rsqB);
- rinv2 = _mm_mul_ps(rinv, rinv);
- rinv2B = _mm_mul_ps(rinvB, rinvB);
- rinv4 = _mm_mul_ps(rinv2, rinv2);
- rinv4B = _mm_mul_ps(rinv2B, rinv2B);
- rinv6 = _mm_mul_ps(rinv4, rinv2);
- rinv6B = _mm_mul_ps(rinv4B, rinv2B);
-
- rvdw = _mm_add_ps(rai, raj);
- rvdwB = _mm_add_ps(rai, rajB);
- ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
- ratioB = _mm_mul_ps(rsqB, gmx_mm_inv_ps( _mm_mul_ps(rvdwB, rvdwB)));
-
- mask_cmp = _mm_cmple_ps(ratio, still_p5inv);
- mask_cmpB = _mm_cmple_ps(ratioB, still_p5inv);
-
- /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
- if (0 == _mm_movemask_ps(mask_cmp) )
- {
- /* if ratio>still_p5inv for ALL elements */
- ccf = one;
- dccf = _mm_setzero_ps();
- }
- else
- {
- ratio = _mm_min_ps(ratio, still_p5inv);
- theta = _mm_mul_ps(ratio, still_pip5);
- gmx_mm_sincos_ps(theta, &sinq, &cosq);
- term = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
- ccf = _mm_mul_ps(term, term);
- dccf = _mm_mul_ps(_mm_mul_ps(two, term),
- _mm_mul_ps(sinq, theta));
- }
- if (0 == _mm_movemask_ps(mask_cmpB) )
- {
- /* if ratio>still_p5inv for ALL elements */
- ccfB = one;
- dccfB = _mm_setzero_ps();
- }
- else
- {
- ratioB = _mm_min_ps(ratioB, still_p5inv);
- thetaB = _mm_mul_ps(ratioB, still_pip5);
- gmx_mm_sincos_ps(thetaB, &sinqB, &cosqB);
- termB = _mm_mul_ps(half, _mm_sub_ps(one, cosqB));
- ccfB = _mm_mul_ps(termB, termB);
- dccfB = _mm_mul_ps(_mm_mul_ps(two, termB),
- _mm_mul_ps(sinqB, thetaB));
- }
-
- prod = _mm_mul_ps(still_p4, vaj);
- prodB = _mm_mul_ps(still_p4, vajB);
- icf4 = _mm_mul_ps(ccf, rinv4);
- icf4B = _mm_mul_ps(ccfB, rinv4B);
- icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
- icf6B = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccfB), dccfB), rinv6B);
-
- GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_mul_ps(prod_ai, icf4));
- GMX_MM_INCREMENT_4VALUES_PS(work+jnrE, work+jnrF, work+jnrG, work+jnrH, _mm_mul_ps(prod_ai, icf4B));
-
- gpi = _mm_add_ps(gpi, _mm_add_ps( _mm_mul_ps(prod, icf4), _mm_mul_ps(prodB, icf4B) ) );
-
- _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
- dadx += 4;
- _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
- dadx += 4;
- _mm_store_ps(dadx, _mm_mul_ps(prodB, icf6B));
- dadx += 4;
- _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6B));
- dadx += 4;
- }
-
- for (; k < nj1-offset; k += 4)
- {
- jnrA = jjnr[k];
- jnrB = jjnr[k+1];
- jnrC = jjnr[k+2];
- jnrD = jjnr[k+3];
-
- j3A = 3*jnrA;
- j3B = 3*jnrB;
- j3C = 3*jnrC;
- j3D = 3*jnrD;
-
- GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
-
- GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
- GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vsolv+jnrD, vaj);
-
- dx = _mm_sub_ps(ix, jx);
- dy = _mm_sub_ps(iy, jy);
- dz = _mm_sub_ps(iz, jz);
-
- rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
- rinv = gmx_mm_invsqrt_ps(rsq);
- rinv2 = _mm_mul_ps(rinv, rinv);
- rinv4 = _mm_mul_ps(rinv2, rinv2);
- rinv6 = _mm_mul_ps(rinv4, rinv2);
-
- rvdw = _mm_add_ps(rai, raj);
- ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
-
- mask_cmp = _mm_cmple_ps(ratio, still_p5inv);
-
- /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
- if (0 == _mm_movemask_ps(mask_cmp))
- {
- /* if ratio>still_p5inv for ALL elements */
- ccf = one;
- dccf = _mm_setzero_ps();
- }
- else
- {
- ratio = _mm_min_ps(ratio, still_p5inv);
- theta = _mm_mul_ps(ratio, still_pip5);
- gmx_mm_sincos_ps(theta, &sinq, &cosq);
- term = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
- ccf = _mm_mul_ps(term, term);
- dccf = _mm_mul_ps(_mm_mul_ps(two, term),
- _mm_mul_ps(sinq, theta));
- }
-
- prod = _mm_mul_ps(still_p4, vaj);
- icf4 = _mm_mul_ps(ccf, rinv4);
- icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
-
- GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_mul_ps(prod_ai, icf4));
-
- gpi = _mm_add_ps(gpi, _mm_mul_ps(prod, icf4));
-
- _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
- dadx += 4;
- _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
- dadx += 4;
- }
-
- if (offset != 0)
- {
- if (offset == 1)
- {
- jnrA = jjnr[k];
- j3A = 3*jnrA;
- GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
- GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA, raj);
- GMX_MM_LOAD_1VALUE_PS(vsolv+jnrA, vaj);
- mask = mask1;
- }
- else if (offset == 2)
- {
- jnrA = jjnr[k];
- jnrB = jjnr[k+1];
- j3A = 3*jnrA;
- j3B = 3*jnrB;
- GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
- GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, raj);
- GMX_MM_LOAD_2VALUES_PS(vsolv+jnrA, vsolv+jnrB, vaj);
- mask = mask2;
- }
- else
- {
- /* offset must be 3 */
- jnrA = jjnr[k];
- jnrB = jjnr[k+1];
- jnrC = jjnr[k+2];
- j3A = 3*jnrA;
- j3B = 3*jnrB;
- j3C = 3*jnrC;
- GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
- GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, raj);
- GMX_MM_LOAD_3VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vaj);
- mask = mask3;
- }
-
- dx = _mm_sub_ps(ix, jx);
- dy = _mm_sub_ps(iy, jy);
- dz = _mm_sub_ps(iz, jz);
-
- rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
- rinv = gmx_mm_invsqrt_ps(rsq);
- rinv2 = _mm_mul_ps(rinv, rinv);
- rinv4 = _mm_mul_ps(rinv2, rinv2);
- rinv6 = _mm_mul_ps(rinv4, rinv2);
-
- rvdw = _mm_add_ps(rai, raj);
- ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
-
- mask_cmp = _mm_cmple_ps(ratio, still_p5inv);
-
- if (0 == _mm_movemask_ps(mask_cmp))
- {
- /* if ratio>still_p5inv for ALL elements */
- ccf = one;
- dccf = _mm_setzero_ps();
- }
- else
- {
- ratio = _mm_min_ps(ratio, still_p5inv);
- theta = _mm_mul_ps(ratio, still_pip5);
- gmx_mm_sincos_ps(theta, &sinq, &cosq);
- term = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
- ccf = _mm_mul_ps(term, term);
- dccf = _mm_mul_ps(_mm_mul_ps(two, term),
- _mm_mul_ps(sinq, theta));
- }
-
- prod = _mm_mul_ps(still_p4, vaj);
- icf4 = _mm_mul_ps(ccf, rinv4);
- icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
-
- gpi = _mm_add_ps(gpi, _mm_mul_ps(prod, icf4));
-
- _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
- dadx += 4;
- _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
- dadx += 4;
-
- tmp = _mm_mul_ps(prod_ai, icf4);
-
- if (offset == 1)
- {
- GMX_MM_INCREMENT_1VALUE_PS(work+jnrA, tmp);
- }
- else if (offset == 2)
- {
- GMX_MM_INCREMENT_2VALUES_PS(work+jnrA, work+jnrB, tmp);
- }
- else
- {
- /* offset must be 3 */
- GMX_MM_INCREMENT_3VALUES_PS(work+jnrA, work+jnrB, work+jnrC, tmp);
- }
- }
- GMX_MM_UPDATE_1POT_PS(gpi, work+ii);
- }
-
- /* Sum up the polarization energy from other nodes */
- if (DOMAINDECOMP(cr))
- {
- dd_atom_sum_real(cr->dd, work);
- }
-
- /* Compute the radii */
- for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
- {
- if (born->use[i] != 0)
- {
-            gpi_ai = born->gpol[i] + work[i]; /* add the accumulated work term to the initial polarization energy */
- gpi2 = gpi_ai * gpi_ai;
- born->bRad[i] = factor*gmx_invsqrt(gpi2);
- fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
- }
- }
-
- /* Extra (local) communication required for DD */
- if (DOMAINDECOMP(cr))
- {
- dd_atom_spread_real(cr->dd, born->bRad);
- dd_atom_spread_real(cr->dd, fr->invsqrta);
- }
-
- return 0;
-}
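
For reference, a minimal scalar sketch of the Still pairwise polarization
term the kernel above vectorizes. The STILL_* constants are the ones the
kernel already uses; the helper itself is illustrative, and the symmetric
j-side contribution (incrementing work[j]) uses the solvation volume of
atom i in the same way:

    #include <math.h>

    static void still_pair(float rsq, float rai, float raj, float vsolv_j,
                           float *gpi, float *dadx_out)
    {
        float rvdw  = rai + raj;
        float ratio = rsq/(rvdw*rvdw);
        float ccf   = 1.0f;     /* switching function: 1 beyond the cutoff */
        float dccf  = 0.0f;     /* and its derivative term                 */

        if (ratio <= STILL_P5INV)
        {
            float theta = ratio*STILL_PIP5;
            float term  = 0.5f*(1.0f - cosf(theta));
            ccf  = term*term;
            dccf = 2.0f*term*sinf(theta)*theta;
        }
        float rinv4 = 1.0f/(rsq*rsq);
        float rinv6 = rinv4/rsq;

        *gpi     += STILL_P4*vsolv_j*ccf*rinv4;                /* energy term */
        *dadx_out = STILL_P4*vsolv_j*(4.0f*ccf - dccf)*rinv6;  /* derivative  */
    }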
-
-
-int
-calc_gb_rad_hct_obc_sse2_single(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
- float *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
-{
- int i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
- int jnrA, jnrB, jnrC, jnrD;
- int j3A, j3B, j3C, j3D;
- int jnrE, jnrF, jnrG, jnrH;
- int j3E, j3F, j3G, j3H;
- float shX, shY, shZ;
- float rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
- float sum_ai2, sum_ai3, tsum, tchain, doffset;
- float *obc_param;
- float *gb_radius;
- float *work;
- int * jjnr;
- float *dadx;
- float *shiftvec;
- float min_rad, rad;
-
- __m128 ix, iy, iz, jx, jy, jz;
- __m128 dx, dy, dz, t1, t2, t3, t4;
- __m128 rsq, rinv, r;
- __m128 rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
- __m128 uij, lij2, uij2, lij3, uij3, diff2;
- __m128 lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum;
- __m128 sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
- __m128 dadx1, dadx2;
- __m128 logterm;
- __m128 mask;
- __m128 obc_mask1, obc_mask2, obc_mask3;
- __m128 jxB, jyB, jzB, t1B, t2B, t3B, t4B;
- __m128 dxB, dyB, dzB, rsqB, rinvB, rB;
- __m128 rajB, raj_invB, rai_inv2B, sk2B, lijB, dlijB, duijB;
- __m128 uijB, lij2B, uij2B, lij3B, uij3B, diff2B;
- __m128 lij_invB, sk2_invB, prodB;
- __m128 sk_ajB, sk2_ajB, sk2_rinvB;
- __m128 dadx1B, dadx2B;
- __m128 logtermB;
- __m128 obc_mask1B, obc_mask2B, obc_mask3B;
-
- __m128 mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
- __m128 mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
- __m128 mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
-
- __m128 oneeighth = _mm_set1_ps(0.125);
- __m128 onefourth = _mm_set1_ps(0.25);
-
- const __m128 half = _mm_set1_ps(0.5f);
- const __m128 three = _mm_set1_ps(3.0f);
- const __m128 one = _mm_set1_ps(1.0f);
- const __m128 two = _mm_set1_ps(2.0f);
- const __m128 zero = _mm_set1_ps(0.0f);
- const __m128 neg = _mm_set1_ps(-1.0f);
-
- /* Set the dielectric offset */
- doffset = born->gb_doffset;
- gb_radius = born->gb_radius;
- obc_param = born->param;
- work = born->gpol_hct_work;
- jjnr = nl->jjnr;
- dadx = fr->dadx;
- shiftvec = fr->shift_vec[0];
-
- jx = _mm_setzero_ps();
- jy = _mm_setzero_ps();
- jz = _mm_setzero_ps();
-
- jnrA = jnrB = jnrC = jnrD = 0;
-
- for (i = 0; i < born->nr; i++)
- {
- work[i] = 0;
- }
-
- for (i = 0; i < nl->nri; i++)
- {
- ii = nl->iinr[i];
- ii3 = ii*3;
- is3 = 3*nl->shift[i];
- shX = shiftvec[is3];
- shY = shiftvec[is3+1];
- shZ = shiftvec[is3+2];
- nj0 = nl->jindex[i];
- nj1 = nl->jindex[i+1];
-
- ix = _mm_set1_ps(shX+x[ii3+0]);
- iy = _mm_set1_ps(shY+x[ii3+1]);
- iz = _mm_set1_ps(shZ+x[ii3+2]);
-
- offset = (nj1-nj0)%4;
-
- rai = _mm_load1_ps(gb_radius+ii);
- rai_inv = gmx_mm_inv_ps(rai);
-
- sum_ai = _mm_setzero_ps();
-
- sk_ai = _mm_load1_ps(born->param+ii);
- sk2_ai = _mm_mul_ps(sk_ai, sk_ai);
-
- for (k = nj0; k < nj1-4-offset; k += 8)
- {
- jnrA = jjnr[k];
- jnrB = jjnr[k+1];
- jnrC = jjnr[k+2];
- jnrD = jjnr[k+3];
- jnrE = jjnr[k+4];
- jnrF = jjnr[k+5];
- jnrG = jjnr[k+6];
- jnrH = jjnr[k+7];
-
- j3A = 3*jnrA;
- j3B = 3*jnrB;
- j3C = 3*jnrC;
- j3D = 3*jnrD;
- j3E = 3*jnrE;
- j3F = 3*jnrF;
- j3G = 3*jnrG;
- j3H = 3*jnrH;
-
- GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
- GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E, x+j3F, x+j3G, x+j3H, jxB, jyB, jzB);
- GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
- GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE, gb_radius+jnrF, gb_radius+jnrG, gb_radius+jnrH, rajB);
- GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, obc_param+jnrD, sk_aj);
- GMX_MM_LOAD_4VALUES_PS(obc_param+jnrE, obc_param+jnrF, obc_param+jnrG, obc_param+jnrH, sk_ajB);
-
- dx = _mm_sub_ps(ix, jx);
- dy = _mm_sub_ps(iy, jy);
- dz = _mm_sub_ps(iz, jz);
- dxB = _mm_sub_ps(ix, jxB);
- dyB = _mm_sub_ps(iy, jyB);
- dzB = _mm_sub_ps(iz, jzB);
-
- rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
- rsqB = gmx_mm_calc_rsq_ps(dxB, dyB, dzB);
-
- rinv = gmx_mm_invsqrt_ps(rsq);
- r = _mm_mul_ps(rsq, rinv);
- rinvB = gmx_mm_invsqrt_ps(rsqB);
- rB = _mm_mul_ps(rsqB, rinvB);
-
-            /* Compute raj_inv for aj1-4 and aj5-8 */
- raj_inv = gmx_mm_inv_ps(raj);
- raj_invB = gmx_mm_inv_ps(rajB);
-
- /* Evaluate influence of atom aj -> ai */
- t1 = _mm_add_ps(r, sk_aj);
- t2 = _mm_sub_ps(r, sk_aj);
- t3 = _mm_sub_ps(sk_aj, r);
- t1B = _mm_add_ps(rB, sk_ajB);
- t2B = _mm_sub_ps(rB, sk_ajB);
- t3B = _mm_sub_ps(sk_ajB, rB);
- obc_mask1 = _mm_cmplt_ps(rai, t1);
- obc_mask2 = _mm_cmplt_ps(rai, t2);
- obc_mask3 = _mm_cmplt_ps(rai, t3);
- obc_mask1B = _mm_cmplt_ps(rai, t1B);
- obc_mask2B = _mm_cmplt_ps(rai, t2B);
- obc_mask3B = _mm_cmplt_ps(rai, t3B);
-
- uij = gmx_mm_inv_ps(t1);
- lij = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
- _mm_andnot_ps(obc_mask2, rai_inv));
- dlij = _mm_and_ps(one, obc_mask2);
- uij2 = _mm_mul_ps(uij, uij);
- uij3 = _mm_mul_ps(uij2, uij);
- lij2 = _mm_mul_ps(lij, lij);
- lij3 = _mm_mul_ps(lij2, lij);
-
- uijB = gmx_mm_inv_ps(t1B);
- lijB = _mm_or_ps( _mm_and_ps(obc_mask2B, gmx_mm_inv_ps(t2B)),
- _mm_andnot_ps(obc_mask2B, rai_inv));
- dlijB = _mm_and_ps(one, obc_mask2B);
- uij2B = _mm_mul_ps(uijB, uijB);
- uij3B = _mm_mul_ps(uij2B, uijB);
- lij2B = _mm_mul_ps(lijB, lijB);
- lij3B = _mm_mul_ps(lij2B, lijB);
-
- diff2 = _mm_sub_ps(uij2, lij2);
- lij_inv = gmx_mm_invsqrt_ps(lij2);
- sk2_aj = _mm_mul_ps(sk_aj, sk_aj);
- sk2_rinv = _mm_mul_ps(sk2_aj, rinv);
- prod = _mm_mul_ps(onefourth, sk2_rinv);
-
- diff2B = _mm_sub_ps(uij2B, lij2B);
- lij_invB = gmx_mm_invsqrt_ps(lij2B);
- sk2_ajB = _mm_mul_ps(sk_ajB, sk_ajB);
- sk2_rinvB = _mm_mul_ps(sk2_ajB, rinvB);
- prodB = _mm_mul_ps(onefourth, sk2_rinvB);
-
- logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
- logtermB = gmx_mm_log_ps(_mm_mul_ps(uijB, lij_invB));
-
- t1 = _mm_sub_ps(lij, uij);
- t2 = _mm_mul_ps(diff2,
- _mm_sub_ps(_mm_mul_ps(onefourth, r),
- prod));
- t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
- t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
- t4 = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
- t4 = _mm_and_ps(t4, obc_mask3);
- t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));
-
- t1B = _mm_sub_ps(lijB, uijB);
- t2B = _mm_mul_ps(diff2B,
- _mm_sub_ps(_mm_mul_ps(onefourth, rB),
- prodB));
- t3B = _mm_mul_ps(half, _mm_mul_ps(rinvB, logtermB));
- t1B = _mm_add_ps(t1B, _mm_add_ps(t2B, t3B));
- t4B = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lijB));
- t4B = _mm_and_ps(t4B, obc_mask3B);
- t1B = _mm_mul_ps(half, _mm_add_ps(t1B, t4B));
-
- sum_ai = _mm_add_ps(sum_ai, _mm_add_ps( _mm_and_ps(t1, obc_mask1), _mm_and_ps(t1B, obc_mask1B) ));
-
- t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
- _mm_mul_ps(prod, lij3));
- t1 = _mm_sub_ps(t1,
- _mm_mul_ps(onefourth,
- _mm_add_ps(_mm_mul_ps(lij, rinv),
- _mm_mul_ps(lij3, r))));
- t2 = _mm_mul_ps(onefourth,
- _mm_add_ps(_mm_mul_ps(uij, rinv),
- _mm_mul_ps(uij3, r)));
- t2 = _mm_sub_ps(t2,
- _mm_add_ps(_mm_mul_ps(half, uij2),
- _mm_mul_ps(prod, uij3)));
- t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
- _mm_mul_ps(rinv, rinv));
- t3 = _mm_sub_ps(t3,
- _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
- _mm_add_ps(one,
- _mm_mul_ps(sk2_rinv, rinv))));
- t1 = _mm_mul_ps(rinv,
- _mm_add_ps(_mm_mul_ps(dlij, t1),
- _mm_add_ps(t2, t3)));
-
-
-
- t1B = _mm_add_ps(_mm_mul_ps(half, lij2B),
- _mm_mul_ps(prodB, lij3B));
- t1B = _mm_sub_ps(t1B,
- _mm_mul_ps(onefourth,
- _mm_add_ps(_mm_mul_ps(lijB, rinvB),
- _mm_mul_ps(lij3B, rB))));
- t2B = _mm_mul_ps(onefourth,
- _mm_add_ps(_mm_mul_ps(uijB, rinvB),
- _mm_mul_ps(uij3B, rB)));
- t2B = _mm_sub_ps(t2B,
- _mm_add_ps(_mm_mul_ps(half, uij2B),
- _mm_mul_ps(prodB, uij3B)));
- t3B = _mm_mul_ps(_mm_mul_ps(onefourth, logtermB),
- _mm_mul_ps(rinvB, rinvB));
- t3B = _mm_sub_ps(t3B,
- _mm_mul_ps(_mm_mul_ps(diff2B, oneeighth),
- _mm_add_ps(one,
- _mm_mul_ps(sk2_rinvB, rinvB))));
- t1B = _mm_mul_ps(rinvB,
- _mm_add_ps(_mm_mul_ps(dlijB, t1B),
- _mm_add_ps(t2B, t3B)));
-
- dadx1 = _mm_and_ps(t1, obc_mask1);
- dadx1B = _mm_and_ps(t1B, obc_mask1B);
-
-
- /* Evaluate influence of atom ai -> aj */
- t1 = _mm_add_ps(r, sk_ai);
- t2 = _mm_sub_ps(r, sk_ai);
- t3 = _mm_sub_ps(sk_ai, r);
- t1B = _mm_add_ps(rB, sk_ai);
- t2B = _mm_sub_ps(rB, sk_ai);
- t3B = _mm_sub_ps(sk_ai, rB);
- obc_mask1 = _mm_cmplt_ps(raj, t1);
- obc_mask2 = _mm_cmplt_ps(raj, t2);
- obc_mask3 = _mm_cmplt_ps(raj, t3);
- obc_mask1B = _mm_cmplt_ps(rajB, t1B);
- obc_mask2B = _mm_cmplt_ps(rajB, t2B);
- obc_mask3B = _mm_cmplt_ps(rajB, t3B);
-
- uij = gmx_mm_inv_ps(t1);
- lij = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
- _mm_andnot_ps(obc_mask2, raj_inv));
- dlij = _mm_and_ps(one, obc_mask2);
- uij2 = _mm_mul_ps(uij, uij);
- uij3 = _mm_mul_ps(uij2, uij);
- lij2 = _mm_mul_ps(lij, lij);
- lij3 = _mm_mul_ps(lij2, lij);
-
- uijB = gmx_mm_inv_ps(t1B);
- lijB = _mm_or_ps( _mm_and_ps(obc_mask2B, gmx_mm_inv_ps(t2B)),
- _mm_andnot_ps(obc_mask2B, raj_invB));
- dlijB = _mm_and_ps(one, obc_mask2B);
- uij2B = _mm_mul_ps(uijB, uijB);
- uij3B = _mm_mul_ps(uij2B, uijB);
- lij2B = _mm_mul_ps(lijB, lijB);
- lij3B = _mm_mul_ps(lij2B, lijB);
-
- diff2 = _mm_sub_ps(uij2, lij2);
- lij_inv = gmx_mm_invsqrt_ps(lij2);
- sk2_rinv = _mm_mul_ps(sk2_ai, rinv);
- prod = _mm_mul_ps(onefourth, sk2_rinv);
-
- diff2B = _mm_sub_ps(uij2B, lij2B);
- lij_invB = gmx_mm_invsqrt_ps(lij2B);
- sk2_rinvB = _mm_mul_ps(sk2_ai, rinvB);
- prodB = _mm_mul_ps(onefourth, sk2_rinvB);
-
- logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
- logtermB = gmx_mm_log_ps(_mm_mul_ps(uijB, lij_invB));
-
- t1 = _mm_sub_ps(lij, uij);
- t2 = _mm_mul_ps(diff2,
- _mm_sub_ps(_mm_mul_ps(onefourth, r),
- prod));
- t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
- t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
- t4 = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
- t4 = _mm_and_ps(t4, obc_mask3);
- t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));
-
- t1B = _mm_sub_ps(lijB, uijB);
- t2B = _mm_mul_ps(diff2B,
- _mm_sub_ps(_mm_mul_ps(onefourth, rB),
- prodB));
- t3B = _mm_mul_ps(half, _mm_mul_ps(rinvB, logtermB));
- t1B = _mm_add_ps(t1B, _mm_add_ps(t2B, t3B));
- t4B = _mm_mul_ps(two, _mm_sub_ps(raj_invB, lijB));
- t4B = _mm_and_ps(t4B, obc_mask3B);
- t1B = _mm_mul_ps(half, _mm_add_ps(t1B, t4B));
-
- GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_and_ps(t1, obc_mask1));
- GMX_MM_INCREMENT_4VALUES_PS(work+jnrE, work+jnrF, work+jnrG, work+jnrH, _mm_and_ps(t1B, obc_mask1B));
-
- t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
- _mm_mul_ps(prod, lij3));
- t1 = _mm_sub_ps(t1,
- _mm_mul_ps(onefourth,
- _mm_add_ps(_mm_mul_ps(lij, rinv),
- _mm_mul_ps(lij3, r))));
- t2 = _mm_mul_ps(onefourth,
- _mm_add_ps(_mm_mul_ps(uij, rinv),
- _mm_mul_ps(uij3, r)));
- t2 = _mm_sub_ps(t2,
- _mm_add_ps(_mm_mul_ps(half, uij2),
- _mm_mul_ps(prod, uij3)));
- t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
- _mm_mul_ps(rinv, rinv));
- t3 = _mm_sub_ps(t3,
- _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
- _mm_add_ps(one,
- _mm_mul_ps(sk2_rinv, rinv))));
- t1 = _mm_mul_ps(rinv,
- _mm_add_ps(_mm_mul_ps(dlij, t1),
- _mm_add_ps(t2, t3)));
-
-
- t1B = _mm_add_ps(_mm_mul_ps(half, lij2B),
- _mm_mul_ps(prodB, lij3B));
- t1B = _mm_sub_ps(t1B,
- _mm_mul_ps(onefourth,
- _mm_add_ps(_mm_mul_ps(lijB, rinvB),
- _mm_mul_ps(lij3B, rB))));
- t2B = _mm_mul_ps(onefourth,
- _mm_add_ps(_mm_mul_ps(uijB, rinvB),
- _mm_mul_ps(uij3B, rB)));
- t2B = _mm_sub_ps(t2B,
- _mm_add_ps(_mm_mul_ps(half, uij2B),
- _mm_mul_ps(prodB, uij3B)));
- t3B = _mm_mul_ps(_mm_mul_ps(onefourth, logtermB),
- _mm_mul_ps(rinvB, rinvB));
- t3B = _mm_sub_ps(t3B,
- _mm_mul_ps(_mm_mul_ps(diff2B, oneeighth),
- _mm_add_ps(one,
- _mm_mul_ps(sk2_rinvB, rinvB))));
- t1B = _mm_mul_ps(rinvB,
- _mm_add_ps(_mm_mul_ps(dlijB, t1B),
- _mm_add_ps(t2B, t3B)));
-
-
- dadx2 = _mm_and_ps(t1, obc_mask1);
- dadx2B = _mm_and_ps(t1B, obc_mask1B);
-
- _mm_store_ps(dadx, dadx1);
- dadx += 4;
- _mm_store_ps(dadx, dadx2);
- dadx += 4;
- _mm_store_ps(dadx, dadx1B);
- dadx += 4;
- _mm_store_ps(dadx, dadx2B);
- dadx += 4;
-
-        } /* end 8-way unrolled inner loop */
-
- for (; k < nj1-offset; k += 4)
- {
- jnrA = jjnr[k];
- jnrB = jjnr[k+1];
- jnrC = jjnr[k+2];
- jnrD = jjnr[k+3];
-
- j3A = 3*jnrA;
- j3B = 3*jnrB;
- j3C = 3*jnrC;
- j3D = 3*jnrD;
-
- GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
- GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
- GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, obc_param+jnrD, sk_aj);
-
- dx = _mm_sub_ps(ix, jx);
- dy = _mm_sub_ps(iy, jy);
- dz = _mm_sub_ps(iz, jz);
-
- rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
-
- rinv = gmx_mm_invsqrt_ps(rsq);
- r = _mm_mul_ps(rsq, rinv);
-
- /* Compute raj_inv aj1-4 */
- raj_inv = gmx_mm_inv_ps(raj);
-
- /* Evaluate influence of atom aj -> ai */
- t1 = _mm_add_ps(r, sk_aj);
- obc_mask1 = _mm_cmplt_ps(rai, t1);
-
- if (_mm_movemask_ps(obc_mask1))
- {
- /* If any of the elements has rai<dr+sk, this is executed */
- t2 = _mm_sub_ps(r, sk_aj);
- t3 = _mm_sub_ps(sk_aj, r);
-
- obc_mask2 = _mm_cmplt_ps(rai, t2);
- obc_mask3 = _mm_cmplt_ps(rai, t3);
-
- uij = gmx_mm_inv_ps(t1);
- lij = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
- _mm_andnot_ps(obc_mask2, rai_inv));
- dlij = _mm_and_ps(one, obc_mask2);
- uij2 = _mm_mul_ps(uij, uij);
- uij3 = _mm_mul_ps(uij2, uij);
- lij2 = _mm_mul_ps(lij, lij);
- lij3 = _mm_mul_ps(lij2, lij);
- diff2 = _mm_sub_ps(uij2, lij2);
- lij_inv = gmx_mm_invsqrt_ps(lij2);
- sk2_aj = _mm_mul_ps(sk_aj, sk_aj);
- sk2_rinv = _mm_mul_ps(sk2_aj, rinv);
- prod = _mm_mul_ps(onefourth, sk2_rinv);
- logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
- t1 = _mm_sub_ps(lij, uij);
- t2 = _mm_mul_ps(diff2,
- _mm_sub_ps(_mm_mul_ps(onefourth, r),
- prod));
- t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
- t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
- t4 = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
- t4 = _mm_and_ps(t4, obc_mask3);
- t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));
- sum_ai = _mm_add_ps(sum_ai, _mm_and_ps(t1, obc_mask1));
- t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
- _mm_mul_ps(prod, lij3));
- t1 = _mm_sub_ps(t1,
- _mm_mul_ps(onefourth,
- _mm_add_ps(_mm_mul_ps(lij, rinv),
- _mm_mul_ps(lij3, r))));
- t2 = _mm_mul_ps(onefourth,
- _mm_add_ps(_mm_mul_ps(uij, rinv),
- _mm_mul_ps(uij3, r)));
- t2 = _mm_sub_ps(t2,
- _mm_add_ps(_mm_mul_ps(half, uij2),
- _mm_mul_ps(prod, uij3)));
- t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
- _mm_mul_ps(rinv, rinv));
- t3 = _mm_sub_ps(t3,
- _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
- _mm_add_ps(one,
- _mm_mul_ps(sk2_rinv, rinv))));
- t1 = _mm_mul_ps(rinv,
- _mm_add_ps(_mm_mul_ps(dlij, t1),
- _mm_add_ps(t2, t3)));
-
- dadx1 = _mm_and_ps(t1, obc_mask1);
- }
- else
- {
- dadx1 = _mm_setzero_ps();
- }
-
- /* Evaluate influence of atom ai -> aj */
- t1 = _mm_add_ps(r, sk_ai);
- obc_mask1 = _mm_cmplt_ps(raj, t1);
-
- if (_mm_movemask_ps(obc_mask1))
- {
- t2 = _mm_sub_ps(r, sk_ai);
- t3 = _mm_sub_ps(sk_ai, r);
- obc_mask2 = _mm_cmplt_ps(raj, t2);
- obc_mask3 = _mm_cmplt_ps(raj, t3);
-
- uij = gmx_mm_inv_ps(t1);
- lij = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
- _mm_andnot_ps(obc_mask2, raj_inv));
- dlij = _mm_and_ps(one, obc_mask2);
- uij2 = _mm_mul_ps(uij, uij);
- uij3 = _mm_mul_ps(uij2, uij);
- lij2 = _mm_mul_ps(lij, lij);
- lij3 = _mm_mul_ps(lij2, lij);
- diff2 = _mm_sub_ps(uij2, lij2);
- lij_inv = gmx_mm_invsqrt_ps(lij2);
- sk2_rinv = _mm_mul_ps(sk2_ai, rinv);
- prod = _mm_mul_ps(onefourth, sk2_rinv);
- logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
- t1 = _mm_sub_ps(lij, uij);
- t2 = _mm_mul_ps(diff2,
- _mm_sub_ps(_mm_mul_ps(onefourth, r),
- prod));
- t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
- t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
- t4 = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
- t4 = _mm_and_ps(t4, obc_mask3);
- t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));
-
- GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_and_ps(t1, obc_mask1));
-
- t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
- _mm_mul_ps(prod, lij3));
- t1 = _mm_sub_ps(t1,
- _mm_mul_ps(onefourth,
- _mm_add_ps(_mm_mul_ps(lij, rinv),
- _mm_mul_ps(lij3, r))));
- t2 = _mm_mul_ps(onefourth,
- _mm_add_ps(_mm_mul_ps(uij, rinv),
- _mm_mul_ps(uij3, r)));
- t2 = _mm_sub_ps(t2,
- _mm_add_ps(_mm_mul_ps(half, uij2),
- _mm_mul_ps(prod, uij3)));
- t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
- _mm_mul_ps(rinv, rinv));
- t3 = _mm_sub_ps(t3,
- _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
- _mm_add_ps(one,
- _mm_mul_ps(sk2_rinv, rinv))));
- t1 = _mm_mul_ps(rinv,
- _mm_add_ps(_mm_mul_ps(dlij, t1),
- _mm_add_ps(t2, t3)));
- dadx2 = _mm_and_ps(t1, obc_mask1);
- }
- else
- {
- dadx2 = _mm_setzero_ps();
- }
-
- _mm_store_ps(dadx, dadx1);
- dadx += 4;
- _mm_store_ps(dadx, dadx2);
- dadx += 4;
-        } /* end 4-way inner loop */
-
- if (offset != 0)
- {
- if (offset == 1)
- {
- jnrA = jjnr[k];
- j3A = 3*jnrA;
- GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
- GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA, raj);
- GMX_MM_LOAD_1VALUE_PS(obc_param+jnrA, sk_aj);
- mask = mask1;
- }
- else if (offset == 2)
- {
- jnrA = jjnr[k];
- jnrB = jjnr[k+1];
- j3A = 3*jnrA;
- j3B = 3*jnrB;
- GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
- GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, raj);
- GMX_MM_LOAD_2VALUES_PS(obc_param+jnrA, obc_param+jnrB, sk_aj);
- mask = mask2;
- }
- else
- {
- /* offset must be 3 */
- jnrA = jjnr[k];
- jnrB = jjnr[k+1];
- jnrC = jjnr[k+2];
- j3A = 3*jnrA;
- j3B = 3*jnrB;
- j3C = 3*jnrC;
- GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
- GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, raj);
- GMX_MM_LOAD_3VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, sk_aj);
- mask = mask3;
- }
-
- dx = _mm_sub_ps(ix, jx);
- dy = _mm_sub_ps(iy, jy);
- dz = _mm_sub_ps(iz, jz);
-
- rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
-
- rinv = gmx_mm_invsqrt_ps(rsq);
- r = _mm_mul_ps(rsq, rinv);
-
-            /* Compute raj_inv for the remaining aj elements */
- raj_inv = gmx_mm_inv_ps(raj);
-
- /* Evaluate influence of atom aj -> ai */
- t1 = _mm_add_ps(r, sk_aj);
- obc_mask1 = _mm_cmplt_ps(rai, t1);
- obc_mask1 = _mm_and_ps(obc_mask1, mask);
-
- if (_mm_movemask_ps(obc_mask1))
- {
- t2 = _mm_sub_ps(r, sk_aj);
- t3 = _mm_sub_ps(sk_aj, r);
- obc_mask2 = _mm_cmplt_ps(rai, t2);
- obc_mask3 = _mm_cmplt_ps(rai, t3);
-
- uij = gmx_mm_inv_ps(t1);
- lij = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
- _mm_andnot_ps(obc_mask2, rai_inv));
- dlij = _mm_and_ps(one, obc_mask2);
- uij2 = _mm_mul_ps(uij, uij);
- uij3 = _mm_mul_ps(uij2, uij);
- lij2 = _mm_mul_ps(lij, lij);
- lij3 = _mm_mul_ps(lij2, lij);
- diff2 = _mm_sub_ps(uij2, lij2);
- lij_inv = gmx_mm_invsqrt_ps(lij2);
- sk2_aj = _mm_mul_ps(sk_aj, sk_aj);
- sk2_rinv = _mm_mul_ps(sk2_aj, rinv);
- prod = _mm_mul_ps(onefourth, sk2_rinv);
- logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
- t1 = _mm_sub_ps(lij, uij);
- t2 = _mm_mul_ps(diff2,
- _mm_sub_ps(_mm_mul_ps(onefourth, r),
- prod));
- t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
- t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
- t4 = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
- t4 = _mm_and_ps(t4, obc_mask3);
- t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));
- sum_ai = _mm_add_ps(sum_ai, _mm_and_ps(t1, obc_mask1));
- t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
- _mm_mul_ps(prod, lij3));
- t1 = _mm_sub_ps(t1,
- _mm_mul_ps(onefourth,
- _mm_add_ps(_mm_mul_ps(lij, rinv),
- _mm_mul_ps(lij3, r))));
- t2 = _mm_mul_ps(onefourth,
- _mm_add_ps(_mm_mul_ps(uij, rinv),
- _mm_mul_ps(uij3, r)));
- t2 = _mm_sub_ps(t2,
- _mm_add_ps(_mm_mul_ps(half, uij2),
- _mm_mul_ps(prod, uij3)));
- t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
- _mm_mul_ps(rinv, rinv));
- t3 = _mm_sub_ps(t3,
- _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
- _mm_add_ps(one,
- _mm_mul_ps(sk2_rinv, rinv))));
- t1 = _mm_mul_ps(rinv,
- _mm_add_ps(_mm_mul_ps(dlij, t1),
- _mm_add_ps(t2, t3)));
- dadx1 = _mm_and_ps(t1, obc_mask1);
- }
- else
- {
- dadx1 = _mm_setzero_ps();
- }
-
- /* Evaluate influence of atom ai -> aj */
- t1 = _mm_add_ps(r, sk_ai);
- obc_mask1 = _mm_cmplt_ps(raj, t1);
- obc_mask1 = _mm_and_ps(obc_mask1, mask);
-
- if (_mm_movemask_ps(obc_mask1))
- {
- t2 = _mm_sub_ps(r, sk_ai);
- t3 = _mm_sub_ps(sk_ai, r);
- obc_mask2 = _mm_cmplt_ps(raj, t2);
- obc_mask3 = _mm_cmplt_ps(raj, t3);
-
- uij = gmx_mm_inv_ps(t1);
- lij = _mm_or_ps(_mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
- _mm_andnot_ps(obc_mask2, raj_inv));
- dlij = _mm_and_ps(one, obc_mask2);
- uij2 = _mm_mul_ps(uij, uij);
- uij3 = _mm_mul_ps(uij2, uij);
- lij2 = _mm_mul_ps(lij, lij);
- lij3 = _mm_mul_ps(lij2, lij);
- diff2 = _mm_sub_ps(uij2, lij2);
- lij_inv = gmx_mm_invsqrt_ps(lij2);
- sk2_rinv = _mm_mul_ps(sk2_ai, rinv);
- prod = _mm_mul_ps(onefourth, sk2_rinv);
- logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
- t1 = _mm_sub_ps(lij, uij);
- t2 = _mm_mul_ps(diff2,
- _mm_sub_ps(_mm_mul_ps(onefourth, r),
- prod));
- t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
- t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
- t4 = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
- t4 = _mm_and_ps(t4, obc_mask3);
- t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));
-
- tmp = _mm_and_ps(t1, obc_mask1);
-
- t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
- _mm_mul_ps(prod, lij3));
- t1 = _mm_sub_ps(t1,
- _mm_mul_ps(onefourth,
- _mm_add_ps(_mm_mul_ps(lij, rinv),
- _mm_mul_ps(lij3, r))));
- t2 = _mm_mul_ps(onefourth,
- _mm_add_ps(_mm_mul_ps(uij, rinv),
- _mm_mul_ps(uij3, r)));
- t2 = _mm_sub_ps(t2,
- _mm_add_ps(_mm_mul_ps(half, uij2),
- _mm_mul_ps(prod, uij3)));
- t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
- _mm_mul_ps(rinv, rinv));
- t3 = _mm_sub_ps(t3,
- _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
- _mm_add_ps(one,
- _mm_mul_ps(sk2_rinv, rinv))));
- t1 = _mm_mul_ps(rinv,
- _mm_add_ps(_mm_mul_ps(dlij, t1),
- _mm_add_ps(t2, t3)));
- dadx2 = _mm_and_ps(t1, obc_mask1);
- }
- else
- {
- dadx2 = _mm_setzero_ps();
- tmp = _mm_setzero_ps();
- }
-
- _mm_store_ps(dadx, dadx1);
- dadx += 4;
- _mm_store_ps(dadx, dadx2);
- dadx += 4;
-
- if (offset == 1)
- {
- GMX_MM_INCREMENT_1VALUE_PS(work+jnrA, tmp);
- }
- else if (offset == 2)
- {
- GMX_MM_INCREMENT_2VALUES_PS(work+jnrA, work+jnrB, tmp);
- }
- else
- {
- /* offset must be 3 */
- GMX_MM_INCREMENT_3VALUES_PS(work+jnrA, work+jnrB, work+jnrC, tmp);
- }
-
- }
- GMX_MM_UPDATE_1POT_PS(sum_ai, work+ii);
-
- }
-
- /* Parallel summations */
- if (DOMAINDECOMP(cr))
- {
- dd_atom_sum_real(cr->dd, work);
- }
-
- if (gb_algorithm == egbHCT)
- {
- /* HCT */
- for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
- {
- if (born->use[i] != 0)
- {
- rr = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
- sum = 1.0/rr - work[i];
- min_rad = rr + doffset;
- rad = 1.0/sum;
-
- born->bRad[i] = rad > min_rad ? rad : min_rad;
- fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
- }
- }
-
- /* Extra communication required for DD */
- if (DOMAINDECOMP(cr))
- {
- dd_atom_spread_real(cr->dd, born->bRad);
- dd_atom_spread_real(cr->dd, fr->invsqrta);
- }
- }
- else
- {
- /* OBC */
- for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
- {
- if (born->use[i] != 0)
- {
- rr = top->atomtypes.gb_radius[md->typeA[i]];
- rr_inv2 = 1.0/rr;
- rr = rr-doffset;
- rr_inv = 1.0/rr;
- sum = rr * work[i];
- sum2 = sum * sum;
- sum3 = sum2 * sum;
-
- tsum = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
- born->bRad[i] = rr_inv - tsum*rr_inv2;
- born->bRad[i] = 1.0 / born->bRad[i];
-
- fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
-
- tchain = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
- born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
- }
- }
- /* Extra (local) communication required for DD */
- if (DOMAINDECOMP(cr))
- {
- dd_atom_spread_real(cr->dd, born->bRad);
- dd_atom_spread_real(cr->dd, fr->invsqrta);
- dd_atom_spread_real(cr->dd, born->drobc);
- }
- }
-
-
-
- return 0;
-}
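
The OBC branch above implements the Onufriev-Bashford-Case rescaling: with
psi = (r_i - d)*Sigma_i, the effective radius satisfies
1/b_i = 1/(r_i - d) - tanh(alpha*psi - beta*psi^2 + gamma*psi^3)/r_i.
A scalar sketch of the loop body, with illustrative names:

    #include <math.h>

    static float obc_radius(float rr, float doffset, float sum_i,
                            float alpha, float beta, float gamma,
                            float *drobc)
    {
        float ri_inv = 1.0f/rr;        /* 1/r_i, unshifted radius */
        float rs     = rr - doffset;   /* r_i - d                 */
        float psi    = rs*sum_i;
        float psi2   = psi*psi;
        float tsum   = tanhf(alpha*psi - beta*psi2 + gamma*psi2*psi);

        float brad_inv = 1.0f/rs - tsum*ri_inv;
        /* Chain-rule factor consumed later by calc_gb_chainrule_*() */
        float tchain = rs*(alpha - 2.0f*beta*psi + 3.0f*gamma*psi2);
        *drobc       = (1.0f - tsum*tsum)*tchain*ri_inv;
        return 1.0f/brad_inv;
    }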
-
-
-
-float calc_gb_chainrule_sse2_single(int natoms, t_nblist *nl, float *dadx, float *dvda,
- float *x, float *f, float *fshift, float *shiftvec,
- int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
-{
- int i, k, n, ii, jnr, ii3, is3, nj0, nj1, offset, n0, n1;
- int jnrA, jnrB, jnrC, jnrD;
- int j3A, j3B, j3C, j3D;
- int jnrE, jnrF, jnrG, jnrH;
- int j3E, j3F, j3G, j3H;
- int * jjnr;
-
- float rbi, shX, shY, shZ;
- float *rb;
-
- __m128 ix, iy, iz;
- __m128 jx, jy, jz;
- __m128 jxB, jyB, jzB;
- __m128 fix, fiy, fiz;
- __m128 dx, dy, dz;
- __m128 tx, ty, tz;
- __m128 dxB, dyB, dzB;
- __m128 txB, tyB, tzB;
-
- __m128 rbai, rbaj, rbajB, f_gb, f_gb_ai, f_gbB, f_gb_aiB;
- __m128 xmm1, xmm2, xmm3;
-
- const __m128 two = _mm_set1_ps(2.0f);
-
- rb = born->work;
-
- jjnr = nl->jjnr;
-
-    /* Loop to get the proper form for the Born radius term, SSE style */
- offset = natoms%4;
-
- n0 = 0;
- n1 = natoms;
-
- if (gb_algorithm == egbSTILL)
- {
- for (i = n0; i < n1; i++)
- {
- rbi = born->bRad[i];
- rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
- }
- }
- else if (gb_algorithm == egbHCT)
- {
- for (i = n0; i < n1; i++)
- {
- rbi = born->bRad[i];
- rb[i] = rbi * rbi * dvda[i];
- }
- }
- else if (gb_algorithm == egbOBC)
- {
- for (i = n0; i < n1; i++)
- {
- rbi = born->bRad[i];
- rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
- }
- }
-
- jz = _mm_setzero_ps();
-
- n = j3A = j3B = j3C = j3D = 0;
-
- for (i = 0; i < nl->nri; i++)
- {
- ii = nl->iinr[i];
- ii3 = ii*3;
- is3 = 3*nl->shift[i];
- shX = shiftvec[is3];
- shY = shiftvec[is3+1];
- shZ = shiftvec[is3+2];
- nj0 = nl->jindex[i];
- nj1 = nl->jindex[i+1];
-
- ix = _mm_set1_ps(shX+x[ii3+0]);
- iy = _mm_set1_ps(shY+x[ii3+1]);
- iz = _mm_set1_ps(shZ+x[ii3+2]);
-
- offset = (nj1-nj0)%4;
-
- rbai = _mm_load1_ps(rb+ii);
- fix = _mm_setzero_ps();
- fiy = _mm_setzero_ps();
- fiz = _mm_setzero_ps();
-
-
- for (k = nj0; k < nj1-offset; k += 4)
- {
- jnrA = jjnr[k];
- jnrB = jjnr[k+1];
- jnrC = jjnr[k+2];
- jnrD = jjnr[k+3];
-
- j3A = 3*jnrA;
- j3B = 3*jnrB;
- j3C = 3*jnrC;
- j3D = 3*jnrD;
-
- GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
-
- dx = _mm_sub_ps(ix, jx);
- dy = _mm_sub_ps(iy, jy);
- dz = _mm_sub_ps(iz, jz);
-
- GMX_MM_LOAD_4VALUES_PS(rb+jnrA, rb+jnrB, rb+jnrC, rb+jnrD, rbaj);
-
- /* load chain rule terms for j1-4 */
- f_gb = _mm_load_ps(dadx);
- dadx += 4;
- f_gb_ai = _mm_load_ps(dadx);
- dadx += 4;
-
- /* calculate scalar force */
- f_gb = _mm_mul_ps(f_gb, rbai);
- f_gb_ai = _mm_mul_ps(f_gb_ai, rbaj);
- f_gb = _mm_add_ps(f_gb, f_gb_ai);
-
- tx = _mm_mul_ps(f_gb, dx);
- ty = _mm_mul_ps(f_gb, dy);
- tz = _mm_mul_ps(f_gb, dz);
-
- fix = _mm_add_ps(fix, tx);
- fiy = _mm_add_ps(fiy, ty);
- fiz = _mm_add_ps(fiz, tz);
-
- GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(f+j3A, f+j3B, f+j3C, f+j3D, tx, ty, tz);
- }
-
-        /* Deal with the remaining 1-3 odd elements */
- if (offset != 0)
- {
- if (offset == 1)
- {
- jnrA = jjnr[k];
- j3A = 3*jnrA;
- GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
- GMX_MM_LOAD_1VALUE_PS(rb+jnrA, rbaj);
- }
- else if (offset == 2)
- {
- jnrA = jjnr[k];
- jnrB = jjnr[k+1];
- j3A = 3*jnrA;
- j3B = 3*jnrB;
- GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
- GMX_MM_LOAD_2VALUES_PS(rb+jnrA, rb+jnrB, rbaj);
- }
- else
- {
- /* offset must be 3 */
- jnrA = jjnr[k];
- jnrB = jjnr[k+1];
- jnrC = jjnr[k+2];
- j3A = 3*jnrA;
- j3B = 3*jnrB;
- j3C = 3*jnrC;
- GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
- GMX_MM_LOAD_3VALUES_PS(rb+jnrA, rb+jnrB, rb+jnrC, rbaj);
- }
-
- dx = _mm_sub_ps(ix, jx);
- dy = _mm_sub_ps(iy, jy);
- dz = _mm_sub_ps(iz, jz);
-
- /* load chain rule terms for j1-4 */
- f_gb = _mm_load_ps(dadx);
- dadx += 4;
- f_gb_ai = _mm_load_ps(dadx);
- dadx += 4;
-
- /* calculate scalar force */
- f_gb = _mm_mul_ps(f_gb, rbai);
- f_gb_ai = _mm_mul_ps(f_gb_ai, rbaj);
- f_gb = _mm_add_ps(f_gb, f_gb_ai);
-
- tx = _mm_mul_ps(f_gb, dx);
- ty = _mm_mul_ps(f_gb, dy);
- tz = _mm_mul_ps(f_gb, dz);
-
- fix = _mm_add_ps(fix, tx);
- fiy = _mm_add_ps(fiy, ty);
- fiz = _mm_add_ps(fiz, tz);
-
- if (offset == 1)
- {
- GMX_MM_DECREMENT_1RVEC_1POINTER_PS(f+j3A, tx, ty, tz);
- }
- else if (offset == 2)
- {
- GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(f+j3A, f+j3B, tx, ty, tz);
- }
- else
- {
- /* offset must be 3 */
- GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(f+j3A, f+j3B, f+j3C, tx, ty, tz);
- }
- }
-
- /* fix/fiy/fiz now contain four partial force terms, that all should be
- * added to the i particle forces and shift forces.
- */
- gmx_mm_update_iforce_1atom_ps(&fix, &fiy, &fiz, f+ii3, fshift+is3);
- }
-
- return 0;
-}
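
Both chain-rule kernels (the double-precision one earlier and this
single-precision one) first fold dvda[i], the derivative of the GB energy
with respect to the Born radius, into the per-atom prefactor rb[i]. Read
straight off the three loops at the top of the function, the forms are
(b = born->bRad[i]):

    rb[i] = 2*b*b*dvda[i]/ONE_4PI_EPS0;   /* Still */
    rb[i] = b*b*dvda[i];                  /* HCT   */
    rb[i] = b*b*born->drobc[i]*dvda[i];   /* OBC   */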
-
-
-#else
-/* keep compiler happy */
-int genborn_sse_dummy;
-
-#endif /* SSE intrinsics available */
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2008, The GROMACS development team.
- * Copyright (c) 2013,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#ifndef _genborn_sse_h
-#define _genborn_sse_h
-
-#include "gromacs/legacyheaders/typedefs.h"
-
-float
-calc_gb_chainrule_sse2_single(int natoms, t_nblist *nl, float *dadx, float *dvda,
- float *xd, float *f, float *fshift, float *shift_vec,
- int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md);
-
-int
-calc_gb_rad_still_sse2_single(t_commrec *cr, t_forcerec *fr, int natoms, gmx_localtop_t *top,
- float *x, t_nblist *nl, gmx_genborn_t *born);
-
-int
-calc_gb_rad_hct_obc_sse2_single(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
- float *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm);
-
-#endif /* _genborn_sse_h */