Clean up and remove unused SSE2 generalized Born code
author Erik Lindahl <erik@kth.se>
Tue, 7 Jul 2015 15:02:36 +0000 (17:02 +0200)
committer Gerrit Code Review <gerrit@gerrit.gromacs.org>
Wed, 8 Jul 2015 07:43:06 +0000 (09:43 +0200)
This code has been disabled for quite a while due to
a bug. Since we should move to Verlet-style kernels
anyway, there is no point in keeping these files around.

Change-Id: Idfd65ac2d0d9f304d548c97e4dbabbaf72df7a7b

src/gromacs/mdlib/genborn.c
src/gromacs/mdlib/genborn_allvsall_sse2_double.c [deleted file]
src/gromacs/mdlib/genborn_allvsall_sse2_double.h [deleted file]
src/gromacs/mdlib/genborn_allvsall_sse2_single.c [deleted file]
src/gromacs/mdlib/genborn_allvsall_sse2_single.h [deleted file]
src/gromacs/mdlib/genborn_sse2_double.c [deleted file]
src/gromacs/mdlib/genborn_sse2_double.h [deleted file]
src/gromacs/mdlib/genborn_sse2_single.c [deleted file]
src/gromacs/mdlib/genborn_sse2_single.h [deleted file]

index a4f34b8ad597615babffb9cc025b5d36dd3ed812..dcfee25ed22557193b6a02208b21eedc813b1c26 100644 (file)
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  * Copyright (c) 2001-2008, The GROMACS development team.
- * Copyright (c) 2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -51,6 +51,7 @@
 #include "gromacs/legacyheaders/types/commrec.h"
 #include "gromacs/math/units.h"
 #include "gromacs/math/vec.h"
+#include "gromacs/mdlib/genborn_allvsall.h"
 #include "gromacs/pbcutil/ishift.h"
 #include "gromacs/pbcutil/mshift.h"
 #include "gromacs/pbcutil/pbc.h"
 #include "gromacs/utility/gmxmpi.h"
 #include "gromacs/utility/smalloc.h"
 
-#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER
-#  ifdef GMX_DOUBLE
-#    include "gromacs/mdlib/genborn_allvsall_sse2_double.h"
-#    include "gromacs/mdlib/genborn_sse2_double.h"
-#  else
-#    include "gromacs/mdlib/genborn_allvsall_sse2_single.h"
-#    include "gromacs/mdlib/genborn_sse2_single.h"
-#  endif /* GMX_DOUBLE */
-#endif   /* SSE or AVX present */
-
-#include "gromacs/mdlib/genborn_allvsall.h"
-
-/*#define DISABLE_SSE*/
 
 typedef struct {
     int  shift;
@@ -978,43 +966,13 @@ int calc_gb_rad(t_commrec *cr, t_forcerec *fr, t_inputrec *ir, gmx_localtop_t *t
 
         if (ir->gb_algorithm == egbSTILL)
         {
-#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
-            if (fr->use_simd_kernels)
-            {
-#  ifdef GMX_DOUBLE
-                genborn_allvsall_calc_still_radii_sse2_double(fr, md, born, top, x[0], cr, &fr->AllvsAll_workgb);
-#  else
-                genborn_allvsall_calc_still_radii_sse2_single(fr, md, born, top, x[0], cr, &fr->AllvsAll_workgb);
-#  endif
-            }
-            else
-            {
-                genborn_allvsall_calc_still_radii(fr, md, born, top, x[0], cr, &fr->AllvsAll_workgb);
-            }
-#else
             genborn_allvsall_calc_still_radii(fr, md, born, top, x[0], &fr->AllvsAll_workgb);
-#endif
             /* 13 flops in outer loop, 47 flops in inner loop */
             inc_nrnb(nrnb, eNR_BORN_AVA_RADII_STILL, md->homenr*13+cnt*47);
         }
         else if (ir->gb_algorithm == egbHCT || ir->gb_algorithm == egbOBC)
         {
-#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
-            if (fr->use_simd_kernels)
-            {
-#  ifdef GMX_DOUBLE
-                genborn_allvsall_calc_hct_obc_radii_sse2_double(fr, md, born, ir->gb_algorithm, top, x[0], cr, &fr->AllvsAll_workgb);
-#  else
-                genborn_allvsall_calc_hct_obc_radii_sse2_single(fr, md, born, ir->gb_algorithm, top, x[0], cr, &fr->AllvsAll_workgb);
-#  endif
-            }
-            else
-            {
-                genborn_allvsall_calc_hct_obc_radii(fr, md, born, ir->gb_algorithm, top, x[0], cr, &fr->AllvsAll_workgb);
-            }
-#else
             genborn_allvsall_calc_hct_obc_radii(fr, md, born, ir->gb_algorithm, top, x[0], &fr->AllvsAll_workgb);
-#endif
             /* 24 flops in outer loop, 183 in inner */
             inc_nrnb(nrnb, eNR_BORN_AVA_RADII_HCT_OBC, md->homenr*24+cnt*183);
         }
@@ -1028,45 +986,6 @@ int calc_gb_rad(t_commrec *cr, t_forcerec *fr, t_inputrec *ir, gmx_localtop_t *t
     /* Switch for determining which algorithm to use for Born radii calculation */
 #ifdef GMX_DOUBLE
 
-#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
-    /* x86 or x86-64 with GCC inline assembly and/or SSE intrinsics */
-    switch (ir->gb_algorithm)
-    {
-        case egbSTILL:
-            if (fr->use_simd_kernels)
-            {
-                calc_gb_rad_still_sse2_double(cr, fr, born->nr, top, atype, x[0], nl, born);
-            }
-            else
-            {
-                calc_gb_rad_still(cr, fr, top, x, nl, born, md);
-            }
-            break;
-        case egbHCT:
-            if (fr->use_simd_kernels)
-            {
-                calc_gb_rad_hct_obc_sse2_double(cr, fr, born->nr, top, atype, x[0], nl, born, md, ir->gb_algorithm);
-            }
-            else
-            {
-                calc_gb_rad_hct(cr, fr, top, x, nl, born, md);
-            }
-            break;
-        case egbOBC:
-            if (fr->use_simd_kernels)
-            {
-                calc_gb_rad_hct_obc_sse2_double(cr, fr, born->nr, top, atype, x[0], nl, born, md, ir->gb_algorithm);
-            }
-            else
-            {
-                calc_gb_rad_obc(cr, fr, born->nr, top, x, nl, born, md);
-            }
-            break;
-
-        default:
-            gmx_fatal(FARGS, "Unknown double precision sse-enabled algorithm for Born radii calculation: %d", ir->gb_algorithm);
-    }
-#else
     switch (ir->gb_algorithm)
     {
         case egbSTILL:
@@ -1083,51 +1002,8 @@ int calc_gb_rad(t_commrec *cr, t_forcerec *fr, t_inputrec *ir, gmx_localtop_t *t
             gmx_fatal(FARGS, "Unknown double precision algorithm for Born radii calculation: %d", ir->gb_algorithm);
     }
 
-#endif
-
 #else
 
-#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
-    /* x86 or x86-64 with GCC inline assembly and/or SSE intrinsics */
-    switch (ir->gb_algorithm)
-    {
-        case egbSTILL:
-            if (fr->use_simd_kernels)
-            {
-                calc_gb_rad_still_sse2_single(cr, fr, born->nr, top, x[0], nl, born);
-            }
-            else
-            {
-                calc_gb_rad_still(cr, fr, top, x, nl, born, md);
-            }
-            break;
-        case egbHCT:
-            if (fr->use_simd_kernels)
-            {
-                calc_gb_rad_hct_obc_sse2_single(cr, fr, born->nr, top, x[0], nl, born, md, ir->gb_algorithm);
-            }
-            else
-            {
-                calc_gb_rad_hct(cr, fr, top, x, nl, born, md);
-            }
-            break;
-
-        case egbOBC:
-            if (fr->use_simd_kernels)
-            {
-                calc_gb_rad_hct_obc_sse2_single(cr, fr, born->nr, top, x[0], nl, born, md, ir->gb_algorithm);
-            }
-            else
-            {
-                calc_gb_rad_obc(cr, fr, born->nr, top, x, nl, born, md);
-            }
-            break;
-
-        default:
-            gmx_fatal(FARGS, "Unknown sse-enabled algorithm for Born radii calculation: %d", ir->gb_algorithm);
-    }
-
-#else
     switch (ir->gb_algorithm)
     {
         case egbSTILL:
@@ -1144,8 +1020,6 @@ int calc_gb_rad(t_commrec *cr, t_forcerec *fr, t_inputrec *ir, gmx_localtop_t *t
             gmx_fatal(FARGS, "Unknown algorithm for Born radii calculation: %d", ir->gb_algorithm);
     }
 
-#endif /* Single precision sse */
-
 #endif /* Double or single precision */
 
     if (fr->bAllvsAll == FALSE)
@@ -1530,48 +1404,15 @@ calc_gb_forces(t_commrec *cr, t_mdatoms *md, gmx_genborn_t *born, gmx_localtop_t
 
     if (fr->bAllvsAll)
     {
-#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
-        if (fr->use_simd_kernels)
-        {
-#  ifdef GMX_DOUBLE
-            genborn_allvsall_calc_chainrule_sse2_double(fr, md, born, x[0], f[0], gb_algorithm, fr->AllvsAll_workgb);
-#  else
-            genborn_allvsall_calc_chainrule_sse2_single(fr, md, born, x[0], f[0], gb_algorithm, fr->AllvsAll_workgb);
-#  endif
-        }
-        else
-        {
-            genborn_allvsall_calc_chainrule(fr, md, born, x[0], f[0], gb_algorithm, fr->AllvsAll_workgb);
-        }
-#else
         genborn_allvsall_calc_chainrule(fr, md, born, x[0], f[0], gb_algorithm, fr->AllvsAll_workgb);
-#endif
         cnt = md->homenr*(md->nr/2+1);
         /* 9 flops for outer loop, 15 for inner */
         inc_nrnb(nrnb, eNR_BORN_AVA_CHAINRULE, md->homenr*9+cnt*15);
         return;
     }
 
-#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
-    if (fr->use_simd_kernels)
-    {
-#  ifdef GMX_DOUBLE
-        calc_gb_chainrule_sse2_double(fr->natoms_force, &(fr->gblist), fr->dadx, fr->dvda, x[0],
-                                      f[0], fr->fshift[0], fr->shift_vec[0], gb_algorithm, born, md);
-#  else
-        calc_gb_chainrule_sse2_single(fr->natoms_force, &(fr->gblist), fr->dadx, fr->dvda, x[0],
-                                      f[0], fr->fshift[0], fr->shift_vec[0], gb_algorithm, born, md);
-#  endif
-    }
-    else
-    {
-        calc_gb_chainrule(fr->natoms_force, &(fr->gblist), fr->dadx, fr->dvda,
-                          x, f, fr->fshift, fr->shift_vec, gb_algorithm, born, md);
-    }
-#else
     calc_gb_chainrule(fr->natoms_force, &(fr->gblist), fr->dadx, fr->dvda,
                       x, f, fr->fshift, fr->shift_vec, gb_algorithm, born);
-#endif
 
     if (!fr->bAllvsAll)
     {
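
With the #if 0 SSE2 branches gone, the Born-radii dispatch in calc_gb_rad()
reduces to the plain-C switch retained above. The case bodies are elided from
the diff context, but they match the non-SIMD fallback calls visible in the
deleted branches; a minimal sketch of the resulting shape (variable names
taken from calc_gb_rad()'s arguments, surrounding declarations assumed):

    /* Sketch of the plain-C Born-radii dispatch left after this cleanup;
     * reconstructed from the deleted non-SIMD fallback branches above,
     * not copied from the retained (elided) context lines. */
    switch (ir->gb_algorithm)
    {
        case egbSTILL:
            calc_gb_rad_still(cr, fr, top, x, nl, born, md);
            break;
        case egbHCT:
            calc_gb_rad_hct(cr, fr, top, x, nl, born, md);
            break;
        case egbOBC:
            calc_gb_rad_obc(cr, fr, born->nr, top, x, nl, born, md);
            break;
        default:
            gmx_fatal(FARGS, "Unknown algorithm for Born radii calculation: %d",
                      ir->gb_algorithm);
    }

The same flattening applies to the force path: calc_gb_chainrule() is now
called unconditionally, with the t_mdatoms argument dropped as in the
retained call above.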
diff --git a/src/gromacs/mdlib/genborn_allvsall_sse2_double.c b/src/gromacs/mdlib/genborn_allvsall_sse2_double.c
deleted file mode 100644 (file)
index 5847525..0000000
+++ /dev/null
@@ -1,2506 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2009, The GROMACS Development Team.
- * Copyright (c) 2012,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#include "gmxpre.h"
-
-#include <math.h>
-
-#include "gromacs/legacyheaders/genborn.h"
-#include "gromacs/legacyheaders/network.h"
-#include "gromacs/legacyheaders/types/simple.h"
-#include "gromacs/math/units.h"
-#include "gromacs/math/vec.h"
-#include "gromacs/mdlib/genborn_allvsall.h"
-#include "gromacs/utility/smalloc.h"
-
-
-#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
-
-#include <gmx_sse2_double.h>
-
-
-#define SIMD_WIDTH 2
-#define UNROLLI    2
-#define UNROLLJ    2
-
-
-
-
-
-
-
-
-
-typedef struct
-{
-    int   *      jindex_gb;
-    int   **     prologue_mask_gb;
-    int   **     epilogue_mask;
-    int   *      imask;
-    double *     gb_radius;
-    double *     workparam;
-    double *     work;
-    double *     x_align;
-    double *     y_align;
-    double *     z_align;
-    double *     fx_align;
-    double *     fy_align;
-    double *     fz_align;
-}
-gmx_allvsallgb2_data_t;
-
-
-static int
-calc_maxoffset(int i, int natoms)
-{
-    int maxoffset;
-
-    if ((natoms % 2) == 1)
-    {
-        /* Odd number of atoms, easy */
-        maxoffset = natoms/2;
-    }
-    else if ((natoms % 4) == 0)
-    {
-        /* Multiple of four is hard */
-        if (i < natoms/2)
-        {
-            if ((i % 2) == 0)
-            {
-                maxoffset = natoms/2;
-            }
-            else
-            {
-                maxoffset = natoms/2-1;
-            }
-        }
-        else
-        {
-            if ((i % 2) == 1)
-            {
-                maxoffset = natoms/2;
-            }
-            else
-            {
-                maxoffset = natoms/2-1;
-            }
-        }
-    }
-    else
-    {
-        /* natoms/2 = odd */
-        if ((i % 2) == 0)
-        {
-            maxoffset = natoms/2;
-        }
-        else
-        {
-            maxoffset = natoms/2-1;
-        }
-    }
-
-    return maxoffset;
-}
-
-static void
-setup_gb_exclusions_and_indices(gmx_allvsallgb2_data_t     *   aadata,
-                                t_ilist     *                  ilist,
-                                int                            start,
-                                int                            end,
-                                int                            natoms,
-                                gmx_bool                       bInclude12,
-                                gmx_bool                       bInclude13,
-                                gmx_bool                       bInclude14)
-{
-    int   i, j, k, tp;
-    int   a1, a2;
-    int   ni0, ni1, nj0, nj1, nj;
-    int   imin, imax, iexcl;
-    int   max_offset;
-    int   max_excl_offset;
-    int   firstinteraction;
-    int   ibase;
-    int  *pi;
-
-    /* This routine can appear to be a bit complex, but it is mostly book-keeping.
-     * To enable the fast all-vs-all kernel we need to be able to stream through all coordinates
-     * whether they should interact or not.
-     *
-     * To avoid looping over the exclusions, we create a simple mask that is 1 if the interaction
-     * should be present, otherwise 0. Since exclusions typically only occur when i & j are close,
-     * we create a jindex array with three elements per i atom: the starting point, the point to
-     * which we need to check exclusions, and the end point.
-     * This way we only have to allocate a short exclusion mask per i atom.
-     */
-
-    ni0 = (start/UNROLLI)*UNROLLI;
-    ni1 = ((end+UNROLLI-1)/UNROLLI)*UNROLLI;
-
-    /* Set the interaction mask to only enable the i atoms we want to include */
-    snew(pi, 2*(natoms+UNROLLI+2*SIMD_WIDTH));
-    aadata->imask = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
-    for (i = 0; i < natoms+UNROLLI; i++)
-    {
-        aadata->imask[2*i]   = (i >= start && i < end) ? 0xFFFFFFFF : 0;
-        aadata->imask[2*i+1] = (i >= start && i < end) ? 0xFFFFFFFF : 0;
-    }
-
-    /* Allocate memory for our modified jindex array */
-    snew(aadata->jindex_gb, 4*(natoms+UNROLLI));
-    for (i = 0; i < 4*(natoms+UNROLLI); i++)
-    {
-        aadata->jindex_gb[i] = 0;
-    }
-
-    /* Create the exclusion masks for the prologue part */
-    snew(aadata->prologue_mask_gb, natoms+UNROLLI); /* list of pointers */
-
-    /* First zero everything to avoid uninitialized data */
-    for (i = 0; i < natoms+UNROLLI; i++)
-    {
-        aadata->prologue_mask_gb[i] = NULL;
-    }
-
-    /* Calculate the largest exclusion range we need for each UNROLLI-tuplet of i atoms. */
-    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
-    {
-        max_excl_offset = -1;
-
-        /* First find maxoffset for the next 4 atoms (or fewer if we are close to end) */
-        imax = ((ibase+UNROLLI) < end) ? (ibase+UNROLLI) : end;
-
-        /* Which atom is the first we (might) interact with? */
-        imin = natoms; /* Guaranteed to be overwritten by one of 'firstinteraction' */
-        for (i = ibase; i < imax; i++)
-        {
-            /* Before exclusions, which atom is the first we (might) interact with? */
-            firstinteraction = i+1;
-            max_offset       = calc_maxoffset(i, natoms);
-
-            if (!bInclude12)
-            {
-                for (j = 0; j < ilist[F_GB12].nr; j += 3)
-                {
-                    a1 = ilist[F_GB12].iatoms[j+1];
-                    a2 = ilist[F_GB12].iatoms[j+2];
-
-                    if (a1 == i)
-                    {
-                        k = a2;
-                    }
-                    else if (a2 == i)
-                    {
-                        k = a1;
-                    }
-                    else
-                    {
-                        continue;
-                    }
-
-                    if (k == firstinteraction)
-                    {
-                        firstinteraction++;
-                    }
-                }
-            }
-            if (!bInclude13)
-            {
-                for (j = 0; j < ilist[F_GB13].nr; j += 3)
-                {
-                    a1 = ilist[F_GB13].iatoms[j+1];
-                    a2 = ilist[F_GB13].iatoms[j+2];
-
-                    if (a1 == i)
-                    {
-                        k = a2;
-                    }
-                    else if (a2 == i)
-                    {
-                        k = a1;
-                    }
-                    else
-                    {
-                        continue;
-                    }
-
-                    if (k == firstinteraction)
-                    {
-                        firstinteraction++;
-                    }
-                }
-            }
-            if (!bInclude14)
-            {
-                for (j = 0; j < ilist[F_GB14].nr; j += 3)
-                {
-                    a1 = ilist[F_GB14].iatoms[j+1];
-                    a2 = ilist[F_GB14].iatoms[j+2];
-                    if (a1 == i)
-                    {
-                        k = a2;
-                    }
-                    else if (a2 == i)
-                    {
-                        k = a1;
-                    }
-                    else
-                    {
-                        continue;
-                    }
-
-                    if (k == firstinteraction)
-                    {
-                        firstinteraction++;
-                    }
-                }
-            }
-            imin = (firstinteraction < imin) ? firstinteraction : imin;
-        }
-        /* round down to j unrolling factor */
-        imin = (imin/UNROLLJ)*UNROLLJ;
-
-        for (i = ibase; i < imax; i++)
-        {
-            max_offset = calc_maxoffset(i, natoms);
-
-            if (!bInclude12)
-            {
-                for (j = 0; j < ilist[F_GB12].nr; j += 3)
-                {
-                    a1 = ilist[F_GB12].iatoms[j+1];
-                    a2 = ilist[F_GB12].iatoms[j+2];
-
-                    if (a1 == i)
-                    {
-                        k = a2;
-                    }
-                    else if (a2 == i)
-                    {
-                        k = a1;
-                    }
-                    else
-                    {
-                        continue;
-                    }
-
-                    if (k < imin)
-                    {
-                        k += natoms;
-                    }
-
-                    if (k > i+max_offset)
-                    {
-                        continue;
-                    }
-
-                    k = k - imin;
-
-                    if (k+natoms <= max_offset)
-                    {
-                        k += natoms;
-                    }
-                    max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
-                }
-            }
-            if (!bInclude13)
-            {
-                for (j = 0; j < ilist[F_GB13].nr; j += 3)
-                {
-                    a1 = ilist[F_GB13].iatoms[j+1];
-                    a2 = ilist[F_GB13].iatoms[j+2];
-
-                    if (a1 == i)
-                    {
-                        k = a2;
-                    }
-                    else if (a2 == i)
-                    {
-                        k = a1;
-                    }
-                    else
-                    {
-                        continue;
-                    }
-
-                    if (k < imin)
-                    {
-                        k += natoms;
-                    }
-
-                    if (k > i+max_offset)
-                    {
-                        continue;
-                    }
-
-                    k = k - imin;
-
-                    if (k+natoms <= max_offset)
-                    {
-                        k += natoms;
-                    }
-                    max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
-                }
-            }
-            if (!bInclude14)
-            {
-                for (j = 0; j < ilist[F_GB14].nr; j += 3)
-                {
-                    a1 = ilist[F_GB14].iatoms[j+1];
-                    a2 = ilist[F_GB14].iatoms[j+2];
-
-                    if (a1 == i)
-                    {
-                        k = a2;
-                    }
-                    else if (a2 == i)
-                    {
-                        k = a1;
-                    }
-                    else
-                    {
-                        continue;
-                    }
-
-                    if (k < imin)
-                    {
-                        k += natoms;
-                    }
-
-                    if (k > i+max_offset)
-                    {
-                        continue;
-                    }
-
-                    k = k - imin;
-
-                    if (k+natoms <= max_offset)
-                    {
-                        k += natoms;
-                    }
-                    max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
-                }
-            }
-        }
-
-        /* The offset specifies the last atom to be excluded, so add one unit to get an upper loop limit */
-        max_excl_offset++;
-        /* round up to j unrolling factor */
-        max_excl_offset = (max_excl_offset/UNROLLJ+1)*UNROLLJ;
-
-        /* Set all the prologue masks length to this value (even for i>end) */
-        for (i = ibase; i < ibase+UNROLLI; i++)
-        {
-            aadata->jindex_gb[4*i]   = imin;
-            aadata->jindex_gb[4*i+1] = imin+max_excl_offset;
-        }
-    }
-
-    /* Now the hard part, loop over it all again to calculate the actual contents of the prologue masks */
-    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
-    {
-        for (i = ibase; i < ibase+UNROLLI; i++)
-        {
-            nj   = aadata->jindex_gb[4*i+1] - aadata->jindex_gb[4*i];
-            imin = aadata->jindex_gb[4*i];
-
-            /* Allocate aligned memory */
-            snew(pi, 2*(nj+2*SIMD_WIDTH));
-            aadata->prologue_mask_gb[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
-
-            max_offset = calc_maxoffset(i, natoms);
-
-            /* Include interactions i+1 <= j < i+maxoffset */
-            for (k = 0; k < nj; k++)
-            {
-                j = imin + k;
-
-                if ( (j > i) && (j <= i+max_offset) )
-                {
-                    aadata->prologue_mask_gb[i][2*k]   = 0xFFFFFFFF;
-                    aadata->prologue_mask_gb[i][2*k+1] = 0xFFFFFFFF;
-                }
-                else
-                {
-                    aadata->prologue_mask_gb[i][2*k]   = 0;
-                    aadata->prologue_mask_gb[i][2*k+1] = 0;
-                }
-            }
-
-            /* Clear out the explicit exclusions */
-            if (i < end)
-            {
-                if (!bInclude12)
-                {
-                    for (j = 0; j < ilist[F_GB12].nr; j += 3)
-                    {
-                        a1 = ilist[F_GB12].iatoms[j+1];
-                        a2 = ilist[F_GB12].iatoms[j+2];
-
-                        if (a1 == i)
-                        {
-                            k = a2;
-                        }
-                        else if (a2 == i)
-                        {
-                            k = a1;
-                        }
-                        else
-                        {
-                            continue;
-                        }
-
-                        if (k > i+max_offset)
-                        {
-                            continue;
-                        }
-                        k = k-i;
-
-                        if (k+natoms <= max_offset)
-                        {
-                            k += natoms;
-                        }
-
-                        k = k+i-imin;
-                        if (k >= 0)
-                        {
-                            aadata->prologue_mask_gb[i][2*k]   = 0;
-                            aadata->prologue_mask_gb[i][2*k+1] = 0;
-                        }
-                    }
-                }
-                if (!bInclude13)
-                {
-                    for (j = 0; j < ilist[F_GB13].nr; j += 3)
-                    {
-                        a1 = ilist[F_GB13].iatoms[j+1];
-                        a2 = ilist[F_GB13].iatoms[j+2];
-
-                        if (a1 == i)
-                        {
-                            k = a2;
-                        }
-                        else if (a2 == i)
-                        {
-                            k = a1;
-                        }
-                        else
-                        {
-                            continue;
-                        }
-
-                        if (k > i+max_offset)
-                        {
-                            continue;
-                        }
-                        k = k-i;
-
-                        if (k+natoms <= max_offset)
-                        {
-                            k += natoms;
-                        }
-
-                        k = k+i-imin;
-                        if (k >= 0)
-                        {
-                            aadata->prologue_mask_gb[i][2*k]   = 0;
-                            aadata->prologue_mask_gb[i][2*k+1] = 0;
-                        }
-                    }
-                }
-                if (!bInclude14)
-                {
-                    for (j = 0; j < ilist[F_GB14].nr; j += 3)
-                    {
-                        a1 = ilist[F_GB14].iatoms[j+1];
-                        a2 = ilist[F_GB14].iatoms[j+2];
-
-                        if (a1 == i)
-                        {
-                            k = a2;
-                        }
-                        else if (a2 == i)
-                        {
-                            k = a1;
-                        }
-                        else
-                        {
-                            continue;
-                        }
-
-                        if (k > i+max_offset)
-                        {
-                            continue;
-                        }
-                        k = k-i;
-
-                        if (k+natoms <= max_offset)
-                        {
-                            k += natoms;
-                        }
-
-                        k = k+i-imin;
-                        if (k >= 0)
-                        {
-                            aadata->prologue_mask_gb[i][2*k]   = 0;
-                            aadata->prologue_mask_gb[i][2*k+1] = 0;
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-    /* Construct the epilogue mask - this just contains the check for maxoffset */
-    snew(aadata->epilogue_mask, natoms+UNROLLI);
-
-    /* First zero everything to avoid uninitialized data */
-    for (i = 0; i < natoms+UNROLLI; i++)
-    {
-        aadata->jindex_gb[4*i+2]    = aadata->jindex_gb[4*i+1];
-        aadata->jindex_gb[4*i+3]    = aadata->jindex_gb[4*i+1];
-        aadata->epilogue_mask[i]    = NULL;
-    }
-
-    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
-    {
-        /* Find the lowest index for which we need to use the epilogue */
-        imin       = ibase;
-        max_offset = calc_maxoffset(imin, natoms);
-
-        imin = imin + 1 + max_offset;
-
-        /* Find largest index for which we need to use the epilogue */
-        imax = ibase + UNROLLI-1;
-        imax = (imax < end) ? imax : end;
-
-        max_offset = calc_maxoffset(imax, natoms);
-        imax       = imax + 1 + max_offset + UNROLLJ - 1;
-
-        for (i = ibase; i < ibase+UNROLLI; i++)
-        {
-            /* Start of epilogue - round down to j tile limit */
-            aadata->jindex_gb[4*i+2] = (imin/UNROLLJ)*UNROLLJ;
-            /* Make sure we dont overlap - for small systems everything is done in the prologue */
-            aadata->jindex_gb[4*i+2] = (aadata->jindex_gb[4*i+1] > aadata->jindex_gb[4*i+2]) ? aadata->jindex_gb[4*i+1] : aadata->jindex_gb[4*i+2];
-            /* Round upwards to j tile limit */
-            aadata->jindex_gb[4*i+3] = (imax/UNROLLJ)*UNROLLJ;
-            /* Make sure we dont have a negative range for the epilogue */
-            aadata->jindex_gb[4*i+3] = (aadata->jindex_gb[4*i+2] > aadata->jindex_gb[4*i+3]) ? aadata->jindex_gb[4*i+2] : aadata->jindex_gb[4*i+3];
-        }
-    }
-
-    /* And fill it with data... */
-
-    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
-    {
-        for (i = ibase; i < ibase+UNROLLI; i++)
-        {
-
-            nj = aadata->jindex_gb[4*i+3] - aadata->jindex_gb[4*i+2];
-
-            /* Allocate aligned memory */
-            snew(pi, 2*(nj+2*SIMD_WIDTH));
-            aadata->epilogue_mask[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
-
-            max_offset = calc_maxoffset(i, natoms);
-
-            for (k = 0; k < nj; k++)
-            {
-                j = aadata->jindex_gb[4*i+2] + k;
-                aadata->epilogue_mask[i][2*k]   = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
-                aadata->epilogue_mask[i][2*k+1] = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
-            }
-        }
-    }
-}
-
-
-static void
-genborn_allvsall_setup(gmx_allvsallgb2_data_t     **  p_aadata,
-                       gmx_localtop_t     *           top,
-                       gmx_genborn_t     *            born,
-                       t_mdatoms     *                mdatoms,
-                       double                         radius_offset,
-                       int                            gb_algorithm,
-                       gmx_bool                       bInclude12,
-                       gmx_bool                       bInclude13,
-                       gmx_bool                       bInclude14)
-{
-    int                     i, j, idx;
-    int                     natoms;
-    gmx_allvsallgb2_data_t *aadata;
-    double                 *p;
-
-    natoms = mdatoms->nr;
-
-    snew(aadata, 1);
-    *p_aadata = aadata;
-
-    snew(p, 2*natoms+2*SIMD_WIDTH);
-    aadata->x_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
-    snew(p, 2*natoms+2*SIMD_WIDTH);
-    aadata->y_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
-    snew(p, 2*natoms+2*SIMD_WIDTH);
-    aadata->z_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
-    snew(p, 2*natoms+2*SIMD_WIDTH);
-    aadata->fx_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
-    snew(p, 2*natoms+2*SIMD_WIDTH);
-    aadata->fy_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
-    snew(p, 2*natoms+2*SIMD_WIDTH);
-    aadata->fz_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
-
-    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
-    aadata->gb_radius = (double *) (((size_t) p + 16) & (~((size_t) 15)));
-
-    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
-    aadata->workparam = (double *) (((size_t) p + 16) & (~((size_t) 15)));
-
-    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
-    aadata->work = (double *) (((size_t) p + 16) & (~((size_t) 15)));
-
-    for (i = 0; i < mdatoms->nr; i++)
-    {
-        aadata->gb_radius[i] = top->atomtypes.gb_radius[mdatoms->typeA[i]] - radius_offset;
-        if (gb_algorithm == egbSTILL)
-        {
-            aadata->workparam[i] = born->vsolv[i];
-        }
-        else if (gb_algorithm == egbOBC)
-        {
-            aadata->workparam[i] = born->param[i];
-        }
-        aadata->work[i]      = 0.0;
-    }
-    for (i = 0; i < mdatoms->nr; i++)
-    {
-        aadata->gb_radius[natoms+i] = aadata->gb_radius[i];
-        aadata->workparam[natoms+i] = aadata->workparam[i];
-        aadata->work[natoms+i]      = aadata->work[i];
-    }
-
-    for (i = 0; i < 2*natoms+SIMD_WIDTH; i++)
-    {
-        aadata->x_align[i]  = 0.0;
-        aadata->y_align[i]  = 0.0;
-        aadata->z_align[i]  = 0.0;
-        aadata->fx_align[i] = 0.0;
-        aadata->fy_align[i] = 0.0;
-        aadata->fz_align[i] = 0.0;
-    }
-
-    setup_gb_exclusions_and_indices(aadata, top->idef.il, 0, mdatoms->homenr, mdatoms->nr,
-                                    bInclude12, bInclude13, bInclude14);
-}
-
-
-/*
- * This routine apparently hits a compiler bug visual studio has had 'forever'.
- * It is present both in VS2005 and VS2008, and the only way around it is to
- * decrease optimization. We do that with at pragma, and only for MSVC, so it
- * will not hurt any of the well-behaving and supported compilers out there.
- * MS: Fix your compiler, it sucks like a black hole!
- */
-#ifdef _MSC_VER
-#pragma optimize("t",off)
-#endif
-
-int
-genborn_allvsall_calc_still_radii_sse2_double(t_forcerec   *           fr,
-                                              t_mdatoms   *            mdatoms,
-                                              gmx_genborn_t   *        born,
-                                              gmx_localtop_t   *       top,
-                                              double *                 x,
-                                              t_commrec   *            cr,
-                                              void   *                 paadata)
-{
-    gmx_allvsallgb2_data_t *aadata;
-    int                     natoms;
-    int                     ni0, ni1;
-    int                     nj0, nj1, nj2, nj3;
-    int                     i, j, k, n;
-    int              *      mask;
-    int              *      pmask0;
-    int              *      pmask1;
-    int              *      emask0;
-    int              *      emask1;
-    double                  ix, iy, iz;
-    double                  jx, jy, jz;
-    double                  dx, dy, dz;
-    double                  rsq, rinv;
-    double                  gpi, rai, vai;
-    double                  prod_ai;
-    double                  irsq, idr4, idr6;
-    double                  raj, rvdw, ratio;
-    double                  vaj, ccf, dccf, theta, cosq;
-    double                  term, prod, icf4, icf6, gpi2, factor, sinq;
-    double            *     gb_radius;
-    double            *     vsolv;
-    double            *     work;
-    double                  tmpsum[2];
-    double            *     x_align;
-    double            *     y_align;
-    double            *     z_align;
-    int              *      jindex;
-    double            *     dadx;
-
-    __m128d                 ix_SSE0, iy_SSE0, iz_SSE0;
-    __m128d                 ix_SSE1, iy_SSE1, iz_SSE1;
-    __m128d                 gpi_SSE0, rai_SSE0, prod_ai_SSE0;
-    __m128d                 gpi_SSE1, rai_SSE1, prod_ai_SSE1;
-    __m128d                 imask_SSE0, jmask_SSE0;
-    __m128d                 imask_SSE1, jmask_SSE1;
-    __m128d                 jx_SSE, jy_SSE, jz_SSE;
-    __m128d                 dx_SSE0, dy_SSE0, dz_SSE0;
-    __m128d                 dx_SSE1, dy_SSE1, dz_SSE1;
-    __m128d                 rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0;
-    __m128d                 rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1;
-    __m128d                 raj_SSE, vaj_SSE, prod_SSE;
-    __m128d                 rvdw_SSE0, ratio_SSE0;
-    __m128d                 rvdw_SSE1, ratio_SSE1;
-    __m128d                 theta_SSE0, sinq_SSE0, cosq_SSE0, term_SSE0;
-    __m128d                 theta_SSE1, sinq_SSE1, cosq_SSE1, term_SSE1;
-    __m128d                 ccf_SSE0, dccf_SSE0;
-    __m128d                 ccf_SSE1, dccf_SSE1;
-    __m128d                 icf4_SSE0, icf6_SSE0;
-    __m128d                 icf4_SSE1, icf6_SSE1;
-    __m128d                 half_SSE, one_SSE, two_SSE, four_SSE;
-    __m128d                 still_p4_SSE, still_p5inv_SSE, still_pip5_SSE;
-
-    natoms              = mdatoms->nr;
-    ni0                 = 0;
-    ni1                 = mdatoms->homenr;
-
-    n = 0;
-
-    aadata = *((gmx_allvsallgb2_data_t **)paadata);
-
-
-    if (aadata == NULL)
-    {
-        genborn_allvsall_setup(&aadata, top, born, mdatoms, 0.0,
-                               egbSTILL, FALSE, FALSE, TRUE);
-        *((gmx_allvsallgb2_data_t **)paadata) = aadata;
-    }
-
-    x_align = aadata->x_align;
-    y_align = aadata->y_align;
-    z_align = aadata->z_align;
-
-    gb_radius = aadata->gb_radius;
-    vsolv     = aadata->workparam;
-    work      = aadata->work;
-    jindex    = aadata->jindex_gb;
-    dadx      = fr->dadx;
-
-    still_p4_SSE    = _mm_set1_pd(STILL_P4);
-    still_p5inv_SSE = _mm_set1_pd(STILL_P5INV);
-    still_pip5_SSE  = _mm_set1_pd(STILL_PIP5);
-    half_SSE        = _mm_set1_pd(0.5);
-    one_SSE         = _mm_set1_pd(1.0);
-    two_SSE         = _mm_set1_pd(2.0);
-    four_SSE        = _mm_set1_pd(4.0);
-
-    /* This will be summed, so it has to extend to natoms + buffer */
-    for (i = 0; i < natoms+1+natoms/2; i++)
-    {
-        work[i] = 0;
-    }
-
-    for (i = ni0; i < ni1+1+natoms/2; i++)
-    {
-        k           = i%natoms;
-        x_align[i]  = x[3*k];
-        y_align[i]  = x[3*k+1];
-        z_align[i]  = x[3*k+2];
-        work[i]     = 0;
-    }
-
-    for (i = ni0; i < ni1; i += UNROLLI)
-    {
-        /* We assume shifts are NOT used for all-vs-all interactions */
-        /* Load i atom data */
-        ix_SSE0          = _mm_load1_pd(x_align+i);
-        iy_SSE0          = _mm_load1_pd(y_align+i);
-        iz_SSE0          = _mm_load1_pd(z_align+i);
-        ix_SSE1          = _mm_load1_pd(x_align+i+1);
-        iy_SSE1          = _mm_load1_pd(y_align+i+1);
-        iz_SSE1          = _mm_load1_pd(z_align+i+1);
-
-        gpi_SSE0         = _mm_setzero_pd();
-        gpi_SSE1         = _mm_setzero_pd();
-
-        rai_SSE0         = _mm_load1_pd(gb_radius+i);
-        rai_SSE1         = _mm_load1_pd(gb_radius+i+1);
-
-        prod_ai_SSE0     = _mm_set1_pd(STILL_P4*vsolv[i]);
-        prod_ai_SSE1     = _mm_set1_pd(STILL_P4*vsolv[i+1]);
-
-        /* Load limits for loop over neighbors */
-        nj0              = jindex[4*i];
-        nj1              = jindex[4*i+1];
-        nj2              = jindex[4*i+2];
-        nj3              = jindex[4*i+3];
-
-        pmask0           = aadata->prologue_mask_gb[i];
-        pmask1           = aadata->prologue_mask_gb[i+1];
-        emask0           = aadata->epilogue_mask[i];
-        emask1           = aadata->epilogue_mask[i+1];
-
-        imask_SSE0        = _mm_load1_pd((double *)(aadata->imask+2*i));
-        imask_SSE1        = _mm_load1_pd((double *)(aadata->imask+2*i+2));
-
-        /* Prologue part, including exclusion mask */
-        for (j = nj0; j < nj1; j += UNROLLJ)
-        {
-            jmask_SSE0 = _mm_load_pd((double *)pmask0);
-            jmask_SSE1 = _mm_load_pd((double *)pmask1);
-            pmask0    += 2*UNROLLJ;
-            pmask1    += 2*UNROLLJ;
-
-            /* load j atom coordinates */
-            jx_SSE            = _mm_load_pd(x_align+j);
-            jy_SSE            = _mm_load_pd(y_align+j);
-            jz_SSE            = _mm_load_pd(z_align+j);
-
-            /* Calculate distance */
-            dx_SSE0            = _mm_sub_pd(ix_SSE0, jx_SSE);
-            dy_SSE0            = _mm_sub_pd(iy_SSE0, jy_SSE);
-            dz_SSE0            = _mm_sub_pd(iz_SSE0, jz_SSE);
-            dx_SSE1            = _mm_sub_pd(ix_SSE1, jx_SSE);
-            dy_SSE1            = _mm_sub_pd(iy_SSE1, jy_SSE);
-            dz_SSE1            = _mm_sub_pd(iz_SSE1, jz_SSE);
-
-            /* rsq = dx*dx+dy*dy+dz*dz */
-            rsq_SSE0           = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
-            rsq_SSE1           = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);
-
-            /* Combine masks */
-            jmask_SSE0         = _mm_and_pd(jmask_SSE0, imask_SSE0);
-            jmask_SSE1         = _mm_and_pd(jmask_SSE1, imask_SSE1);
-
-            /* Calculate 1/r and 1/r2 */
-            rinv_SSE0          = gmx_mm_invsqrt_pd(rsq_SSE0);
-            rinv_SSE1          = gmx_mm_invsqrt_pd(rsq_SSE1);
-
-            /* Apply mask */
-            rinv_SSE0          = _mm_and_pd(rinv_SSE0, jmask_SSE0);
-            rinv_SSE1          = _mm_and_pd(rinv_SSE1, jmask_SSE1);
-
-            irsq_SSE0          = _mm_mul_pd(rinv_SSE0, rinv_SSE0);
-            irsq_SSE1          = _mm_mul_pd(rinv_SSE1, rinv_SSE1);
-            idr4_SSE0          = _mm_mul_pd(irsq_SSE0, irsq_SSE0);
-            idr4_SSE1          = _mm_mul_pd(irsq_SSE1, irsq_SSE1);
-            idr6_SSE0          = _mm_mul_pd(idr4_SSE0, irsq_SSE0);
-            idr6_SSE1          = _mm_mul_pd(idr4_SSE1, irsq_SSE1);
-
-            raj_SSE            = _mm_load_pd(gb_radius+j);
-            vaj_SSE            = _mm_load_pd(vsolv+j);
-
-            rvdw_SSE0          = _mm_add_pd(rai_SSE0, raj_SSE);
-            rvdw_SSE1          = _mm_add_pd(rai_SSE1, raj_SSE);
-
-            ratio_SSE0         = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0, rvdw_SSE0)));
-            ratio_SSE1         = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1, rvdw_SSE1)));
-
-            ratio_SSE0         = _mm_min_pd(ratio_SSE0, still_p5inv_SSE);
-            ratio_SSE1         = _mm_min_pd(ratio_SSE1, still_p5inv_SSE);
-            theta_SSE0         = _mm_mul_pd(ratio_SSE0, still_pip5_SSE);
-            theta_SSE1         = _mm_mul_pd(ratio_SSE1, still_pip5_SSE);
-            gmx_mm_sincos_pd(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
-            gmx_mm_sincos_pd(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
-            term_SSE0          = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE0));
-            term_SSE1          = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE1));
-            ccf_SSE0           = _mm_mul_pd(term_SSE0, term_SSE0);
-            ccf_SSE1           = _mm_mul_pd(term_SSE1, term_SSE1);
-            dccf_SSE0          = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE0),
-                                            _mm_mul_pd(sinq_SSE0, theta_SSE0));
-            dccf_SSE1          = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE1),
-                                            _mm_mul_pd(sinq_SSE1, theta_SSE1));
-
-            prod_SSE           = _mm_mul_pd(still_p4_SSE, vaj_SSE);
-            icf4_SSE0          = _mm_mul_pd(ccf_SSE0, idr4_SSE0);
-            icf4_SSE1          = _mm_mul_pd(ccf_SSE1, idr4_SSE1);
-            icf6_SSE0          = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
-            icf6_SSE1          = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);
-
-            _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
-                                            _mm_add_pd(_mm_mul_pd(prod_ai_SSE0, icf4_SSE0),
-                                                       _mm_mul_pd(prod_ai_SSE1, icf4_SSE1))));
-
-
-            gpi_SSE0           = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE, icf4_SSE0));
-            gpi_SSE1           = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE, icf4_SSE1));
-
-            /* Save ai->aj and aj->ai chain rule terms */
-            _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE0));
-            dadx += 2;
-            _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE1));
-            dadx += 2;
-
-            _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE0, icf6_SSE0));
-            dadx += 2;
-            _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE1, icf6_SSE1));
-            dadx += 2;
-        }
-
-        /* Main part, no exclusions */
-        for (j = nj1; j < nj2; j += UNROLLJ)
-        {
-
-            /* load j atom coordinates */
-            jx_SSE            = _mm_load_pd(x_align+j);
-            jy_SSE            = _mm_load_pd(y_align+j);
-            jz_SSE            = _mm_load_pd(z_align+j);
-
-            /* Calculate distance */
-            dx_SSE0            = _mm_sub_pd(ix_SSE0, jx_SSE);
-            dy_SSE0            = _mm_sub_pd(iy_SSE0, jy_SSE);
-            dz_SSE0            = _mm_sub_pd(iz_SSE0, jz_SSE);
-            dx_SSE1            = _mm_sub_pd(ix_SSE1, jx_SSE);
-            dy_SSE1            = _mm_sub_pd(iy_SSE1, jy_SSE);
-            dz_SSE1            = _mm_sub_pd(iz_SSE1, jz_SSE);
-
-            /* rsq = dx*dx+dy*dy+dz*dz */
-            rsq_SSE0           = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
-            rsq_SSE1           = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);
-
-            /* Calculate 1/r and 1/r2 */
-            rinv_SSE0          = gmx_mm_invsqrt_pd(rsq_SSE0);
-            rinv_SSE1          = gmx_mm_invsqrt_pd(rsq_SSE1);
-
-            /* Apply mask */
-            rinv_SSE0          = _mm_and_pd(rinv_SSE0, imask_SSE0);
-            rinv_SSE1          = _mm_and_pd(rinv_SSE1, imask_SSE1);
-
-            irsq_SSE0          = _mm_mul_pd(rinv_SSE0, rinv_SSE0);
-            irsq_SSE1          = _mm_mul_pd(rinv_SSE1, rinv_SSE1);
-            idr4_SSE0          = _mm_mul_pd(irsq_SSE0, irsq_SSE0);
-            idr4_SSE1          = _mm_mul_pd(irsq_SSE1, irsq_SSE1);
-            idr6_SSE0          = _mm_mul_pd(idr4_SSE0, irsq_SSE0);
-            idr6_SSE1          = _mm_mul_pd(idr4_SSE1, irsq_SSE1);
-
-            raj_SSE            = _mm_load_pd(gb_radius+j);
-
-            rvdw_SSE0          = _mm_add_pd(rai_SSE0, raj_SSE);
-            rvdw_SSE1          = _mm_add_pd(rai_SSE1, raj_SSE);
-            vaj_SSE            = _mm_load_pd(vsolv+j);
-
-            ratio_SSE0         = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0, rvdw_SSE0)));
-            ratio_SSE1         = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1, rvdw_SSE1)));
-
-            ratio_SSE0         = _mm_min_pd(ratio_SSE0, still_p5inv_SSE);
-            ratio_SSE1         = _mm_min_pd(ratio_SSE1, still_p5inv_SSE);
-            theta_SSE0         = _mm_mul_pd(ratio_SSE0, still_pip5_SSE);
-            theta_SSE1         = _mm_mul_pd(ratio_SSE1, still_pip5_SSE);
-            gmx_mm_sincos_pd(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
-            gmx_mm_sincos_pd(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
-            term_SSE0          = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE0));
-            term_SSE1          = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE1));
-            ccf_SSE0           = _mm_mul_pd(term_SSE0, term_SSE0);
-            ccf_SSE1           = _mm_mul_pd(term_SSE1, term_SSE1);
-            dccf_SSE0          = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE0),
-                                            _mm_mul_pd(sinq_SSE0, theta_SSE0));
-            dccf_SSE1          = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE1),
-                                            _mm_mul_pd(sinq_SSE1, theta_SSE1));
-
-            prod_SSE           = _mm_mul_pd(still_p4_SSE, vaj_SSE );
-            icf4_SSE0          = _mm_mul_pd(ccf_SSE0, idr4_SSE0);
-            icf4_SSE1          = _mm_mul_pd(ccf_SSE1, idr4_SSE1);
-            icf6_SSE0          = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
-            icf6_SSE1          = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);
-
-            _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
-                                            _mm_add_pd(_mm_mul_pd(prod_ai_SSE0, icf4_SSE0),
-                                                       _mm_mul_pd(prod_ai_SSE1, icf4_SSE1))));
-
-            gpi_SSE0           = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE, icf4_SSE0));
-            gpi_SSE1           = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE, icf4_SSE1));
-
-            /* Save ai->aj and aj->ai chain rule terms */
-            _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE0));
-            dadx += 2;
-            _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE1));
-            dadx += 2;
-
-            _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE0, icf6_SSE0));
-            dadx += 2;
-            _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE1, icf6_SSE1));
-            dadx += 2;
-        }
-        /* Epilogue part, including exclusion mask */
-        for (j = nj2; j < nj3; j += UNROLLJ)
-        {
-            jmask_SSE0 = _mm_load_pd((double *)emask0);
-            jmask_SSE1 = _mm_load_pd((double *)emask1);
-            emask0    += 2*UNROLLJ;
-            emask1    += 2*UNROLLJ;
-
-            /* load j atom coordinates */
-            jx_SSE            = _mm_load_pd(x_align+j);
-            jy_SSE            = _mm_load_pd(y_align+j);
-            jz_SSE            = _mm_load_pd(z_align+j);
-
-            /* Calculate distance */
-            dx_SSE0            = _mm_sub_pd(ix_SSE0, jx_SSE);
-            dy_SSE0            = _mm_sub_pd(iy_SSE0, jy_SSE);
-            dz_SSE0            = _mm_sub_pd(iz_SSE0, jz_SSE);
-            dx_SSE1            = _mm_sub_pd(ix_SSE1, jx_SSE);
-            dy_SSE1            = _mm_sub_pd(iy_SSE1, jy_SSE);
-            dz_SSE1            = _mm_sub_pd(iz_SSE1, jz_SSE);
-
-            /* rsq = dx*dx+dy*dy+dz*dz */
-            rsq_SSE0           = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
-            rsq_SSE1           = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);
-
-            /* Combine masks */
-            jmask_SSE0         = _mm_and_pd(jmask_SSE0, imask_SSE0);
-            jmask_SSE1         = _mm_and_pd(jmask_SSE1, imask_SSE1);
-
-            /* Calculate 1/r and 1/r2 */
-            rinv_SSE0          = gmx_mm_invsqrt_pd(rsq_SSE0);
-            rinv_SSE1          = gmx_mm_invsqrt_pd(rsq_SSE1);
-
-            /* Apply mask */
-            rinv_SSE0          = _mm_and_pd(rinv_SSE0, jmask_SSE0);
-            rinv_SSE1          = _mm_and_pd(rinv_SSE1, jmask_SSE1);
-
-            irsq_SSE0          = _mm_mul_pd(rinv_SSE0, rinv_SSE0);
-            irsq_SSE1          = _mm_mul_pd(rinv_SSE1, rinv_SSE1);
-            idr4_SSE0          = _mm_mul_pd(irsq_SSE0, irsq_SSE0);
-            idr4_SSE1          = _mm_mul_pd(irsq_SSE1, irsq_SSE1);
-            idr6_SSE0          = _mm_mul_pd(idr4_SSE0, irsq_SSE0);
-            idr6_SSE1          = _mm_mul_pd(idr4_SSE1, irsq_SSE1);
-
-            raj_SSE            = _mm_load_pd(gb_radius+j);
-            vaj_SSE            = _mm_load_pd(vsolv+j);
-
-            rvdw_SSE0          = _mm_add_pd(rai_SSE0, raj_SSE);
-            rvdw_SSE1          = _mm_add_pd(rai_SSE1, raj_SSE);
-
-            ratio_SSE0         = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0, rvdw_SSE0)));
-            ratio_SSE1         = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1, rvdw_SSE1)));
-
-            ratio_SSE0         = _mm_min_pd(ratio_SSE0, still_p5inv_SSE);
-            ratio_SSE1         = _mm_min_pd(ratio_SSE1, still_p5inv_SSE);
-            theta_SSE0         = _mm_mul_pd(ratio_SSE0, still_pip5_SSE);
-            theta_SSE1         = _mm_mul_pd(ratio_SSE1, still_pip5_SSE);
-            gmx_mm_sincos_pd(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
-            gmx_mm_sincos_pd(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
-            term_SSE0          = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE0));
-            term_SSE1          = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE1));
-            ccf_SSE0           = _mm_mul_pd(term_SSE0, term_SSE0);
-            ccf_SSE1           = _mm_mul_pd(term_SSE1, term_SSE1);
-            dccf_SSE0          = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE0),
-                                            _mm_mul_pd(sinq_SSE0, theta_SSE0));
-            dccf_SSE1          = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE1),
-                                            _mm_mul_pd(sinq_SSE1, theta_SSE1));
-
-            prod_SSE           = _mm_mul_pd(still_p4_SSE, vaj_SSE);
-            icf4_SSE0          = _mm_mul_pd(ccf_SSE0, idr4_SSE0);
-            icf4_SSE1          = _mm_mul_pd(ccf_SSE1, idr4_SSE1);
-            icf6_SSE0          = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
-            icf6_SSE1          = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);
-
-            _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
-                                            _mm_add_pd(_mm_mul_pd(prod_ai_SSE0, icf4_SSE0),
-                                                       _mm_mul_pd(prod_ai_SSE1, icf4_SSE1))));
-
-            gpi_SSE0           = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE, icf4_SSE0));
-            gpi_SSE1           = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE, icf4_SSE1));
-
-            /* Save ai->aj and aj->ai chain rule terms */
-            _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE0));
-            dadx += 2;
-            _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE1));
-            dadx += 2;
-
-            _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE0, icf6_SSE0));
-            dadx += 2;
-            _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE1, icf6_SSE1));
-            dadx += 2;
-        }
-        GMX_MM_TRANSPOSE2_PD(gpi_SSE0, gpi_SSE1);
-        gpi_SSE0 = _mm_add_pd(gpi_SSE0, gpi_SSE1);
-        _mm_store_pd(work+i, _mm_add_pd(gpi_SSE0, _mm_load_pd(work+i)));
-    }
-
-    /* In case we have written anything beyond natoms, move it back.
-     * Never mind that we leave stuff above natoms; that will not
-     * be accessed later in the routine.
-     * In principle this should be a move rather than sum, but this
-     * way we dont have to worry about even/odd offsets...
-     */
-    for (i = natoms; i < ni1+1+natoms/2; i++)
-    {
-        work[i-natoms] += work[i];
-    }
-
-    /* Parallel summations would go here if ever implemented with DD */
-
-    factor  = 0.5 * ONE_4PI_EPS0;
-    /* Calculate the radii - should we do all atoms, or just our local ones? */
-    for (i = 0; i < natoms; i++)
-    {
-        if (born->use[i] != 0)
-        {
-            gpi             = born->gpol[i]+work[i];
-            gpi2            = gpi * gpi;
-            born->bRad[i]   = factor*gmx_invsqrt(gpi2);
-            fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
-        }
-    }
-
-    return 0;
-}
-/* Reinstate MSVC optimization */
-#ifdef _MSC_VER
-#pragma optimize("",on)
-#endif
-
-
-int
-genborn_allvsall_calc_hct_obc_radii_sse2_double(t_forcerec   *           fr,
-                                                t_mdatoms   *            mdatoms,
-                                                gmx_genborn_t   *        born,
-                                                int                      gb_algorithm,
-                                                gmx_localtop_t   *       top,
-                                                double *                 x,
-                                                t_commrec   *            cr,
-                                                void   *                 paadata)
-{
-    gmx_allvsallgb2_data_t *aadata;
-    int                     natoms;
-    int                     ni0, ni1;
-    int                     nj0, nj1, nj2, nj3;
-    int                     i, j, k, n;
-    int              *      mask;
-    int              *      pmask0;
-    int              *      pmask1;
-    int              *      emask0;
-    int              *      emask1;
-    double            *     gb_radius;
-    double            *     vsolv;
-    double            *     work;
-    double                  tmpsum[2];
-    double            *     x_align;
-    double            *     y_align;
-    double            *     z_align;
-    int              *      jindex;
-    double            *     dadx;
-    double            *     obc_param;
-    double                  rad, min_rad;
-    double                  rai, rai_inv, rai_inv2, sum_ai, sum_ai2, sum_ai3, tsum, tchain;
-
-    __m128d                 ix_SSE0, iy_SSE0, iz_SSE0;
-    __m128d                 ix_SSE1, iy_SSE1, iz_SSE1;
-    __m128d                 gpi_SSE0, rai_SSE0, prod_ai_SSE0;
-    __m128d                 gpi_SSE1, rai_SSE1, prod_ai_SSE1;
-    __m128d                 imask_SSE0, jmask_SSE0;
-    __m128d                 imask_SSE1, jmask_SSE1;
-    __m128d                 jx_SSE, jy_SSE, jz_SSE;
-    __m128d                 dx_SSE0, dy_SSE0, dz_SSE0;
-    __m128d                 dx_SSE1, dy_SSE1, dz_SSE1;
-    __m128d                 rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0;
-    __m128d                 rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1;
-    __m128d                 raj_SSE, raj_inv_SSE, sk_aj_SSE, sk2_aj_SSE;
-    __m128d                 ccf_SSE0, dccf_SSE0, prod_SSE0;
-    __m128d                 ccf_SSE1, dccf_SSE1, prod_SSE1;
-    __m128d                 icf4_SSE0, icf6_SSE0;
-    __m128d                 icf4_SSE1, icf6_SSE1;
-    __m128d                 oneeighth_SSE, onefourth_SSE, half_SSE, one_SSE, two_SSE, four_SSE;
-    __m128d                 still_p4_SSE, still_p5inv_SSE, still_pip5_SSE;
-    __m128d                 rai_inv_SSE0;
-    __m128d                 rai_inv_SSE1;
-    __m128d                 sk_ai_SSE0, sk2_ai_SSE0, sum_ai_SSE0;
-    __m128d                 sk_ai_SSE1, sk2_ai_SSE1, sum_ai_SSE1;
-    __m128d                 lij_inv_SSE0, sk2_rinv_SSE0;
-    __m128d                 lij_inv_SSE1, sk2_rinv_SSE1;
-    __m128d                 dr_SSE0;
-    __m128d                 dr_SSE1;
-    __m128d                 t1_SSE0, t2_SSE0, t3_SSE0, t4_SSE0;
-    __m128d                 t1_SSE1, t2_SSE1, t3_SSE1, t4_SSE1;
-    __m128d                 obc_mask1_SSE0, obc_mask2_SSE0, obc_mask3_SSE0;
-    __m128d                 obc_mask1_SSE1, obc_mask2_SSE1, obc_mask3_SSE1;
-    __m128d                 uij_SSE0, uij2_SSE0, uij3_SSE0;
-    __m128d                 uij_SSE1, uij2_SSE1, uij3_SSE1;
-    __m128d                 lij_SSE0, lij2_SSE0, lij3_SSE0;
-    __m128d                 lij_SSE1, lij2_SSE1, lij3_SSE1;
-    __m128d                 dlij_SSE0, diff2_SSE0, logterm_SSE0;
-    __m128d                 dlij_SSE1, diff2_SSE1, logterm_SSE1;
-    __m128d                 doffset_SSE, tmpSSE;
-
-    natoms              = mdatoms->nr;
-    ni0                 = 0;
-    ni1                 = mdatoms->homenr;
-
-    n = 0;
-
-    aadata = *((gmx_allvsallgb2_data_t **)paadata);
-
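-    /* First call: build the aligned all-vs-all work data (aligned
-     * coordinate/parameter buffers, exclusion masks) and cache it in
-     * the caller-owned pointer for reuse on subsequent steps.
-     */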
-    if (aadata == NULL)
-    {
-        genborn_allvsall_setup(&aadata, top, born, mdatoms, born->gb_doffset,
-                               egbOBC, TRUE, TRUE, TRUE);
-        *((gmx_allvsallgb2_data_t **)paadata) = aadata;
-    }
-
-    x_align = aadata->x_align;
-    y_align = aadata->y_align;
-    z_align = aadata->z_align;
-
-    gb_radius = aadata->gb_radius;
-    work      = aadata->work;
-    jindex    = aadata->jindex_gb;
-    dadx      = fr->dadx;
-    obc_param = aadata->workparam;
-
-    oneeighth_SSE   = _mm_set1_pd(0.125);
-    onefourth_SSE   = _mm_set1_pd(0.25);
-    half_SSE        = _mm_set1_pd(0.5);
-    one_SSE         = _mm_set1_pd(1.0);
-    two_SSE         = _mm_set1_pd(2.0);
-    four_SSE        = _mm_set1_pd(4.0);
-    doffset_SSE     = _mm_set1_pd(born->gb_doffset);
-
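-    /* Convert interleaved xyz coordinates into separate aligned
-     * arrays so j atoms can be read with aligned SSE loads.
-     */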
-    for (i = 0; i < natoms; i++)
-    {
-        x_align[i]  = x[3*i];
-        y_align[i]  = x[3*i+1];
-        z_align[i]  = x[3*i+2];
-    }
-
-    /* Replicate the first natoms/2+1 coordinates past the end, so the
-     * wrap-around j loop can index beyond natoms without bounds checks.
-     */
-    for (i = 0; i < natoms/2+1; i++)
-    {
-        x_align[natoms+i]  = x_align[i];
-        y_align[natoms+i]  = y_align[i];
-        z_align[natoms+i]  = z_align[i];
-    }
-
-    for (i = 0; i < natoms+natoms/2+1; i++)
-    {
-        work[i] = 0;
-    }
-
-    for (i = ni0; i < ni1; i += UNROLLI)
-    {
-        /* We assume shifts are NOT used for all-vs-all interactions */
-
-        /* Load i atom data */
-        ix_SSE0          = _mm_load1_pd(x_align+i);
-        iy_SSE0          = _mm_load1_pd(y_align+i);
-        iz_SSE0          = _mm_load1_pd(z_align+i);
-        ix_SSE1          = _mm_load1_pd(x_align+i+1);
-        iy_SSE1          = _mm_load1_pd(y_align+i+1);
-        iz_SSE1          = _mm_load1_pd(z_align+i+1);
-
-        rai_SSE0         = _mm_load1_pd(gb_radius+i);
-        rai_SSE1         = _mm_load1_pd(gb_radius+i+1);
-        rai_inv_SSE0     = gmx_mm_inv_pd(rai_SSE0);
-        rai_inv_SSE1     = gmx_mm_inv_pd(rai_SSE1);
-
-        sk_ai_SSE0       = _mm_load1_pd(obc_param+i);
-        sk_ai_SSE1       = _mm_load1_pd(obc_param+i+1);
-        sk2_ai_SSE0      = _mm_mul_pd(sk_ai_SSE0, sk_ai_SSE0);
-        sk2_ai_SSE1      = _mm_mul_pd(sk_ai_SSE1, sk_ai_SSE1);
-
-        sum_ai_SSE0      = _mm_setzero_pd();
-        sum_ai_SSE1      = _mm_setzero_pd();
-
-        /* Load limits for loop over neighbors */
-        nj0              = jindex[4*i];
-        nj1              = jindex[4*i+1];
-        nj2              = jindex[4*i+2];
-        nj3              = jindex[4*i+3];
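-        /* jindex splits the j range in three: a prologue (nj0..nj1)
-         * and an epilogue (nj2..nj3) where exclusion masks apply,
-         * and an unmasked main part (nj1..nj2) in between.
-         */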
-
-        pmask0           = aadata->prologue_mask_gb[i];
-        pmask1           = aadata->prologue_mask_gb[i+1];
-        emask0           = aadata->epilogue_mask[i];
-        emask1           = aadata->epilogue_mask[i+1];
-
-        imask_SSE0        = _mm_load1_pd((double *)(aadata->imask+2*i));
-        imask_SSE1        = _mm_load1_pd((double *)(aadata->imask+2*i+2));
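-        /* Masks are stored as 64-bit all-ones/all-zeros patterns so
-         * they can be applied directly with _mm_and_pd. */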
-
-        /* Prologue part, including exclusion mask */
-        for (j = nj0; j < nj1; j += UNROLLJ)
-        {
-            jmask_SSE0 = _mm_load_pd((double *)pmask0);
-            jmask_SSE1 = _mm_load_pd((double *)pmask1);
-            pmask0    += 2*UNROLLJ;
-            pmask1    += 2*UNROLLJ;
-
-            /* load j atom coordinates */
-            jx_SSE            = _mm_load_pd(x_align+j);
-            jy_SSE            = _mm_load_pd(y_align+j);
-            jz_SSE            = _mm_load_pd(z_align+j);
-
-            /* Calculate distance */
-            dx_SSE0            = _mm_sub_pd(ix_SSE0, jx_SSE);
-            dy_SSE0            = _mm_sub_pd(iy_SSE0, jy_SSE);
-            dz_SSE0            = _mm_sub_pd(iz_SSE0, jz_SSE);
-            dx_SSE1            = _mm_sub_pd(ix_SSE1, jx_SSE);
-            dy_SSE1            = _mm_sub_pd(iy_SSE1, jy_SSE);
-            dz_SSE1            = _mm_sub_pd(iz_SSE1, jz_SSE);
-
-            /* rsq = dx*dx+dy*dy+dz*dz */
-            rsq_SSE0           = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
-            rsq_SSE1           = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);
-
-            /* Combine masks */
-            jmask_SSE0         = _mm_and_pd(jmask_SSE0, imask_SSE0);
-            jmask_SSE1         = _mm_and_pd(jmask_SSE1, imask_SSE1);
-
-            /* Calculate 1/r */
-            rinv_SSE0          = gmx_mm_invsqrt_pd(rsq_SSE0);
-            rinv_SSE1          = gmx_mm_invsqrt_pd(rsq_SSE1);
-
-            /* Apply mask */
-            rinv_SSE0          = _mm_and_pd(rinv_SSE0, jmask_SSE0);
-            rinv_SSE1          = _mm_and_pd(rinv_SSE1, jmask_SSE1);
-
-            dr_SSE0            = _mm_mul_pd(rsq_SSE0, rinv_SSE0);
-            dr_SSE1            = _mm_mul_pd(rsq_SSE1, rinv_SSE1);
-
-            sk_aj_SSE          = _mm_load_pd(obc_param+j);
-            raj_SSE            = _mm_load_pd(gb_radius+j);
-            raj_inv_SSE        = gmx_mm_inv_pd(raj_SSE);
-
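-            /* The lij/uij/t1..t4 algebra below is the pairwise HCT
-             * descreening integral: with L = 1/lij and U = 1/uij it
-             * evaluates 0.5*(1/L - 1/U + r/4*(1/U^2 - 1/L^2)
-             * + 1/(2r)*ln(L/U) + sk^2/(4r)*(1/L^2 - 1/U^2)), plus a
-             * 2*(1/rai - lij) correction (obc_mask3) when atom i lies
-             * inside the descreening sphere of atom j.
-             */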
-            /* Evaluate influence of atom aj -> ai */
-            t1_SSE0            = _mm_add_pd(dr_SSE0, sk_aj_SSE);
-            t1_SSE1            = _mm_add_pd(dr_SSE1, sk_aj_SSE);
-            t2_SSE0            = _mm_sub_pd(dr_SSE0, sk_aj_SSE);
-            t2_SSE1            = _mm_sub_pd(dr_SSE1, sk_aj_SSE);
-            t3_SSE0            = _mm_sub_pd(sk_aj_SSE, dr_SSE0);
-            t3_SSE1            = _mm_sub_pd(sk_aj_SSE, dr_SSE1);
-
-            obc_mask1_SSE0     = _mm_cmplt_pd(rai_SSE0, t1_SSE0);
-            obc_mask1_SSE1     = _mm_cmplt_pd(rai_SSE1, t1_SSE1);
-            obc_mask2_SSE0     = _mm_cmplt_pd(rai_SSE0, t2_SSE0);
-            obc_mask2_SSE1     = _mm_cmplt_pd(rai_SSE1, t2_SSE1);
-            obc_mask3_SSE0     = _mm_cmplt_pd(rai_SSE0, t3_SSE0);
-            obc_mask3_SSE1     = _mm_cmplt_pd(rai_SSE1, t3_SSE1);
-            obc_mask1_SSE0     = _mm_and_pd(obc_mask1_SSE0, jmask_SSE0);
-            obc_mask1_SSE1     = _mm_and_pd(obc_mask1_SSE1, jmask_SSE1);
-
-            uij_SSE0           = gmx_mm_inv_pd(t1_SSE0);
-            uij_SSE1           = gmx_mm_inv_pd(t1_SSE1);
-            lij_SSE0           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
-                                              _mm_andnot_pd(obc_mask2_SSE0, rai_inv_SSE0));
-            lij_SSE1           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
-                                              _mm_andnot_pd(obc_mask2_SSE1, rai_inv_SSE1));
-            dlij_SSE0          = _mm_and_pd(one_SSE, obc_mask2_SSE0);
-            dlij_SSE1          = _mm_and_pd(one_SSE, obc_mask2_SSE1);
-
-            uij2_SSE0          = _mm_mul_pd(uij_SSE0, uij_SSE0);
-            uij2_SSE1          = _mm_mul_pd(uij_SSE1, uij_SSE1);
-            uij3_SSE0          = _mm_mul_pd(uij2_SSE0, uij_SSE0);
-            uij3_SSE1          = _mm_mul_pd(uij2_SSE1, uij_SSE1);
-            lij2_SSE0          = _mm_mul_pd(lij_SSE0, lij_SSE0);
-            lij2_SSE1          = _mm_mul_pd(lij_SSE1, lij_SSE1);
-            lij3_SSE0          = _mm_mul_pd(lij2_SSE0, lij_SSE0);
-            lij3_SSE1          = _mm_mul_pd(lij2_SSE1, lij_SSE1);
-
-            diff2_SSE0         = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
-            diff2_SSE1         = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
-            lij_inv_SSE0       = gmx_mm_invsqrt_pd(lij2_SSE0);
-            lij_inv_SSE1       = gmx_mm_invsqrt_pd(lij2_SSE1);
-            sk2_aj_SSE         = _mm_mul_pd(sk_aj_SSE, sk_aj_SSE);
-            sk2_rinv_SSE0      = _mm_mul_pd(sk2_aj_SSE, rinv_SSE0);
-            sk2_rinv_SSE1      = _mm_mul_pd(sk2_aj_SSE, rinv_SSE1);
-            prod_SSE0          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
-            prod_SSE1          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);
-
-            logterm_SSE0       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
-            logterm_SSE1       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
-
-            t1_SSE0            = _mm_sub_pd(lij_SSE0, uij_SSE0);
-            t1_SSE1            = _mm_sub_pd(lij_SSE1, uij_SSE1);
-            t2_SSE0            = _mm_mul_pd(diff2_SSE0,
-                                            _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
-                                                       prod_SSE0));
-            t2_SSE1            = _mm_mul_pd(diff2_SSE1,
-                                            _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
-                                                       prod_SSE1));
-
-            t3_SSE0            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
-            t3_SSE1            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
-            t1_SSE0            = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
-            t1_SSE1            = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
-            t4_SSE0            = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE0, lij_SSE0));
-            t4_SSE1            = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE1, lij_SSE1));
-            t4_SSE0            = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
-            t4_SSE1            = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
-            t1_SSE0            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
-            t1_SSE1            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));
-
-            sum_ai_SSE0        = _mm_add_pd(sum_ai_SSE0, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
-            sum_ai_SSE1        = _mm_add_pd(sum_ai_SSE1, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
-
-            t1_SSE0            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
-                                            _mm_mul_pd(prod_SSE0, lij3_SSE0));
-            t1_SSE1            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
-                                            _mm_mul_pd(prod_SSE1, lij3_SSE1));
-            t1_SSE0            = _mm_sub_pd(t1_SSE0,
-                                            _mm_mul_pd(onefourth_SSE,
-                                                       _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
-                                                                  _mm_mul_pd(lij3_SSE0, dr_SSE0))));
-            t1_SSE1            = _mm_sub_pd(t1_SSE1,
-                                            _mm_mul_pd(onefourth_SSE,
-                                                       _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
-                                                                  _mm_mul_pd(lij3_SSE1, dr_SSE1))));
-
-            t2_SSE0            = _mm_mul_pd(onefourth_SSE,
-                                            _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
-                                                       _mm_mul_pd(uij3_SSE0, dr_SSE0)));
-            t2_SSE1            = _mm_mul_pd(onefourth_SSE,
-                                            _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
-                                                       _mm_mul_pd(uij3_SSE1, dr_SSE1)));
-            t2_SSE0            = _mm_sub_pd(t2_SSE0,
-                                            _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
-                                                       _mm_mul_pd(prod_SSE0, uij3_SSE0)));
-            t2_SSE1            = _mm_sub_pd(t2_SSE1,
-                                            _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
-                                                       _mm_mul_pd(prod_SSE1, uij3_SSE1)));
-            t3_SSE0            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
-                                            _mm_mul_pd(rinv_SSE0, rinv_SSE0));
-            t3_SSE1            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
-                                            _mm_mul_pd(rinv_SSE1, rinv_SSE1));
-            t3_SSE0            = _mm_sub_pd(t3_SSE0,
-                                            _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
-                                                       _mm_add_pd(one_SSE,
-                                                                  _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
-            t3_SSE1            = _mm_sub_pd(t3_SSE1,
-                                            _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
-                                                       _mm_add_pd(one_SSE,
-                                                                  _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));
-
-            t1_SSE0            = _mm_mul_pd(rinv_SSE0,
-                                            _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
-                                                       _mm_add_pd(t2_SSE0, t3_SSE0)));
-            t1_SSE1            = _mm_mul_pd(rinv_SSE1,
-                                            _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
-                                                       _mm_add_pd(t2_SSE1, t3_SSE1)));
-
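-            /* Store the aj->ai derivative for the chain-rule (force) pass */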
-            _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
-            dadx += 2;
-            _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
-            dadx += 2;
-
-            /* Evaluate influence of atom ai -> aj */
-            t1_SSE0            = _mm_add_pd(dr_SSE0, sk_ai_SSE0);
-            t1_SSE1            = _mm_add_pd(dr_SSE1, sk_ai_SSE1);
-            t2_SSE0            = _mm_sub_pd(dr_SSE0, sk_ai_SSE0);
-            t2_SSE1            = _mm_sub_pd(dr_SSE1, sk_ai_SSE1);
-            t3_SSE0            = _mm_sub_pd(sk_ai_SSE0, dr_SSE0);
-            t3_SSE1            = _mm_sub_pd(sk_ai_SSE1, dr_SSE1);
-
-            obc_mask1_SSE0     = _mm_cmplt_pd(raj_SSE, t1_SSE0);
-            obc_mask1_SSE1     = _mm_cmplt_pd(raj_SSE, t1_SSE1);
-            obc_mask2_SSE0     = _mm_cmplt_pd(raj_SSE, t2_SSE0);
-            obc_mask2_SSE1     = _mm_cmplt_pd(raj_SSE, t2_SSE1);
-            obc_mask3_SSE0     = _mm_cmplt_pd(raj_SSE, t3_SSE0);
-            obc_mask3_SSE1     = _mm_cmplt_pd(raj_SSE, t3_SSE1);
-            obc_mask1_SSE0     = _mm_and_pd(obc_mask1_SSE0, jmask_SSE0);
-            obc_mask1_SSE1     = _mm_and_pd(obc_mask1_SSE1, jmask_SSE1);
-
-            uij_SSE0           = gmx_mm_inv_pd(t1_SSE0);
-            uij_SSE1           = gmx_mm_inv_pd(t1_SSE1);
-            lij_SSE0           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
-                                              _mm_andnot_pd(obc_mask2_SSE0, raj_inv_SSE));
-            lij_SSE1           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
-                                              _mm_andnot_pd(obc_mask2_SSE1, raj_inv_SSE));
-            dlij_SSE0          = _mm_and_pd(one_SSE, obc_mask2_SSE0);
-            dlij_SSE1          = _mm_and_pd(one_SSE, obc_mask2_SSE1);
-
-            uij2_SSE0          = _mm_mul_pd(uij_SSE0, uij_SSE0);
-            uij2_SSE1          = _mm_mul_pd(uij_SSE1, uij_SSE1);
-            uij3_SSE0          = _mm_mul_pd(uij2_SSE0, uij_SSE0);
-            uij3_SSE1          = _mm_mul_pd(uij2_SSE1, uij_SSE1);
-            lij2_SSE0          = _mm_mul_pd(lij_SSE0, lij_SSE0);
-            lij2_SSE1          = _mm_mul_pd(lij_SSE1, lij_SSE1);
-            lij3_SSE0          = _mm_mul_pd(lij2_SSE0, lij_SSE0);
-            lij3_SSE1          = _mm_mul_pd(lij2_SSE1, lij_SSE1);
-
-            diff2_SSE0         = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
-            diff2_SSE1         = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
-            lij_inv_SSE0       = gmx_mm_invsqrt_pd(lij2_SSE0);
-            lij_inv_SSE1       = gmx_mm_invsqrt_pd(lij2_SSE1);
-            sk2_rinv_SSE0      = _mm_mul_pd(sk2_ai_SSE0, rinv_SSE0);
-            sk2_rinv_SSE1      = _mm_mul_pd(sk2_ai_SSE1, rinv_SSE1);
-            prod_SSE0          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
-            prod_SSE1          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);
-
-            logterm_SSE0       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
-            logterm_SSE1       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
-            t1_SSE0            = _mm_sub_pd(lij_SSE0, uij_SSE0);
-            t1_SSE1            = _mm_sub_pd(lij_SSE1, uij_SSE1);
-            t2_SSE0            = _mm_mul_pd(diff2_SSE0,
-                                            _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
-                                                       prod_SSE0));
-            t2_SSE1            = _mm_mul_pd(diff2_SSE1,
-                                            _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
-                                                       prod_SSE1));
-            t3_SSE0            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
-            t3_SSE1            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
-            t1_SSE0            = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
-            t1_SSE1            = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
-            t4_SSE0            = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE0));
-            t4_SSE1            = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE1));
-            t4_SSE0            = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
-            t4_SSE1            = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
-            t1_SSE0            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
-            t1_SSE1            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));
-
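-            /* Accumulate the symmetric ai->aj term into atom j's Born sum */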
-            _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
-                                            _mm_add_pd(_mm_and_pd(t1_SSE0, obc_mask1_SSE0),
-                                                       _mm_and_pd(t1_SSE1, obc_mask1_SSE1))));
-
-            t1_SSE0            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
-                                            _mm_mul_pd(prod_SSE0, lij3_SSE0));
-            t1_SSE1            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
-                                            _mm_mul_pd(prod_SSE1, lij3_SSE1));
-            t1_SSE0            = _mm_sub_pd(t1_SSE0,
-                                            _mm_mul_pd(onefourth_SSE,
-                                                       _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
-                                                                  _mm_mul_pd(lij3_SSE0, dr_SSE0))));
-            t1_SSE1            = _mm_sub_pd(t1_SSE1,
-                                            _mm_mul_pd(onefourth_SSE,
-                                                       _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
-                                                                  _mm_mul_pd(lij3_SSE1, dr_SSE1))));
-            t2_SSE0            = _mm_mul_pd(onefourth_SSE,
-                                            _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
-                                                       _mm_mul_pd(uij3_SSE0, dr_SSE0)));
-            t2_SSE1            = _mm_mul_pd(onefourth_SSE,
-                                            _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
-                                                       _mm_mul_pd(uij3_SSE1, dr_SSE1)));
-            t2_SSE0            = _mm_sub_pd(t2_SSE0,
-                                            _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
-                                                       _mm_mul_pd(prod_SSE0, uij3_SSE0)));
-            t2_SSE1            = _mm_sub_pd(t2_SSE1,
-                                            _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
-                                                       _mm_mul_pd(prod_SSE1, uij3_SSE1)));
-
-            t3_SSE0            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
-                                            _mm_mul_pd(rinv_SSE0, rinv_SSE0));
-            t3_SSE1            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
-                                            _mm_mul_pd(rinv_SSE1, rinv_SSE1));
-
-            t3_SSE0            = _mm_sub_pd(t3_SSE0,
-                                            _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
-                                                       _mm_add_pd(one_SSE,
-                                                                  _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
-            t3_SSE1            = _mm_sub_pd(t3_SSE1,
-                                            _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
-                                                       _mm_add_pd(one_SSE,
-                                                                  _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));
-
-            t1_SSE0            = _mm_mul_pd(rinv_SSE0,
-                                            _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
-                                                       _mm_add_pd(t2_SSE0, t3_SSE0)));
-            t1_SSE1            = _mm_mul_pd(rinv_SSE1,
-                                            _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
-                                                       _mm_add_pd(t2_SSE1, t3_SSE1)));
-
-            _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
-            dadx += 2;
-            _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
-            dadx += 2;
-        }
-
-        /* Main part, no exclusions */
-        for (j = nj1; j < nj2; j += UNROLLJ)
-        {
-            /* load j atom coordinates */
-            jx_SSE            = _mm_load_pd(x_align+j);
-            jy_SSE            = _mm_load_pd(y_align+j);
-            jz_SSE            = _mm_load_pd(z_align+j);
-
-            /* Calculate distance */
-            dx_SSE0            = _mm_sub_pd(ix_SSE0, jx_SSE);
-            dy_SSE0            = _mm_sub_pd(iy_SSE0, jy_SSE);
-            dz_SSE0            = _mm_sub_pd(iz_SSE0, jz_SSE);
-            dx_SSE1            = _mm_sub_pd(ix_SSE1, jx_SSE);
-            dy_SSE1            = _mm_sub_pd(iy_SSE1, jy_SSE);
-            dz_SSE1            = _mm_sub_pd(iz_SSE1, jz_SSE);
-
-            /* rsq = dx*dx+dy*dy+dz*dz */
-            rsq_SSE0           = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
-            rsq_SSE1           = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);
-
-            /* Calculate 1/r */
-            rinv_SSE0          = gmx_mm_invsqrt_pd(rsq_SSE0);
-            rinv_SSE1          = gmx_mm_invsqrt_pd(rsq_SSE1);
-
-            /* Apply mask */
-            rinv_SSE0          = _mm_and_pd(rinv_SSE0, imask_SSE0);
-            rinv_SSE1          = _mm_and_pd(rinv_SSE1, imask_SSE1);
-
-            dr_SSE0            = _mm_mul_pd(rsq_SSE0, rinv_SSE0);
-            dr_SSE1            = _mm_mul_pd(rsq_SSE1, rinv_SSE1);
-
-            sk_aj_SSE          = _mm_load_pd(obc_param+j);
-            raj_SSE            = _mm_load_pd(gb_radius+j);
-
-            raj_inv_SSE        = gmx_mm_inv_pd(raj_SSE);
-
-            /* Evaluate influence of atom aj -> ai */
-            t1_SSE0            = _mm_add_pd(dr_SSE0, sk_aj_SSE);
-            t1_SSE1            = _mm_add_pd(dr_SSE1, sk_aj_SSE);
-            t2_SSE0            = _mm_sub_pd(dr_SSE0, sk_aj_SSE);
-            t2_SSE1            = _mm_sub_pd(dr_SSE1, sk_aj_SSE);
-            t3_SSE0            = _mm_sub_pd(sk_aj_SSE, dr_SSE0);
-            t3_SSE1            = _mm_sub_pd(sk_aj_SSE, dr_SSE1);
-
-            obc_mask1_SSE0     = _mm_cmplt_pd(rai_SSE0, t1_SSE0);
-            obc_mask1_SSE1     = _mm_cmplt_pd(rai_SSE1, t1_SSE1);
-            obc_mask2_SSE0     = _mm_cmplt_pd(rai_SSE0, t2_SSE0);
-            obc_mask2_SSE1     = _mm_cmplt_pd(rai_SSE1, t2_SSE1);
-            obc_mask3_SSE0     = _mm_cmplt_pd(rai_SSE0, t3_SSE0);
-            obc_mask3_SSE1     = _mm_cmplt_pd(rai_SSE1, t3_SSE1);
-            obc_mask1_SSE0     = _mm_and_pd(obc_mask1_SSE0, imask_SSE0);
-            obc_mask1_SSE1     = _mm_and_pd(obc_mask1_SSE1, imask_SSE1);
-
-            uij_SSE0           = gmx_mm_inv_pd(t1_SSE0);
-            uij_SSE1           = gmx_mm_inv_pd(t1_SSE1);
-            lij_SSE0           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
-                                              _mm_andnot_pd(obc_mask2_SSE0, rai_inv_SSE0));
-            lij_SSE1           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
-                                              _mm_andnot_pd(obc_mask2_SSE1, rai_inv_SSE1));
-            dlij_SSE0          = _mm_and_pd(one_SSE, obc_mask2_SSE0);
-            dlij_SSE1          = _mm_and_pd(one_SSE, obc_mask2_SSE1);
-
-            uij2_SSE0          = _mm_mul_pd(uij_SSE0, uij_SSE0);
-            uij2_SSE1          = _mm_mul_pd(uij_SSE1, uij_SSE1);
-            uij3_SSE0          = _mm_mul_pd(uij2_SSE0, uij_SSE0);
-            uij3_SSE1          = _mm_mul_pd(uij2_SSE1, uij_SSE1);
-            lij2_SSE0          = _mm_mul_pd(lij_SSE0, lij_SSE0);
-            lij2_SSE1          = _mm_mul_pd(lij_SSE1, lij_SSE1);
-            lij3_SSE0          = _mm_mul_pd(lij2_SSE0, lij_SSE0);
-            lij3_SSE1          = _mm_mul_pd(lij2_SSE1, lij_SSE1);
-
-            diff2_SSE0         = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
-            diff2_SSE1         = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
-            lij_inv_SSE0       = gmx_mm_invsqrt_pd(lij2_SSE0);
-            lij_inv_SSE1       = gmx_mm_invsqrt_pd(lij2_SSE1);
-            sk2_aj_SSE         = _mm_mul_pd(sk_aj_SSE, sk_aj_SSE);
-            sk2_rinv_SSE0      = _mm_mul_pd(sk2_aj_SSE, rinv_SSE0);
-            sk2_rinv_SSE1      = _mm_mul_pd(sk2_aj_SSE, rinv_SSE1);
-            prod_SSE0          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
-            prod_SSE1          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);
-
-            logterm_SSE0       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
-            logterm_SSE1       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
-
-            t1_SSE0            = _mm_sub_pd(lij_SSE0, uij_SSE0);
-            t1_SSE1            = _mm_sub_pd(lij_SSE1, uij_SSE1);
-            t2_SSE0            = _mm_mul_pd(diff2_SSE0,
-                                            _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
-                                                       prod_SSE0));
-            t2_SSE1            = _mm_mul_pd(diff2_SSE1,
-                                            _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
-                                                       prod_SSE1));
-
-            t3_SSE0            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
-            t3_SSE1            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
-            t1_SSE0            = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
-            t1_SSE1            = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
-            t4_SSE0            = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE0, lij_SSE0));
-            t4_SSE1            = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE1, lij_SSE1));
-            t4_SSE0            = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
-            t4_SSE1            = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
-            t1_SSE0            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
-            t1_SSE1            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));
-
-            sum_ai_SSE0        = _mm_add_pd(sum_ai_SSE0, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
-            sum_ai_SSE1        = _mm_add_pd(sum_ai_SSE1, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
-
-            t1_SSE0            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
-                                            _mm_mul_pd(prod_SSE0, lij3_SSE0));
-            t1_SSE1            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
-                                            _mm_mul_pd(prod_SSE1, lij3_SSE1));
-
-            t1_SSE0            = _mm_sub_pd(t1_SSE0,
-                                            _mm_mul_pd(onefourth_SSE,
-                                                       _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
-                                                                  _mm_mul_pd(lij3_SSE0, dr_SSE0))));
-            t1_SSE1            = _mm_sub_pd(t1_SSE1,
-                                            _mm_mul_pd(onefourth_SSE,
-                                                       _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
-                                                                  _mm_mul_pd(lij3_SSE1, dr_SSE1))));
-
-            t2_SSE0            = _mm_mul_pd(onefourth_SSE,
-                                            _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
-                                                       _mm_mul_pd(uij3_SSE0, dr_SSE0)));
-            t2_SSE1            = _mm_mul_pd(onefourth_SSE,
-                                            _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
-                                                       _mm_mul_pd(uij3_SSE1, dr_SSE1)));
-            t2_SSE0            = _mm_sub_pd(t2_SSE0,
-                                            _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
-                                                       _mm_mul_pd(prod_SSE0, uij3_SSE0)));
-            t2_SSE1            = _mm_sub_pd(t2_SSE1,
-                                            _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
-                                                       _mm_mul_pd(prod_SSE1, uij3_SSE1)));
-            t3_SSE0            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
-                                            _mm_mul_pd(rinv_SSE0, rinv_SSE0));
-            t3_SSE1            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
-                                            _mm_mul_pd(rinv_SSE1, rinv_SSE1));
-            t3_SSE0            = _mm_sub_pd(t3_SSE0,
-                                            _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
-                                                       _mm_add_pd(one_SSE,
-                                                                  _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
-            t3_SSE1            = _mm_sub_pd(t3_SSE1,
-                                            _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
-                                                       _mm_add_pd(one_SSE,
-                                                                  _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));
-
-            t1_SSE0            = _mm_mul_pd(rinv_SSE0,
-                                            _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
-                                                       _mm_add_pd(t2_SSE0, t3_SSE0)));
-            t1_SSE1            = _mm_mul_pd(rinv_SSE1,
-                                            _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
-                                                       _mm_add_pd(t2_SSE1, t3_SSE1)));
-
-            _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
-            dadx += 2;
-            _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
-            dadx += 2;
-
-            /* Evaluate influence of atom ai -> aj */
-            t1_SSE0            = _mm_add_pd(dr_SSE0, sk_ai_SSE0);
-            t1_SSE1            = _mm_add_pd(dr_SSE1, sk_ai_SSE1);
-            t2_SSE0            = _mm_sub_pd(dr_SSE0, sk_ai_SSE0);
-            t2_SSE1            = _mm_sub_pd(dr_SSE1, sk_ai_SSE1);
-            t3_SSE0            = _mm_sub_pd(sk_ai_SSE0, dr_SSE0);
-            t3_SSE1            = _mm_sub_pd(sk_ai_SSE1, dr_SSE1);
-
-            obc_mask1_SSE0     = _mm_cmplt_pd(raj_SSE, t1_SSE0);
-            obc_mask1_SSE1     = _mm_cmplt_pd(raj_SSE, t1_SSE1);
-            obc_mask2_SSE0     = _mm_cmplt_pd(raj_SSE, t2_SSE0);
-            obc_mask2_SSE1     = _mm_cmplt_pd(raj_SSE, t2_SSE1);
-            obc_mask3_SSE0     = _mm_cmplt_pd(raj_SSE, t3_SSE0);
-            obc_mask3_SSE1     = _mm_cmplt_pd(raj_SSE, t3_SSE1);
-            obc_mask1_SSE0     = _mm_and_pd(obc_mask1_SSE0, imask_SSE0);
-            obc_mask1_SSE1     = _mm_and_pd(obc_mask1_SSE1, imask_SSE1);
-
-            uij_SSE0           = gmx_mm_inv_pd(t1_SSE0);
-            uij_SSE1           = gmx_mm_inv_pd(t1_SSE1);
-            lij_SSE0           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
-                                              _mm_andnot_pd(obc_mask2_SSE0, raj_inv_SSE));
-            lij_SSE1           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
-                                              _mm_andnot_pd(obc_mask2_SSE1, raj_inv_SSE));
-            dlij_SSE0          = _mm_and_pd(one_SSE, obc_mask2_SSE0);
-            dlij_SSE1          = _mm_and_pd(one_SSE, obc_mask2_SSE1);
-
-            uij2_SSE0          = _mm_mul_pd(uij_SSE0, uij_SSE0);
-            uij2_SSE1          = _mm_mul_pd(uij_SSE1, uij_SSE1);
-            uij3_SSE0          = _mm_mul_pd(uij2_SSE0, uij_SSE0);
-            uij3_SSE1          = _mm_mul_pd(uij2_SSE1, uij_SSE1);
-            lij2_SSE0          = _mm_mul_pd(lij_SSE0, lij_SSE0);
-            lij2_SSE1          = _mm_mul_pd(lij_SSE1, lij_SSE1);
-            lij3_SSE0          = _mm_mul_pd(lij2_SSE0, lij_SSE0);
-            lij3_SSE1          = _mm_mul_pd(lij2_SSE1, lij_SSE1);
-
-            diff2_SSE0         = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
-            diff2_SSE1         = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
-            lij_inv_SSE0       = gmx_mm_invsqrt_pd(lij2_SSE0);
-            lij_inv_SSE1       = gmx_mm_invsqrt_pd(lij2_SSE1);
-            sk2_rinv_SSE0      = _mm_mul_pd(sk2_ai_SSE0, rinv_SSE0);
-            sk2_rinv_SSE1      = _mm_mul_pd(sk2_ai_SSE1, rinv_SSE1);
-            prod_SSE0          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
-            prod_SSE1          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);
-
-            logterm_SSE0       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
-            logterm_SSE1       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
-            t1_SSE0            = _mm_sub_pd(lij_SSE0, uij_SSE0);
-            t1_SSE1            = _mm_sub_pd(lij_SSE1, uij_SSE1);
-            t2_SSE0            = _mm_mul_pd(diff2_SSE0,
-                                            _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
-                                                       prod_SSE0));
-            t2_SSE1            = _mm_mul_pd(diff2_SSE1,
-                                            _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
-                                                       prod_SSE1));
-            t3_SSE0            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
-            t3_SSE1            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
-            t1_SSE0            = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
-            t1_SSE1            = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
-            t4_SSE0            = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE0));
-            t4_SSE1            = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE1));
-            t4_SSE0            = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
-            t4_SSE1            = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
-            t1_SSE0            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
-            t1_SSE1            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));
-
-            _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
-                                            _mm_add_pd(_mm_and_pd(t1_SSE0, obc_mask1_SSE0),
-                                                       _mm_and_pd(t1_SSE1, obc_mask1_SSE1))));
-
-            t1_SSE0            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
-                                            _mm_mul_pd(prod_SSE0, lij3_SSE0));
-            t1_SSE1            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
-                                            _mm_mul_pd(prod_SSE1, lij3_SSE1));
-            t1_SSE0            = _mm_sub_pd(t1_SSE0,
-                                            _mm_mul_pd(onefourth_SSE,
-                                                       _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
-                                                                  _mm_mul_pd(lij3_SSE0, dr_SSE0))));
-            t1_SSE1            = _mm_sub_pd(t1_SSE1,
-                                            _mm_mul_pd(onefourth_SSE,
-                                                       _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
-                                                                  _mm_mul_pd(lij3_SSE1, dr_SSE1))));
-            t2_SSE0            = _mm_mul_pd(onefourth_SSE,
-                                            _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
-                                                       _mm_mul_pd(uij3_SSE0, dr_SSE0)));
-            t2_SSE1            = _mm_mul_pd(onefourth_SSE,
-                                            _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
-                                                       _mm_mul_pd(uij3_SSE1, dr_SSE1)));
-            t2_SSE0            = _mm_sub_pd(t2_SSE0,
-                                            _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
-                                                       _mm_mul_pd(prod_SSE0, uij3_SSE0)));
-            t2_SSE1            = _mm_sub_pd(t2_SSE1,
-                                            _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
-                                                       _mm_mul_pd(prod_SSE1, uij3_SSE1)));
-
-            t3_SSE0            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
-                                            _mm_mul_pd(rinv_SSE0, rinv_SSE0));
-            t3_SSE1            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
-                                            _mm_mul_pd(rinv_SSE1, rinv_SSE1));
-
-            t3_SSE0            = _mm_sub_pd(t3_SSE0,
-                                            _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
-                                                       _mm_add_pd(one_SSE,
-                                                                  _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
-            t3_SSE1            = _mm_sub_pd(t3_SSE1,
-                                            _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
-                                                       _mm_add_pd(one_SSE,
-                                                                  _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));
-
-            t1_SSE0            = _mm_mul_pd(rinv_SSE0,
-                                            _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
-                                                       _mm_add_pd(t2_SSE0, t3_SSE0)));
-            t1_SSE1            = _mm_mul_pd(rinv_SSE1,
-                                            _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
-                                                       _mm_add_pd(t2_SSE1, t3_SSE1)));
-
-            _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
-            dadx += 2;
-            _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
-            dadx += 2;
-        }
-
-        /* Epilogue part, including exclusion mask */
-        for (j = nj2; j < nj3; j += UNROLLJ)
-        {
-            jmask_SSE0 = _mm_load_pd((double *)emask0);
-            jmask_SSE1 = _mm_load_pd((double *)emask1);
-            emask0    += 2*UNROLLJ;
-            emask1    += 2*UNROLLJ;
-
-            /* load j atom coordinates */
-            jx_SSE            = _mm_load_pd(x_align+j);
-            jy_SSE            = _mm_load_pd(y_align+j);
-            jz_SSE            = _mm_load_pd(z_align+j);
-
-            /* Calculate distance */
-            dx_SSE0            = _mm_sub_pd(ix_SSE0, jx_SSE);
-            dy_SSE0            = _mm_sub_pd(iy_SSE0, jy_SSE);
-            dz_SSE0            = _mm_sub_pd(iz_SSE0, jz_SSE);
-            dx_SSE1            = _mm_sub_pd(ix_SSE1, jx_SSE);
-            dy_SSE1            = _mm_sub_pd(iy_SSE1, jy_SSE);
-            dz_SSE1            = _mm_sub_pd(iz_SSE1, jz_SSE);
-
-            /* rsq = dx*dx+dy*dy+dz*dz */
-            rsq_SSE0           = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
-            rsq_SSE1           = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);
-
-            /* Combine masks */
-            jmask_SSE0         = _mm_and_pd(jmask_SSE0, imask_SSE0);
-            jmask_SSE1         = _mm_and_pd(jmask_SSE1, imask_SSE1);
-
-            /* Calculate 1/r */
-            rinv_SSE0          = gmx_mm_invsqrt_pd(rsq_SSE0);
-            rinv_SSE1          = gmx_mm_invsqrt_pd(rsq_SSE1);
-
-            /* Apply mask */
-            rinv_SSE0          = _mm_and_pd(rinv_SSE0, jmask_SSE0);
-            rinv_SSE1          = _mm_and_pd(rinv_SSE1, jmask_SSE1);
-
-            dr_SSE0            = _mm_mul_pd(rsq_SSE0, rinv_SSE0);
-            dr_SSE1            = _mm_mul_pd(rsq_SSE1, rinv_SSE1);
-
-            sk_aj_SSE          = _mm_load_pd(obc_param+j);
-            raj_SSE            = _mm_load_pd(gb_radius+j);
-
-            raj_inv_SSE        = gmx_mm_inv_pd(raj_SSE);
-
-            /* Evaluate influence of atom aj -> ai */
-            t1_SSE0            = _mm_add_pd(dr_SSE0, sk_aj_SSE);
-            t1_SSE1            = _mm_add_pd(dr_SSE1, sk_aj_SSE);
-            t2_SSE0            = _mm_sub_pd(dr_SSE0, sk_aj_SSE);
-            t2_SSE1            = _mm_sub_pd(dr_SSE1, sk_aj_SSE);
-            t3_SSE0            = _mm_sub_pd(sk_aj_SSE, dr_SSE0);
-            t3_SSE1            = _mm_sub_pd(sk_aj_SSE, dr_SSE1);
-
-            obc_mask1_SSE0     = _mm_cmplt_pd(rai_SSE0, t1_SSE0);
-            obc_mask1_SSE1     = _mm_cmplt_pd(rai_SSE1, t1_SSE1);
-            obc_mask2_SSE0     = _mm_cmplt_pd(rai_SSE0, t2_SSE0);
-            obc_mask2_SSE1     = _mm_cmplt_pd(rai_SSE1, t2_SSE1);
-            obc_mask3_SSE0     = _mm_cmplt_pd(rai_SSE0, t3_SSE0);
-            obc_mask3_SSE1     = _mm_cmplt_pd(rai_SSE1, t3_SSE1);
-            obc_mask1_SSE0     = _mm_and_pd(obc_mask1_SSE0, jmask_SSE0);
-            obc_mask1_SSE1     = _mm_and_pd(obc_mask1_SSE1, jmask_SSE1);
-
-            uij_SSE0           = gmx_mm_inv_pd(t1_SSE0);
-            uij_SSE1           = gmx_mm_inv_pd(t1_SSE1);
-            lij_SSE0           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
-                                              _mm_andnot_pd(obc_mask2_SSE0, rai_inv_SSE0));
-            lij_SSE1           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
-                                              _mm_andnot_pd(obc_mask2_SSE1, rai_inv_SSE1));
-
-            dlij_SSE0          = _mm_and_pd(one_SSE, obc_mask2_SSE0);
-            dlij_SSE1          = _mm_and_pd(one_SSE, obc_mask2_SSE1);
-
-            uij2_SSE0          = _mm_mul_pd(uij_SSE0, uij_SSE0);
-            uij2_SSE1          = _mm_mul_pd(uij_SSE1, uij_SSE1);
-            uij3_SSE0          = _mm_mul_pd(uij2_SSE0, uij_SSE0);
-            uij3_SSE1          = _mm_mul_pd(uij2_SSE1, uij_SSE1);
-            lij2_SSE0          = _mm_mul_pd(lij_SSE0, lij_SSE0);
-            lij2_SSE1          = _mm_mul_pd(lij_SSE1, lij_SSE1);
-            lij3_SSE0          = _mm_mul_pd(lij2_SSE0, lij_SSE0);
-            lij3_SSE1          = _mm_mul_pd(lij2_SSE1, lij_SSE1);
-
-            diff2_SSE0         = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
-            diff2_SSE1         = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
-            lij_inv_SSE0       = gmx_mm_invsqrt_pd(lij2_SSE0);
-            lij_inv_SSE1       = gmx_mm_invsqrt_pd(lij2_SSE1);
-            sk2_aj_SSE         = _mm_mul_pd(sk_aj_SSE, sk_aj_SSE);
-            sk2_rinv_SSE0      = _mm_mul_pd(sk2_aj_SSE, rinv_SSE0);
-            sk2_rinv_SSE1      = _mm_mul_pd(sk2_aj_SSE, rinv_SSE1);
-            prod_SSE0          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
-            prod_SSE1          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);
-
-            logterm_SSE0       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
-            logterm_SSE1       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
-
-            t1_SSE0            = _mm_sub_pd(lij_SSE0, uij_SSE0);
-            t1_SSE1            = _mm_sub_pd(lij_SSE1, uij_SSE1);
-            t2_SSE0            = _mm_mul_pd(diff2_SSE0,
-                                            _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
-                                                       prod_SSE0));
-            t2_SSE1            = _mm_mul_pd(diff2_SSE1,
-                                            _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
-                                                       prod_SSE1));
-
-            t3_SSE0            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
-            t3_SSE1            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
-            t1_SSE0            = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
-            t1_SSE1            = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
-            t4_SSE0            = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE0, lij_SSE0));
-            t4_SSE1            = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE1, lij_SSE1));
-            t4_SSE0            = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
-            t4_SSE1            = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
-            t1_SSE0            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
-            t1_SSE1            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));
-
-            sum_ai_SSE0        = _mm_add_pd(sum_ai_SSE0, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
-            sum_ai_SSE1        = _mm_add_pd(sum_ai_SSE1, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
-
-            t1_SSE0            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
-                                            _mm_mul_pd(prod_SSE0, lij3_SSE0));
-            t1_SSE1            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
-                                            _mm_mul_pd(prod_SSE1, lij3_SSE1));
-            t1_SSE0            = _mm_sub_pd(t1_SSE0,
-                                            _mm_mul_pd(onefourth_SSE,
-                                                       _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
-                                                                  _mm_mul_pd(lij3_SSE0, dr_SSE0))));
-            t1_SSE1            = _mm_sub_pd(t1_SSE1,
-                                            _mm_mul_pd(onefourth_SSE,
-                                                       _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
-                                                                  _mm_mul_pd(lij3_SSE1, dr_SSE1))));
-
-            t2_SSE0            = _mm_mul_pd(onefourth_SSE,
-                                            _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
-                                                       _mm_mul_pd(uij3_SSE0, dr_SSE0)));
-            t2_SSE1            = _mm_mul_pd(onefourth_SSE,
-                                            _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
-                                                       _mm_mul_pd(uij3_SSE1, dr_SSE1)));
-            t2_SSE0            = _mm_sub_pd(t2_SSE0,
-                                            _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
-                                                       _mm_mul_pd(prod_SSE0, uij3_SSE0)));
-            t2_SSE1            = _mm_sub_pd(t2_SSE1,
-                                            _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
-                                                       _mm_mul_pd(prod_SSE1, uij3_SSE1)));
-            t3_SSE0            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
-                                            _mm_mul_pd(rinv_SSE0, rinv_SSE0));
-            t3_SSE1            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
-                                            _mm_mul_pd(rinv_SSE1, rinv_SSE1));
-            t3_SSE0            = _mm_sub_pd(t3_SSE0,
-                                            _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
-                                                       _mm_add_pd(one_SSE,
-                                                                  _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
-            t3_SSE1            = _mm_sub_pd(t3_SSE1,
-                                            _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
-                                                       _mm_add_pd(one_SSE,
-                                                                  _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));
-
-            t1_SSE0            = _mm_mul_pd(rinv_SSE0,
-                                            _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
-                                                       _mm_add_pd(t2_SSE0, t3_SSE0)));
-            t1_SSE1            = _mm_mul_pd(rinv_SSE1,
-                                            _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
-                                                       _mm_add_pd(t2_SSE1, t3_SSE1)));
-
-            _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
-            dadx += 2;
-            _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
-            dadx += 2;
-
-            /* Evaluate influence of atom ai -> aj */
-            t1_SSE0            = _mm_add_pd(dr_SSE0, sk_ai_SSE0);
-            t1_SSE1            = _mm_add_pd(dr_SSE1, sk_ai_SSE1);
-            t2_SSE0            = _mm_sub_pd(dr_SSE0, sk_ai_SSE0);
-            t2_SSE1            = _mm_sub_pd(dr_SSE1, sk_ai_SSE1);
-            t3_SSE0            = _mm_sub_pd(sk_ai_SSE0, dr_SSE0);
-            t3_SSE1            = _mm_sub_pd(sk_ai_SSE1, dr_SSE1);
-
-            obc_mask1_SSE0     = _mm_cmplt_pd(raj_SSE, t1_SSE0);
-            obc_mask1_SSE1     = _mm_cmplt_pd(raj_SSE, t1_SSE1);
-            obc_mask2_SSE0     = _mm_cmplt_pd(raj_SSE, t2_SSE0);
-            obc_mask2_SSE1     = _mm_cmplt_pd(raj_SSE, t2_SSE1);
-            obc_mask3_SSE0     = _mm_cmplt_pd(raj_SSE, t3_SSE0);
-            obc_mask3_SSE1     = _mm_cmplt_pd(raj_SSE, t3_SSE1);
-            obc_mask1_SSE0     = _mm_and_pd(obc_mask1_SSE0, jmask_SSE0);
-            obc_mask1_SSE1     = _mm_and_pd(obc_mask1_SSE1, jmask_SSE1);
-
-            uij_SSE0           = gmx_mm_inv_pd(t1_SSE0);
-            uij_SSE1           = gmx_mm_inv_pd(t1_SSE1);
-            lij_SSE0           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
-                                              _mm_andnot_pd(obc_mask2_SSE0, raj_inv_SSE));
-            lij_SSE1           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
-                                              _mm_andnot_pd(obc_mask2_SSE1, raj_inv_SSE));
-
-            dlij_SSE0          = _mm_and_pd(one_SSE, obc_mask2_SSE0);
-            dlij_SSE1          = _mm_and_pd(one_SSE, obc_mask2_SSE1);
-
-            uij2_SSE0          = _mm_mul_pd(uij_SSE0, uij_SSE0);
-            uij2_SSE1          = _mm_mul_pd(uij_SSE1, uij_SSE1);
-            uij3_SSE0          = _mm_mul_pd(uij2_SSE0, uij_SSE0);
-            uij3_SSE1          = _mm_mul_pd(uij2_SSE1, uij_SSE1);
-            lij2_SSE0          = _mm_mul_pd(lij_SSE0, lij_SSE0);
-            lij2_SSE1          = _mm_mul_pd(lij_SSE1, lij_SSE1);
-            lij3_SSE0          = _mm_mul_pd(lij2_SSE0, lij_SSE0);
-            lij3_SSE1          = _mm_mul_pd(lij2_SSE1, lij_SSE1);
-
-            diff2_SSE0         = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
-            diff2_SSE1         = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
-            lij_inv_SSE0       = gmx_mm_invsqrt_pd(lij2_SSE0);
-            lij_inv_SSE1       = gmx_mm_invsqrt_pd(lij2_SSE1);
-            sk2_rinv_SSE0      = _mm_mul_pd(sk2_ai_SSE0, rinv_SSE0);
-            sk2_rinv_SSE1      = _mm_mul_pd(sk2_ai_SSE1, rinv_SSE1);
-            prod_SSE0          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
-            prod_SSE1          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);
-
-            logterm_SSE0       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
-            logterm_SSE1       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
-            t1_SSE0            = _mm_sub_pd(lij_SSE0, uij_SSE0);
-            t1_SSE1            = _mm_sub_pd(lij_SSE1, uij_SSE1);
-            t2_SSE0            = _mm_mul_pd(diff2_SSE0,
-                                            _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
-                                                       prod_SSE0));
-            t2_SSE1            = _mm_mul_pd(diff2_SSE1,
-                                            _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
-                                                       prod_SSE1));
-            t3_SSE0            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
-            t3_SSE1            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
-            t1_SSE0            = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
-            t1_SSE1            = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
-            t4_SSE0            = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE0));
-            t4_SSE1            = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE1));
-            t4_SSE0            = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
-            t4_SSE1            = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
-            t1_SSE0            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
-            t1_SSE1            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));
-
-            _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
-                                            _mm_add_pd(_mm_and_pd(t1_SSE0, obc_mask1_SSE0),
-                                                       _mm_and_pd(t1_SSE1, obc_mask1_SSE1))));
-
-            t1_SSE0            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
-                                            _mm_mul_pd(prod_SSE0, lij3_SSE0));
-            t1_SSE1            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
-                                            _mm_mul_pd(prod_SSE1, lij3_SSE1));
-
-            t1_SSE0            = _mm_sub_pd(t1_SSE0,
-                                            _mm_mul_pd(onefourth_SSE,
-                                                       _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
-                                                                  _mm_mul_pd(lij3_SSE0, dr_SSE0))));
-            t1_SSE1            = _mm_sub_pd(t1_SSE1,
-                                            _mm_mul_pd(onefourth_SSE,
-                                                       _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
-                                                                  _mm_mul_pd(lij3_SSE1, dr_SSE1))));
-            t2_SSE0            = _mm_mul_pd(onefourth_SSE,
-                                            _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
-                                                       _mm_mul_pd(uij3_SSE0, dr_SSE0)));
-            t2_SSE1            = _mm_mul_pd(onefourth_SSE,
-                                            _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
-                                                       _mm_mul_pd(uij3_SSE1, dr_SSE1)));
-            t2_SSE0            = _mm_sub_pd(t2_SSE0,
-                                            _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
-                                                       _mm_mul_pd(prod_SSE0, uij3_SSE0)));
-            t2_SSE1            = _mm_sub_pd(t2_SSE1,
-                                            _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
-                                                       _mm_mul_pd(prod_SSE1, uij3_SSE1)));
-
-            t3_SSE0            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
-                                            _mm_mul_pd(rinv_SSE0, rinv_SSE0));
-            t3_SSE1            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
-                                            _mm_mul_pd(rinv_SSE1, rinv_SSE1));
-
-            t3_SSE0            = _mm_sub_pd(t3_SSE0,
-                                            _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
-                                                       _mm_add_pd(one_SSE,
-                                                                  _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
-            t3_SSE1            = _mm_sub_pd(t3_SSE1,
-                                            _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
-                                                       _mm_add_pd(one_SSE,
-                                                                  _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));
-
-            t1_SSE0            = _mm_mul_pd(rinv_SSE0,
-                                            _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
-                                                       _mm_add_pd(t2_SSE0, t3_SSE0)));
-            t1_SSE1            = _mm_mul_pd(rinv_SSE1,
-                                            _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
-                                                       _mm_add_pd(t2_SSE1, t3_SSE1)));
-
-            _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
-            dadx += 2;
-            _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
-            dadx += 2;
-        }
-        GMX_MM_TRANSPOSE2_PD(sum_ai_SSE0, sum_ai_SSE1);
-        sum_ai_SSE0 = _mm_add_pd(sum_ai_SSE0, sum_ai_SSE1);
-        _mm_store_pd(work+i, _mm_add_pd(sum_ai_SSE0, _mm_load_pd(work+i)));
-    }
-
-
-    for (i = 0; i < natoms/2+1; i++)
-    {
-        work[i] += work[natoms+i];
-    }
-
-    /* Parallel summations would go here if ever implemented in DD */
-
-    if (gb_algorithm == egbHCT)
-    {
-        /* HCT */
-        for (i = 0; i < natoms; i++)
-        {
-            if (born->use[i] != 0)
-            {
-                rai     = top->atomtypes.gb_radius[mdatoms->typeA[i]]-born->gb_doffset;
-                sum_ai  = 1.0/rai - work[i];
-                min_rad = rai + born->gb_doffset;
-                rad     = 1.0/sum_ai;
-
-                born->bRad[i]   = rad > min_rad ? rad : min_rad;
-                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
-            }
-        }
-
-    }
-    else
-    {
-        /* OBC */
-
-        /* Calculate the radii */
-        for (i = 0; i < natoms; i++)
-        {
-
-            if (born->use[i] != 0)
-            {
-                rai        = top->atomtypes.gb_radius[mdatoms->typeA[i]];
-                rai_inv2   = 1.0/rai;
-                rai        = rai-born->gb_doffset;
-                rai_inv    = 1.0/rai;
-                sum_ai     = rai * work[i];
-                sum_ai2    = sum_ai  * sum_ai;
-                sum_ai3    = sum_ai2 * sum_ai;
-
-                tsum          = tanh(born->obc_alpha*sum_ai-born->obc_beta*sum_ai2+born->obc_gamma*sum_ai3);
-                born->bRad[i] = rai_inv - tsum*rai_inv2;
-                born->bRad[i] = 1.0 / born->bRad[i];
-
-                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
-
-                tchain         = rai * (born->obc_alpha-2*born->obc_beta*sum_ai+3*born->obc_gamma*sum_ai2);
-                born->drobc[i] = (1.0-tsum*tsum)*tchain*rai_inv2;
-            }
-        }
-    }
-
-    return 0;
-}
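
For reference, the OBC branch above maps onto a simple scalar form: psi = (rho_i - doffset) * I_i, and 1/R_i = 1/(rho_i - doffset) - tanh(alpha*psi - beta*psi^2 + gamma*psi^3)/rho_i. A minimal standalone sketch (the helper name and signature are illustrative, not GROMACS API):

#include <math.h>

/* Scalar sketch of the OBC effective-radius update performed above.
 * I is the accumulated integral (work[i]), rho the intrinsic gb_radius
 * and doffset the dielectric offset (born->gb_doffset). */
static double obc_born_radius(double I, double rho, double doffset,
                              double alpha, double beta, double gamma)
{
    double rho_t = rho - doffset;   /* offset-reduced radius */
    double psi   = rho_t*I;         /* scaled integral       */
    double tsum  = tanh(alpha*psi - beta*psi*psi + gamma*psi*psi*psi);

    /* 1/R = 1/rho_t - tanh(...)/rho, then invert to get R */
    return 1.0/(1.0/rho_t - tsum/rho);
}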
-
-
-int
-genborn_allvsall_calc_chainrule_sse2_double(t_forcerec   *           fr,
-                                            t_mdatoms   *            mdatoms,
-                                            gmx_genborn_t   *        born,
-                                            double *                 x,
-                                            double *                 f,
-                                            int                      gb_algorithm,
-                                            void   *                 paadata)
-{
-    gmx_allvsallgb2_data_t *aadata;
-    int                     natoms;
-    int                     ni0, ni1;
-    int                     nj0, nj1, nj2, nj3;
-    int                     i, j, k, n;
-    int                     idx;
-    int              *      mask;
-    int              *      pmask0;
-    int              *      emask0;
-    int              *      jindex;
-
-    double                  ix, iy, iz;
-    double                  fix, fiy, fiz;
-    double                  jx, jy, jz;
-    double                  dx, dy, dz;
-    double                  tx, ty, tz;
-    double                  rbai, rbaj, fgb, fgb_ai, rbi;
-    double            *     rb;
-    double            *     dadx;
-    double            *     x_align;
-    double            *     y_align;
-    double            *     z_align;
-    double            *     fx_align;
-    double            *     fy_align;
-    double            *     fz_align;
-    double                  tmpsum[2];
-
-    __m128d                 jmask_SSE0, jmask_SSE1;
-    __m128d                 ix_SSE0, iy_SSE0, iz_SSE0;
-    __m128d                 ix_SSE1, iy_SSE1, iz_SSE1;
-    __m128d                 fix_SSE0, fiy_SSE0, fiz_SSE0;
-    __m128d                 fix_SSE1, fiy_SSE1, fiz_SSE1;
-    __m128d                 rbai_SSE0, rbai_SSE1;
-    __m128d                 imask_SSE0, imask_SSE1;
-    __m128d                 jx_SSE, jy_SSE, jz_SSE, rbaj_SSE;
-    __m128d                 dx_SSE0, dy_SSE0, dz_SSE0;
-    __m128d                 dx_SSE1, dy_SSE1, dz_SSE1;
-    __m128d                 fgb_SSE0, fgb_ai_SSE0;
-    __m128d                 fgb_SSE1, fgb_ai_SSE1;
-    __m128d                 tx_SSE0, ty_SSE0, tz_SSE0;
-    __m128d                 tx_SSE1, ty_SSE1, tz_SSE1;
-    __m128d                 t1, t2, tmpSSE;
-
-    natoms              = mdatoms->nr;
-    ni0                 = 0;
-    ni1                 = mdatoms->homenr;
-
-    aadata = (gmx_allvsallgb2_data_t *)paadata;
-
-    x_align  = aadata->x_align;
-    y_align  = aadata->y_align;
-    z_align  = aadata->z_align;
-    fx_align = aadata->fx_align;
-    fy_align = aadata->fy_align;
-    fz_align = aadata->fz_align;
-
-    jindex    = aadata->jindex_gb;
-    dadx      = fr->dadx;
-
-    n  = 0;
-    rb = aadata->work;
-
-    /* Loop to get the proper form for the Born radius term */
-    if (gb_algorithm == egbSTILL)
-    {
-        for (i = 0; i < natoms; i++)
-        {
-            rbi   = born->bRad[i];
-            rb[i] = (2 * rbi * rbi * fr->dvda[i])/ONE_4PI_EPS0;
-        }
-    }
-    else if (gb_algorithm == egbHCT)
-    {
-        for (i = 0; i < natoms; i++)
-        {
-            rbi   = born->bRad[i];
-            rb[i] = rbi * rbi * fr->dvda[i];
-        }
-    }
-    else if (gb_algorithm == egbOBC)
-    {
-        for (idx = 0; idx < natoms; idx++)
-        {
-            rbi     = born->bRad[idx];
-            rb[idx] = rbi * rbi * born->drobc[idx] * fr->dvda[idx];
-        }
-    }
-
-    for (i = 0; i < 2*natoms; i++)
-    {
-        fx_align[i]       = 0;
-        fy_align[i]       = 0;
-        fz_align[i]       = 0;
-    }
-
-
-    for (i = 0; i < natoms; i++)
-    {
-        rb[i+natoms] = rb[i];
-    }
-
-    for (i = ni0; i < ni1; i += UNROLLI)
-    {
-        /* We assume shifts are NOT used for all-vs-all interactions */
-
-        /* Load i atom data */
-        ix_SSE0          = _mm_load1_pd(x_align+i);
-        iy_SSE0          = _mm_load1_pd(y_align+i);
-        iz_SSE0          = _mm_load1_pd(z_align+i);
-        ix_SSE1          = _mm_load1_pd(x_align+i+1);
-        iy_SSE1          = _mm_load1_pd(y_align+i+1);
-        iz_SSE1          = _mm_load1_pd(z_align+i+1);
-
-        fix_SSE0         = _mm_setzero_pd();
-        fiy_SSE0         = _mm_setzero_pd();
-        fiz_SSE0         = _mm_setzero_pd();
-        fix_SSE1         = _mm_setzero_pd();
-        fiy_SSE1         = _mm_setzero_pd();
-        fiz_SSE1         = _mm_setzero_pd();
-
-        rbai_SSE0        = _mm_load1_pd(rb+i);
-        rbai_SSE1        = _mm_load1_pd(rb+i+1);
-
-        /* Load limits for loop over neighbors */
-        nj0              = jindex[4*i];
-        nj3              = jindex[4*i+3];
-
-        /* No masks necessary, since the stored chain rule derivatives will be zero in those cases! */
-        for (j = nj0; j < nj3; j += UNROLLJ)
-        {
-            /* load j atom coordinates */
-            jx_SSE           = _mm_load_pd(x_align+j);
-            jy_SSE           = _mm_load_pd(y_align+j);
-            jz_SSE           = _mm_load_pd(z_align+j);
-
-            /* Calculate distance */
-            dx_SSE0          = _mm_sub_pd(ix_SSE0, jx_SSE);
-            dy_SSE0          = _mm_sub_pd(iy_SSE0, jy_SSE);
-            dz_SSE0          = _mm_sub_pd(iz_SSE0, jz_SSE);
-            dx_SSE1          = _mm_sub_pd(ix_SSE1, jx_SSE);
-            dy_SSE1          = _mm_sub_pd(iy_SSE1, jy_SSE);
-            dz_SSE1          = _mm_sub_pd(iz_SSE1, jz_SSE);
-
-            rbaj_SSE         = _mm_load_pd(rb+j);
-
-            fgb_SSE0         = _mm_mul_pd(rbai_SSE0, _mm_load_pd(dadx));
-            dadx            += 2;
-            fgb_SSE1         = _mm_mul_pd(rbai_SSE1, _mm_load_pd(dadx));
-            dadx            += 2;
-
-            fgb_ai_SSE0      = _mm_mul_pd(rbaj_SSE, _mm_load_pd(dadx));
-            dadx            += 2;
-            fgb_ai_SSE1      = _mm_mul_pd(rbaj_SSE, _mm_load_pd(dadx));
-            dadx            += 2;
-
-            /* Total force between ai and aj is the sum of ai->aj and aj->ai */
-            fgb_SSE0         = _mm_add_pd(fgb_SSE0, fgb_ai_SSE0);
-            fgb_SSE1         = _mm_add_pd(fgb_SSE1, fgb_ai_SSE1);
-
-            /* Calculate temporary vectorial force */
-            tx_SSE0            = _mm_mul_pd(fgb_SSE0, dx_SSE0);
-            ty_SSE0            = _mm_mul_pd(fgb_SSE0, dy_SSE0);
-            tz_SSE0            = _mm_mul_pd(fgb_SSE0, dz_SSE0);
-            tx_SSE1            = _mm_mul_pd(fgb_SSE1, dx_SSE1);
-            ty_SSE1            = _mm_mul_pd(fgb_SSE1, dy_SSE1);
-            tz_SSE1            = _mm_mul_pd(fgb_SSE1, dz_SSE1);
-
-            /* Increment i atom force */
-            fix_SSE0          = _mm_add_pd(fix_SSE0, tx_SSE0);
-            fiy_SSE0          = _mm_add_pd(fiy_SSE0, ty_SSE0);
-            fiz_SSE0          = _mm_add_pd(fiz_SSE0, tz_SSE0);
-            fix_SSE1          = _mm_add_pd(fix_SSE1, tx_SSE1);
-            fiy_SSE1          = _mm_add_pd(fiy_SSE1, ty_SSE1);
-            fiz_SSE1          = _mm_add_pd(fiz_SSE1, tz_SSE1);
-
-            /* Decrement j atom force */
-            _mm_store_pd(fx_align+j,
-                         _mm_sub_pd( _mm_load_pd(fx_align+j), _mm_add_pd(tx_SSE0, tx_SSE1) ));
-            _mm_store_pd(fy_align+j,
-                         _mm_sub_pd( _mm_load_pd(fy_align+j), _mm_add_pd(ty_SSE0, ty_SSE1) ));
-            _mm_store_pd(fz_align+j,
-                         _mm_sub_pd( _mm_load_pd(fz_align+j), _mm_add_pd(tz_SSE0, tz_SSE1) ));
-        }
-
-        /* Add i forces to mem */
-        GMX_MM_TRANSPOSE2_PD(fix_SSE0, fix_SSE1);
-        fix_SSE0 = _mm_add_pd(fix_SSE0, fix_SSE1);
-        _mm_store_pd(fx_align+i, _mm_add_pd(fix_SSE0, _mm_load_pd(fx_align+i)));
-
-        GMX_MM_TRANSPOSE2_PD(fiy_SSE0, fiy_SSE1);
-        fiy_SSE0 = _mm_add_pd(fiy_SSE0, fiy_SSE1);
-        _mm_store_pd(fy_align+i, _mm_add_pd(fiy_SSE0, _mm_load_pd(fy_align+i)));
-
-        GMX_MM_TRANSPOSE2_PD(fiz_SSE0, fiz_SSE1);
-        fiz_SSE0 = _mm_add_pd(fiz_SSE0, fiz_SSE1);
-        _mm_store_pd(fz_align+i, _mm_add_pd(fiz_SSE0, _mm_load_pd(fz_align+i)));
-    }
-
-    for (i = 0; i < natoms; i++)
-    {
-        f[3*i]       += fx_align[i] + fx_align[natoms+i];
-        f[3*i+1]     += fy_align[i] + fy_align[natoms+i];
-        f[3*i+2]     += fz_align[i] + fz_align[natoms+i];
-    }
-
-    return 0;
-}
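
The chain-rule loop above combines, for every pair, the two derivative terms stored back-to-back in fr->dadx, each weighted by the rb factor computed at the top of the function. A scalar sketch of one pair (names hypothetical, not GROMACS API):

/* Scalar equivalent of the SIMD chain-rule step: dadx_ij and dadx_ji
 * are the ai->aj and aj->ai derivatives, dxyz the i-j distance vector,
 * and fi/fj the force accumulators for the two atoms. */
static void gb_chainrule_pair(double rb_i, double rb_j,
                              double dadx_ij, double dadx_ji,
                              const double dxyz[3],
                              double fi[3], double fj[3])
{
    /* Total force between ai and aj is the sum of ai->aj and aj->ai */
    double fgb = rb_i*dadx_ij + rb_j*dadx_ji;
    int    d;

    for (d = 0; d < 3; d++)
    {
        fi[d] += fgb*dxyz[d];
        fj[d] -= fgb*dxyz[d];
    }
}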
-
-#else
-/* dummy variable when not using SSE */
-int genborn_allvsall_sse2_double_dummy;
-
-
-#endif
diff --git a/src/gromacs/mdlib/genborn_allvsall_sse2_double.h b/src/gromacs/mdlib/genborn_allvsall_sse2_double.h
deleted file mode 100644 (file)
index 3629475..0000000
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2009, The GROMACS Development Team.
- * Copyright (c) 2010,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#ifndef _GENBORN_ALLVSALL_SSE2_DOUBLE_H
-#define _GENBORN_ALLVSALL_SSE2_DOUBLE_H
-
-#include "gromacs/legacyheaders/typedefs.h"
-#include "gromacs/legacyheaders/types/simple.h"
-
-int
-genborn_allvsall_calc_still_radii_sse2_double(t_forcerec *           fr,
-                                              t_mdatoms *            mdatoms,
-                                              gmx_genborn_t *        born,
-                                              gmx_localtop_t *       top,
-                                              double *               x,
-                                              t_commrec *            cr,
-                                              void *                 work);
-
-int
-genborn_allvsall_calc_hct_obc_radii_sse2_double(t_forcerec *           fr,
-                                                t_mdatoms *            mdatoms,
-                                                gmx_genborn_t *        born,
-                                                int                    gb_algorithm,
-                                                gmx_localtop_t *       top,
-                                                double *               x,
-                                                t_commrec *            cr,
-                                                void *                 work);
-
-int
-genborn_allvsall_calc_chainrule_sse2_double(t_forcerec *           fr,
-                                            t_mdatoms *            mdatoms,
-                                            gmx_genborn_t *        born,
-                                            double *               x,
-                                            double *               f,
-                                            int                    gb_algorithm,
-                                            void *                 work);
-
-#endif
diff --git a/src/gromacs/mdlib/genborn_allvsall_sse2_single.c b/src/gromacs/mdlib/genborn_allvsall_sse2_single.c
deleted file mode 100644 (file)
index 8c3ce47..0000000
+++ /dev/null
@@ -1,3500 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2009, The GROMACS Development Team.
- * Copyright (c) 2012,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#include "gmxpre.h"
-
-#include <math.h>
-
-#include "gromacs/legacyheaders/genborn.h"
-#include "gromacs/legacyheaders/network.h"
-#include "gromacs/legacyheaders/types/simple.h"
-#include "gromacs/math/units.h"
-#include "gromacs/math/vec.h"
-#include "gromacs/mdlib/genborn_allvsall.h"
-#include "gromacs/utility/smalloc.h"
-
-#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
-
-#include <gmx_sse2_single.h>
-
-
-#define SIMD_WIDTH 4
-#define UNROLLI    4
-#define UNROLLJ    4
-
-
-typedef struct
-{
-    int *      jindex_gb;
-    int **     prologue_mask_gb;
-    int **     epilogue_mask;
-    int *      imask;
-    real *     gb_radius;
-    real *     workparam;
-    real *     work;
-    real *     x_align;
-    real *     y_align;
-    real *     z_align;
-    real *     fx_align;
-    real *     fy_align;
-    real *     fz_align;
-}
-gmx_allvsallgb2_data_t;
-
-
-static int
-calc_maxoffset(int i, int natoms)
-{
-    int maxoffset;
-
-    if ((natoms % 2) == 1)
-    {
-        /* Odd number of atoms, easy */
-        maxoffset = natoms/2;
-    }
-    else if ((natoms % 4) == 0)
-    {
-        /* Multiple of four is hard */
-        if (i < natoms/2)
-        {
-            if ((i % 2) == 0)
-            {
-                maxoffset = natoms/2;
-            }
-            else
-            {
-                maxoffset = natoms/2-1;
-            }
-        }
-        else
-        {
-            if ((i % 2) == 1)
-            {
-                maxoffset = natoms/2;
-            }
-            else
-            {
-                maxoffset = natoms/2-1;
-            }
-        }
-    }
-    else
-    {
-        /* natoms even but not a multiple of four, so natoms/2 is odd */
-        if ((i % 2) == 0)
-        {
-            maxoffset = natoms/2;
-        }
-        else
-        {
-            maxoffset = natoms/2-1;
-        }
-    }
-
-    return maxoffset;
-}
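
calc_maxoffset implements the usual all-vs-all "half-shell" trick: atom i interacts with the roughly natoms/2 atoms that follow it cyclically, and for even natoms the parity rules above ensure the pair exactly natoms/2 apart is owned by only one of its two atoms. For example, with natoms = 6 the offsets are 3, 2, 3, 2, 3, 2, summing to 15 = 6*5/2, i.e. every pair exactly once. A throwaway check (illustrative only):

/* Sums the per-atom offsets; with the scheme above this must equal
 * natoms*(natoms-1)/2, i.e. each unordered pair is visited once. */
static int count_halfshell_pairs(int natoms)
{
    int i, n = 0;

    for (i = 0; i < natoms; i++)
    {
        n += calc_maxoffset(i, natoms);
    }
    return n;
}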
-
-static void
-setup_gb_exclusions_and_indices(gmx_allvsallgb2_data_t     *   aadata,
-                                t_ilist     *                  ilist,
-                                int                            start,
-                                int                            end,
-                                int                            natoms,
-                                gmx_bool                       bInclude12,
-                                gmx_bool                       bInclude13,
-                                gmx_bool                       bInclude14)
-{
-    int   i, j, k, tp;
-    int   a1, a2;
-    int   ni0, ni1, nj0, nj1, nj;
-    int   imin, imax, iexcl;
-    int   max_offset;
-    int   max_excl_offset;
-    int   firstinteraction;
-    int   ibase;
-    int  *pi;
-
-    /* This routine can appear to be a bit complex, but it is mostly book-keeping.
-     * To enable the fast all-vs-all kernel we need to be able to stream through all
-     * coordinates, whether or not they should interact.
-     *
-     * To avoid looping over the exclusions, we create a simple mask that is 1 if the interaction
-     * should be present, otherwise 0. Since exclusions typically only occur when i & j are close,
-     * we create a jindex array with four elements per i atom: the start of the prologue (where
-     * exclusions must be checked), the start of the unmasked main part, the start of the
-     * epilogue, and the end point.
-     * This way we only have to allocate a short exclusion mask per i atom.
-     */
-
-    ni0 = (start/UNROLLI)*UNROLLI;
-    ni1 = ((end+UNROLLI-1)/UNROLLI)*UNROLLI;
-
-    /* Set the interaction mask to only enable the i atoms we want to include */
-    snew(pi, natoms+UNROLLI+2*SIMD_WIDTH);
-    aadata->imask = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
-    for (i = 0; i < natoms+UNROLLI; i++)
-    {
-        aadata->imask[i] = (i >= start && i < end) ? 0xFFFFFFFF : 0;
-    }
-
-    /* Allocate memory for our modified jindex array */
-    snew(aadata->jindex_gb, 4*(natoms+UNROLLI));
-    for (i = 0; i < 4*(natoms+UNROLLI); i++)
-    {
-        aadata->jindex_gb[i] = 0;
-    }
-
-    /* Create the exclusion masks for the prologue part */
-    snew(aadata->prologue_mask_gb, natoms+UNROLLI); /* list of pointers */
-
-    /* First zero everything to avoid uninitialized data */
-    for (i = 0; i < natoms+UNROLLI; i++)
-    {
-        aadata->prologue_mask_gb[i] = NULL;
-    }
-
-    /* Calculate the largest exclusion range we need for each UNROLLI-tuplet of i atoms. */
-    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
-    {
-        max_excl_offset = -1;
-
-        /* First find maxoffset for the next 4 atoms (or fewer if we are close to the end) */
-        imax = ((ibase+UNROLLI) < end) ? (ibase+UNROLLI) : end;
-
-        /* Which atom is the first we (might) interact with? */
-        imin = natoms; /* Guaranteed to be overwritten by one of the 'firstinteraction' values below */
-        for (i = ibase; i < imax; i++)
-        {
-            /* Before exclusions, which atom is the first we (might) interact with? */
-            firstinteraction = i+1;
-            max_offset       = calc_maxoffset(i, natoms);
-
-            if (!bInclude12)
-            {
-                for (j = 0; j < ilist[F_GB12].nr; j += 3)
-                {
-                    a1 = ilist[F_GB12].iatoms[j+1];
-                    a2 = ilist[F_GB12].iatoms[j+2];
-
-                    if (a1 == i)
-                    {
-                        k = a2;
-                    }
-                    else if (a2 == i)
-                    {
-                        k = a1;
-                    }
-                    else
-                    {
-                        continue;
-                    }
-
-                    if (k == firstinteraction)
-                    {
-                        firstinteraction++;
-                    }
-                }
-            }
-            if (!bInclude13)
-            {
-                for (j = 0; j < ilist[F_GB13].nr; j += 3)
-                {
-                    a1 = ilist[F_GB13].iatoms[j+1];
-                    a2 = ilist[F_GB13].iatoms[j+2];
-
-                    if (a1 == i)
-                    {
-                        k = a2;
-                    }
-                    else if (a2 == i)
-                    {
-                        k = a1;
-                    }
-                    else
-                    {
-                        continue;
-                    }
-
-                    if (k == firstinteraction)
-                    {
-                        firstinteraction++;
-                    }
-                }
-            }
-            if (!bInclude14)
-            {
-                for (j = 0; j < ilist[F_GB14].nr; j += 3)
-                {
-                    a1 = ilist[F_GB14].iatoms[j+1];
-                    a2 = ilist[F_GB14].iatoms[j+2];
-                    if (a1 == i)
-                    {
-                        k = a2;
-                    }
-                    else if (a2 == i)
-                    {
-                        k = a1;
-                    }
-                    else
-                    {
-                        continue;
-                    }
-
-                    if (k == firstinteraction)
-                    {
-                        firstinteraction++;
-                    }
-                }
-            }
-            imin = (firstinteraction < imin) ? firstinteraction : imin;
-        }
-        /* round down to j unrolling factor */
-        imin = (imin/UNROLLJ)*UNROLLJ;
-
-        for (i = ibase; i < imax; i++)
-        {
-            max_offset = calc_maxoffset(i, natoms);
-
-            if (!bInclude12)
-            {
-                for (j = 0; j < ilist[F_GB12].nr; j += 3)
-                {
-                    a1 = ilist[F_GB12].iatoms[j+1];
-                    a2 = ilist[F_GB12].iatoms[j+2];
-
-                    if (a1 == i)
-                    {
-                        k = a2;
-                    }
-                    else if (a2 == i)
-                    {
-                        k = a1;
-                    }
-                    else
-                    {
-                        continue;
-                    }
-
-                    if (k < imin)
-                    {
-                        k += natoms;
-                    }
-
-                    if (k > i+max_offset)
-                    {
-                        continue;
-                    }
-
-                    k = k - imin;
-
-                    if (k+natoms <= max_offset)
-                    {
-                        k += natoms;
-                    }
-                    max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
-                }
-            }
-            if (!bInclude13)
-            {
-                for (j = 0; j < ilist[F_GB13].nr; j += 3)
-                {
-                    a1 = ilist[F_GB13].iatoms[j+1];
-                    a2 = ilist[F_GB13].iatoms[j+2];
-
-                    if (a1 == i)
-                    {
-                        k = a2;
-                    }
-                    else if (a2 == i)
-                    {
-                        k = a1;
-                    }
-                    else
-                    {
-                        continue;
-                    }
-
-                    if (k < imin)
-                    {
-                        k += natoms;
-                    }
-
-                    if (k > i+max_offset)
-                    {
-                        continue;
-                    }
-
-                    k = k - imin;
-
-                    if (k+natoms <= max_offset)
-                    {
-                        k += natoms;
-                    }
-                    max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
-                }
-            }
-            if (!bInclude14)
-            {
-                for (j = 0; j < ilist[F_GB14].nr; j += 3)
-                {
-                    a1 = ilist[F_GB14].iatoms[j+1];
-                    a2 = ilist[F_GB14].iatoms[j+2];
-
-                    if (a1 == i)
-                    {
-                        k = a2;
-                    }
-                    else if (a2 == i)
-                    {
-                        k = a1;
-                    }
-                    else
-                    {
-                        continue;
-                    }
-
-                    if (k < imin)
-                    {
-                        k += natoms;
-                    }
-
-                    if (k > i+max_offset)
-                    {
-                        continue;
-                    }
-
-                    k = k - imin;
-
-                    if (k+natoms <= max_offset)
-                    {
-                        k += natoms;
-                    }
-                    max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
-                }
-            }
-        }
-
-        /* The offset specifies the last atom to be excluded, so add one unit to get an upper loop limit */
-        max_excl_offset++;
-        /* round up to j unrolling factor */
-        max_excl_offset = (max_excl_offset/UNROLLJ+1)*UNROLLJ;
-
-        /* Set the length of all prologue masks to this value (even for i > end) */
-        for (i = ibase; i < ibase+UNROLLI; i++)
-        {
-            aadata->jindex_gb[4*i]   = imin;
-            aadata->jindex_gb[4*i+1] = imin+max_excl_offset;
-        }
-    }
-
-    /* Now the hard part, loop over it all again to calculate the actual contents of the prologue masks */
-    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
-    {
-        for (i = ibase; i < ibase+UNROLLI; i++)
-        {
-            nj   = aadata->jindex_gb[4*i+1] - aadata->jindex_gb[4*i];
-            imin = aadata->jindex_gb[4*i];
-
-            /* Allocate aligned memory */
-            snew(pi, nj+2*SIMD_WIDTH);
-            aadata->prologue_mask_gb[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
-
-            max_offset = calc_maxoffset(i, natoms);
-
-            /* Include interactions i+1 <= j < i+maxoffset */
-            for (k = 0; k < nj; k++)
-            {
-                j = imin + k;
-
-                if ( (j > i) && (j <= i+max_offset) )
-                {
-                    aadata->prologue_mask_gb[i][k] = 0xFFFFFFFF;
-                }
-                else
-                {
-                    aadata->prologue_mask_gb[i][k] = 0;
-                }
-            }
-
-            /* Clear out the explicit exclusions */
-            if (i < end)
-            {
-                if (!bInclude12)
-                {
-                    for (j = 0; j < ilist[F_GB12].nr; j += 3)
-                    {
-                        a1 = ilist[F_GB12].iatoms[j+1];
-                        a2 = ilist[F_GB12].iatoms[j+2];
-
-                        if (a1 == i)
-                        {
-                            k = a2;
-                        }
-                        else if (a2 == i)
-                        {
-                            k = a1;
-                        }
-                        else
-                        {
-                            continue;
-                        }
-
-                        if (k > i+max_offset)
-                        {
-                            continue;
-                        }
-                        k = k-i;
-
-                        if (k+natoms <= max_offset)
-                        {
-                            k += natoms;
-                        }
-
-                        k = k+i-imin;
-                        if (k >= 0)
-                        {
-                            aadata->prologue_mask_gb[i][k] = 0;
-                        }
-                    }
-                }
-                if (!bInclude13)
-                {
-                    for (j = 0; j < ilist[F_GB13].nr; j += 3)
-                    {
-                        a1 = ilist[F_GB13].iatoms[j+1];
-                        a2 = ilist[F_GB13].iatoms[j+2];
-
-                        if (a1 == i)
-                        {
-                            k = a2;
-                        }
-                        else if (a2 == i)
-                        {
-                            k = a1;
-                        }
-                        else
-                        {
-                            continue;
-                        }
-
-                        if (k > i+max_offset)
-                        {
-                            continue;
-                        }
-                        k = k-i;
-
-                        if (k+natoms <= max_offset)
-                        {
-                            k += natoms;
-                        }
-
-                        k = k+i-imin;
-                        if (k >= 0)
-                        {
-                            aadata->prologue_mask_gb[i][k] = 0;
-                        }
-                    }
-                }
-                if (!bInclude14)
-                {
-                    for (j = 0; j < ilist[F_GB14].nr; j += 3)
-                    {
-                        a1 = ilist[F_GB14].iatoms[j+1];
-                        a2 = ilist[F_GB14].iatoms[j+2];
-
-                        if (a1 == i)
-                        {
-                            k = a2;
-                        }
-                        else if (a2 == i)
-                        {
-                            k = a1;
-                        }
-                        else
-                        {
-                            continue;
-                        }
-
-                        if (k > i+max_offset)
-                        {
-                            continue;
-                        }
-                        k = k-i;
-
-                        if (k+natoms <= max_offset)
-                        {
-                            k += natoms;
-                        }
-
-                        k = k+i-imin;
-                        if (k >= 0)
-                        {
-                            aadata->prologue_mask_gb[i][k] = 0;
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-    /* Construct the epilogue mask - this just contains the check for maxoffset */
-    snew(aadata->epilogue_mask, natoms+UNROLLI);
-
-    /* First zero everything to avoid uninitialized data */
-    for (i = 0; i < natoms+UNROLLI; i++)
-    {
-        aadata->jindex_gb[4*i+2]    = aadata->jindex_gb[4*i+1];
-        aadata->jindex_gb[4*i+3]    = aadata->jindex_gb[4*i+1];
-        aadata->epilogue_mask[i]    = NULL;
-    }
-
-    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
-    {
-        /* Find the lowest index for which we need to use the epilogue */
-        imin       = ibase;
-        max_offset = calc_maxoffset(imin, natoms);
-
-        imin = imin + 1 + max_offset;
-
-        /* Find largest index for which we need to use the epilogue */
-        imax = ibase + UNROLLI-1;
-        imax = (imax < end) ? imax : end;
-
-        max_offset = calc_maxoffset(imax, natoms);
-        imax       = imax + 1 + max_offset + UNROLLJ - 1;
-
-        for (i = ibase; i < ibase+UNROLLI; i++)
-        {
-            /* Start of epilogue - round down to j tile limit */
-            aadata->jindex_gb[4*i+2] = (imin/UNROLLJ)*UNROLLJ;
-            /* Make sure we don't overlap - for small systems everything is done in the prologue */
-            aadata->jindex_gb[4*i+2] = (aadata->jindex_gb[4*i+1] > aadata->jindex_gb[4*i+2]) ? aadata->jindex_gb[4*i+1] : aadata->jindex_gb[4*i+2];
-            /* Round upwards to j tile limit */
-            aadata->jindex_gb[4*i+3] = (imax/UNROLLJ)*UNROLLJ;
-            /* Make sure we don't have a negative range for the epilogue */
-            aadata->jindex_gb[4*i+3] = (aadata->jindex_gb[4*i+2] > aadata->jindex_gb[4*i+3]) ? aadata->jindex_gb[4*i+2] : aadata->jindex_gb[4*i+3];
-        }
-    }
-
-    /* And fill it with data... */
-
-    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
-    {
-        for (i = ibase; i < ibase+UNROLLI; i++)
-        {
-
-            nj = aadata->jindex_gb[4*i+3] - aadata->jindex_gb[4*i+2];
-
-            /* Allocate aligned memory */
-            snew(pi, nj+2*SIMD_WIDTH);
-            aadata->epilogue_mask[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
-
-            max_offset = calc_maxoffset(i, natoms);
-
-            for (k = 0; k < nj; k++)
-            {
-                j = aadata->jindex_gb[4*i+2] + k;
-                aadata->epilogue_mask[i][k] = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
-            }
-        }
-    }
-}
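
The four jindex entries built here delimit the three j ranges each kernel walks per i atom: a prologue with per-pair exclusion masks, an unmasked main part, and an epilogue masked only on the maxoffset boundary. An illustrative consumer of the layout (a sketch, not the actual kernel):

/* Sketch of how the radii/chain-rule kernels consume jindex_gb:
 * [4i]..[4i+1] prologue, [4i+1]..[4i+2] main, [4i+2]..[4i+3] epilogue. */
static void walk_j_ranges(const gmx_allvsallgb2_data_t *aadata, int i)
{
    const int *jindex = aadata->jindex_gb;
    int        j;

    for (j = jindex[4*i]; j < jindex[4*i+1]; j += UNROLLJ)
    {
        /* prologue: AND each interaction with aadata->prologue_mask_gb[i] */
    }
    for (j = jindex[4*i+1]; j < jindex[4*i+2]; j += UNROLLJ)
    {
        /* main part: no mask needed */
    }
    for (j = jindex[4*i+2]; j < jindex[4*i+3]; j += UNROLLJ)
    {
        /* epilogue: AND each interaction with aadata->epilogue_mask[i] */
    }
}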
-
-
-static void
-genborn_allvsall_setup(gmx_allvsallgb2_data_t     **  p_aadata,
-                       gmx_localtop_t     *           top,
-                       gmx_genborn_t     *            born,
-                       t_mdatoms     *                mdatoms,
-                       real                           radius_offset,
-                       int                            gb_algorithm,
-                       gmx_bool                       bInclude12,
-                       gmx_bool                       bInclude13,
-                       gmx_bool                       bInclude14)
-{
-    int                     i, j, idx;
-    int                     natoms;
-    gmx_allvsallgb2_data_t *aadata;
-    real                   *p;
-
-    natoms = mdatoms->nr;
-
-    snew(aadata, 1);
-    *p_aadata = aadata;
-
-    snew(p, 2*natoms+2*SIMD_WIDTH);
-    aadata->x_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
-    snew(p, 2*natoms+2*SIMD_WIDTH);
-    aadata->y_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
-    snew(p, 2*natoms+2*SIMD_WIDTH);
-    aadata->z_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
-    snew(p, 2*natoms+2*SIMD_WIDTH);
-    aadata->fx_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
-    snew(p, 2*natoms+2*SIMD_WIDTH);
-    aadata->fy_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
-    snew(p, 2*natoms+2*SIMD_WIDTH);
-    aadata->fz_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
-
-    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
-    aadata->gb_radius = (real *) (((size_t) p + 16) & (~((size_t) 15)));
-
-    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
-    aadata->workparam = (real *) (((size_t) p + 16) & (~((size_t) 15)));
-
-    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
-    aadata->work = (real *) (((size_t) p + 16) & (~((size_t) 15)));
-
-    for (i = 0; i < mdatoms->nr; i++)
-    {
-        aadata->gb_radius[i] = top->atomtypes.gb_radius[mdatoms->typeA[i]] - radius_offset;
-        if (gb_algorithm == egbSTILL)
-        {
-            aadata->workparam[i] = born->vsolv[i];
-        }
-        else if (gb_algorithm == egbOBC)
-        {
-            aadata->workparam[i] = born->param[i];
-        }
-        aadata->work[i]      = 0.0;
-    }
-    for (i = 0; i < mdatoms->nr; i++)
-    {
-        aadata->gb_radius[natoms+i] = aadata->gb_radius[i];
-        aadata->workparam[natoms+i] = aadata->workparam[i];
-        aadata->work[natoms+i]      = aadata->work[i];
-    }
-
-    for (i = 0; i < 2*natoms+SIMD_WIDTH; i++)
-    {
-        aadata->x_align[i]  = 0.0;
-        aadata->y_align[i]  = 0.0;
-        aadata->z_align[i]  = 0.0;
-        aadata->fx_align[i] = 0.0;
-        aadata->fy_align[i] = 0.0;
-        aadata->fz_align[i] = 0.0;
-    }
-
-    setup_gb_exclusions_and_indices(aadata, top->idef.il, 0, mdatoms->homenr, mdatoms->nr,
-                                    bInclude12, bInclude13, bInclude14);
-}
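
Each snew above over-allocates by 2*SIMD_WIDTH elements so the pointer can be rounded up to a 16-byte boundary with the cast-and-mask idiom. In isolation it looks like this (a sketch; the original uses size_t where uintptr_t would be the strictly portable choice):

#include <stdint.h>

/* Advance p past the next 16-byte boundary and clear the low four
 * bits, yielding an address aligned for SSE loads/stores. */
static void *align16(void *p)
{
    return (void *) (((uintptr_t) p + 16) & ~(uintptr_t) 15);
}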
-
-
-int
-genborn_allvsall_calc_still_radii_sse2_single(t_forcerec *           fr,
-                                              t_mdatoms *            mdatoms,
-                                              gmx_genborn_t *        born,
-                                              gmx_localtop_t *       top,
-                                              real *                 x,
-                                              t_commrec *            cr,
-                                              void *                 paadata)
-{
-    gmx_allvsallgb2_data_t *aadata;
-    int                     natoms;
-    int                     ni0, ni1;
-    int                     nj0, nj1, nj2, nj3;
-    int                     i, j, k, n;
-    int              *      mask;
-    int              *      pmask0;
-    int              *      pmask1;
-    int              *      pmask2;
-    int              *      pmask3;
-    int              *      emask0;
-    int              *      emask1;
-    int              *      emask2;
-    int              *      emask3;
-    real                    ix, iy, iz;
-    real                    jx, jy, jz;
-    real                    dx, dy, dz;
-    real                    rsq, rinv;
-    real                    gpi, rai, vai;
-    real                    prod_ai;
-    real                    irsq, idr4, idr6;
-    real                    raj, rvdw, ratio;
-    real                    vaj, ccf, dccf, theta, cosq;
-    real                    term, prod, icf4, icf6, gpi2, factor, sinq;
-    real              *     gb_radius;
-    real              *     vsolv;
-    real              *     work;
-    real                    tmpsum[4];
-    real              *     x_align;
-    real              *     y_align;
-    real              *     z_align;
-    int              *      jindex;
-    real              *     dadx;
-
-    __m128                  ix_SSE0, iy_SSE0, iz_SSE0;
-    __m128                  ix_SSE1, iy_SSE1, iz_SSE1;
-    __m128                  ix_SSE2, iy_SSE2, iz_SSE2;
-    __m128                  ix_SSE3, iy_SSE3, iz_SSE3;
-    __m128                  gpi_SSE0, rai_SSE0, prod_ai_SSE0;
-    __m128                  gpi_SSE1, rai_SSE1, prod_ai_SSE1;
-    __m128                  gpi_SSE2, rai_SSE2, prod_ai_SSE2;
-    __m128                  gpi_SSE3, rai_SSE3, prod_ai_SSE3;
-    __m128                  imask_SSE0, jmask_SSE0;
-    __m128                  imask_SSE1, jmask_SSE1;
-    __m128                  imask_SSE2, jmask_SSE2;
-    __m128                  imask_SSE3, jmask_SSE3;
-    __m128                  jx_SSE, jy_SSE, jz_SSE;
-    __m128                  dx_SSE0, dy_SSE0, dz_SSE0;
-    __m128                  dx_SSE1, dy_SSE1, dz_SSE1;
-    __m128                  dx_SSE2, dy_SSE2, dz_SSE2;
-    __m128                  dx_SSE3, dy_SSE3, dz_SSE3;
-    __m128                  rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0;
-    __m128                  rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1;
-    __m128                  rsq_SSE2, rinv_SSE2, irsq_SSE2, idr4_SSE2, idr6_SSE2;
-    __m128                  rsq_SSE3, rinv_SSE3, irsq_SSE3, idr4_SSE3, idr6_SSE3;
-    __m128                  raj_SSE, vaj_SSE, prod_SSE;
-    __m128                  rvdw_SSE0, ratio_SSE0;
-    __m128                  rvdw_SSE1, ratio_SSE1;
-    __m128                  rvdw_SSE2, ratio_SSE2;
-    __m128                  rvdw_SSE3, ratio_SSE3;
-    __m128                  theta_SSE0, sinq_SSE0, cosq_SSE0, term_SSE0;
-    __m128                  theta_SSE1, sinq_SSE1, cosq_SSE1, term_SSE1;
-    __m128                  theta_SSE2, sinq_SSE2, cosq_SSE2, term_SSE2;
-    __m128                  theta_SSE3, sinq_SSE3, cosq_SSE3, term_SSE3;
-    __m128                  ccf_SSE0, dccf_SSE0;
-    __m128                  ccf_SSE1, dccf_SSE1;
-    __m128                  ccf_SSE2, dccf_SSE2;
-    __m128                  ccf_SSE3, dccf_SSE3;
-    __m128                  icf4_SSE0, icf6_SSE0;
-    __m128                  icf4_SSE1, icf6_SSE1;
-    __m128                  icf4_SSE2, icf6_SSE2;
-    __m128                  icf4_SSE3, icf6_SSE3;
-    __m128                  half_SSE, one_SSE, two_SSE, four_SSE;
-    __m128                  still_p4_SSE, still_p5inv_SSE, still_pip5_SSE;
-
-    natoms              = mdatoms->nr;
-    ni0                 = 0;
-    ni1                 = mdatoms->homenr;
-
-    n = 0;
-
-    aadata = *((gmx_allvsallgb2_data_t **)paadata);
-
-
-    if (aadata == NULL)
-    {
-        genborn_allvsall_setup(&aadata, top, born, mdatoms, 0.0,
-                               egbSTILL, FALSE, FALSE, TRUE);
-        *((gmx_allvsallgb2_data_t **)paadata) = aadata;
-    }
-
-    x_align = aadata->x_align;
-    y_align = aadata->y_align;
-    z_align = aadata->z_align;
-
-    gb_radius = aadata->gb_radius;
-    vsolv     = aadata->workparam;
-    work      = aadata->work;
-    jindex    = aadata->jindex_gb;
-    dadx      = fr->dadx;
-
-    still_p4_SSE    = _mm_set1_ps(STILL_P4);
-    still_p5inv_SSE = _mm_set1_ps(STILL_P5INV);
-    still_pip5_SSE  = _mm_set1_ps(STILL_PIP5);
-    half_SSE        = _mm_set1_ps(0.5);
-    one_SSE         = _mm_set1_ps(1.0);
-    two_SSE         = _mm_set1_ps(2.0);
-    four_SSE        = _mm_set1_ps(4.0);
-
-    /* This will be summed, so it has to extend to natoms + buffer */
-    for (i = 0; i < natoms+1+natoms/2; i++)
-    {
-        work[i] = 0;
-    }
-
-    for (i = ni0; i < ni1+1+natoms/2; i++)
-    {
-        k           = i%natoms;
-        x_align[i]  = x[3*k];
-        y_align[i]  = x[3*k+1];
-        z_align[i]  = x[3*k+2];
-        work[i]     = 0;
-    }
-
-
-    for (i = ni0; i < ni1; i += UNROLLI)
-    {
-        /* We assume shifts are NOT used for all-vs-all interactions */
-
-        /* Load i atom data */
-        ix_SSE0          = _mm_load1_ps(x_align+i);
-        iy_SSE0          = _mm_load1_ps(y_align+i);
-        iz_SSE0          = _mm_load1_ps(z_align+i);
-        ix_SSE1          = _mm_load1_ps(x_align+i+1);
-        iy_SSE1          = _mm_load1_ps(y_align+i+1);
-        iz_SSE1          = _mm_load1_ps(z_align+i+1);
-        ix_SSE2          = _mm_load1_ps(x_align+i+2);
-        iy_SSE2          = _mm_load1_ps(y_align+i+2);
-        iz_SSE2          = _mm_load1_ps(z_align+i+2);
-        ix_SSE3          = _mm_load1_ps(x_align+i+3);
-        iy_SSE3          = _mm_load1_ps(y_align+i+3);
-        iz_SSE3          = _mm_load1_ps(z_align+i+3);
-
-        gpi_SSE0         = _mm_setzero_ps();
-        gpi_SSE1         = _mm_setzero_ps();
-        gpi_SSE2         = _mm_setzero_ps();
-        gpi_SSE3         = _mm_setzero_ps();
-
-        rai_SSE0         = _mm_load1_ps(gb_radius+i);
-        rai_SSE1         = _mm_load1_ps(gb_radius+i+1);
-        rai_SSE2         = _mm_load1_ps(gb_radius+i+2);
-        rai_SSE3         = _mm_load1_ps(gb_radius+i+3);
-
-        prod_ai_SSE0     = _mm_set1_ps(STILL_P4*vsolv[i]);
-        prod_ai_SSE1     = _mm_set1_ps(STILL_P4*vsolv[i+1]);
-        prod_ai_SSE2     = _mm_set1_ps(STILL_P4*vsolv[i+2]);
-        prod_ai_SSE3     = _mm_set1_ps(STILL_P4*vsolv[i+3]);
-
-        /* Load limits for loop over neighbors */
-        nj0              = jindex[4*i];
-        nj1              = jindex[4*i+1];
-        nj2              = jindex[4*i+2];
-        nj3              = jindex[4*i+3];
-
-        pmask0           = aadata->prologue_mask_gb[i];
-        pmask1           = aadata->prologue_mask_gb[i+1];
-        pmask2           = aadata->prologue_mask_gb[i+2];
-        pmask3           = aadata->prologue_mask_gb[i+3];
-        emask0           = aadata->epilogue_mask[i];
-        emask1           = aadata->epilogue_mask[i+1];
-        emask2           = aadata->epilogue_mask[i+2];
-        emask3           = aadata->epilogue_mask[i+3];
-
-        imask_SSE0        = _mm_load1_ps((real *)(aadata->imask+i));
-        imask_SSE1        = _mm_load1_ps((real *)(aadata->imask+i+1));
-        imask_SSE2        = _mm_load1_ps((real *)(aadata->imask+i+2));
-        imask_SSE3        = _mm_load1_ps((real *)(aadata->imask+i+3));
-
-        /* Prologue part, including exclusion mask */
-        for (j = nj0; j < nj1; j += UNROLLJ)
-        {
-            jmask_SSE0 = _mm_load_ps((real *)pmask0);
-            jmask_SSE1 = _mm_load_ps((real *)pmask1);
-            jmask_SSE2 = _mm_load_ps((real *)pmask2);
-            jmask_SSE3 = _mm_load_ps((real *)pmask3);
-            pmask0    += UNROLLJ;
-            pmask1    += UNROLLJ;
-            pmask2    += UNROLLJ;
-            pmask3    += UNROLLJ;
-
-            /* load j atom coordinates */
-            jx_SSE            = _mm_load_ps(x_align+j);
-            jy_SSE            = _mm_load_ps(y_align+j);
-            jz_SSE            = _mm_load_ps(z_align+j);
-
-            /* Calculate distance */
-            dx_SSE0            = _mm_sub_ps(ix_SSE0, jx_SSE);
-            dy_SSE0            = _mm_sub_ps(iy_SSE0, jy_SSE);
-            dz_SSE0            = _mm_sub_ps(iz_SSE0, jz_SSE);
-            dx_SSE1            = _mm_sub_ps(ix_SSE1, jx_SSE);
-            dy_SSE1            = _mm_sub_ps(iy_SSE1, jy_SSE);
-            dz_SSE1            = _mm_sub_ps(iz_SSE1, jz_SSE);
-            dx_SSE2            = _mm_sub_ps(ix_SSE2, jx_SSE);
-            dy_SSE2            = _mm_sub_ps(iy_SSE2, jy_SSE);
-            dz_SSE2            = _mm_sub_ps(iz_SSE2, jz_SSE);
-            dx_SSE3            = _mm_sub_ps(ix_SSE3, jx_SSE);
-            dy_SSE3            = _mm_sub_ps(iy_SSE3, jy_SSE);
-            dz_SSE3            = _mm_sub_ps(iz_SSE3, jz_SSE);
-
-            /* rsq = dx*dx+dy*dy+dz*dz */
-            rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
-            rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
-            rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
-            rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
-
-            /* Combine masks */
-            jmask_SSE0         = _mm_and_ps(jmask_SSE0, imask_SSE0);
-            jmask_SSE1         = _mm_and_ps(jmask_SSE1, imask_SSE1);
-            jmask_SSE2         = _mm_and_ps(jmask_SSE2, imask_SSE2);
-            jmask_SSE3         = _mm_and_ps(jmask_SSE3, imask_SSE3);
-
-            /* Calculate 1/r and 1/r2 */
-            rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
-            rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
-            rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
-            rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
-
-            /* Apply mask */
-            rinv_SSE0          = _mm_and_ps(rinv_SSE0, jmask_SSE0);
-            rinv_SSE1          = _mm_and_ps(rinv_SSE1, jmask_SSE1);
-            rinv_SSE2          = _mm_and_ps(rinv_SSE2, jmask_SSE2);
-            rinv_SSE3          = _mm_and_ps(rinv_SSE3, jmask_SSE3);
-
-            irsq_SSE0          = _mm_mul_ps(rinv_SSE0, rinv_SSE0);
-            irsq_SSE1          = _mm_mul_ps(rinv_SSE1, rinv_SSE1);
-            irsq_SSE2          = _mm_mul_ps(rinv_SSE2, rinv_SSE2);
-            irsq_SSE3          = _mm_mul_ps(rinv_SSE3, rinv_SSE3);
-            idr4_SSE0          = _mm_mul_ps(irsq_SSE0, irsq_SSE0);
-            idr4_SSE1          = _mm_mul_ps(irsq_SSE1, irsq_SSE1);
-            idr4_SSE2          = _mm_mul_ps(irsq_SSE2, irsq_SSE2);
-            idr4_SSE3          = _mm_mul_ps(irsq_SSE3, irsq_SSE3);
-            idr6_SSE0          = _mm_mul_ps(idr4_SSE0, irsq_SSE0);
-            idr6_SSE1          = _mm_mul_ps(idr4_SSE1, irsq_SSE1);
-            idr6_SSE2          = _mm_mul_ps(idr4_SSE2, irsq_SSE2);
-            idr6_SSE3          = _mm_mul_ps(idr4_SSE3, irsq_SSE3);
-
-            raj_SSE            = _mm_load_ps(gb_radius+j);
-            vaj_SSE            = _mm_load_ps(vsolv+j);
-
-            rvdw_SSE0          = _mm_add_ps(rai_SSE0, raj_SSE);
-            rvdw_SSE1          = _mm_add_ps(rai_SSE1, raj_SSE);
-            rvdw_SSE2          = _mm_add_ps(rai_SSE2, raj_SSE);
-            rvdw_SSE3          = _mm_add_ps(rai_SSE3, raj_SSE);
-
-            ratio_SSE0         = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0, rvdw_SSE0)));
-            ratio_SSE1         = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1, rvdw_SSE1)));
-            ratio_SSE2         = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2, rvdw_SSE2)));
-            ratio_SSE3         = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3, rvdw_SSE3)));
-
-            ratio_SSE0         = _mm_min_ps(ratio_SSE0, still_p5inv_SSE);
-            ratio_SSE1         = _mm_min_ps(ratio_SSE1, still_p5inv_SSE);
-            ratio_SSE2         = _mm_min_ps(ratio_SSE2, still_p5inv_SSE);
-            ratio_SSE3         = _mm_min_ps(ratio_SSE3, still_p5inv_SSE);
-            theta_SSE0         = _mm_mul_ps(ratio_SSE0, still_pip5_SSE);
-            theta_SSE1         = _mm_mul_ps(ratio_SSE1, still_pip5_SSE);
-            theta_SSE2         = _mm_mul_ps(ratio_SSE2, still_pip5_SSE);
-            theta_SSE3         = _mm_mul_ps(ratio_SSE3, still_pip5_SSE);
-            gmx_mm_sincos_ps(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
-            gmx_mm_sincos_ps(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
-            gmx_mm_sincos_ps(theta_SSE2, &sinq_SSE2, &cosq_SSE2);
-            gmx_mm_sincos_ps(theta_SSE3, &sinq_SSE3, &cosq_SSE3);
-            term_SSE0          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE0));
-            term_SSE1          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE1));
-            term_SSE2          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE2));
-            term_SSE3          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE3));
-            ccf_SSE0           = _mm_mul_ps(term_SSE0, term_SSE0);
-            ccf_SSE1           = _mm_mul_ps(term_SSE1, term_SSE1);
-            ccf_SSE2           = _mm_mul_ps(term_SSE2, term_SSE2);
-            ccf_SSE3           = _mm_mul_ps(term_SSE3, term_SSE3);
-            dccf_SSE0          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE0),
-                                            _mm_mul_ps(sinq_SSE0, theta_SSE0));
-            dccf_SSE1          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE1),
-                                            _mm_mul_ps(sinq_SSE1, theta_SSE1));
-            dccf_SSE2          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE2),
-                                            _mm_mul_ps(sinq_SSE2, theta_SSE2));
-            dccf_SSE3          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE3),
-                                            _mm_mul_ps(sinq_SSE3, theta_SSE3));
-
-            prod_SSE           = _mm_mul_ps(still_p4_SSE, vaj_SSE);
-            icf4_SSE0          = _mm_mul_ps(ccf_SSE0, idr4_SSE0);
-            icf4_SSE1          = _mm_mul_ps(ccf_SSE1, idr4_SSE1);
-            icf4_SSE2          = _mm_mul_ps(ccf_SSE2, idr4_SSE2);
-            icf4_SSE3          = _mm_mul_ps(ccf_SSE3, idr4_SSE3);
-            icf6_SSE0          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
-            icf6_SSE1          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);
-            icf6_SSE2          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE2), dccf_SSE2), idr6_SSE2);
-            icf6_SSE3          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE3), dccf_SSE3), idr6_SSE3);
-
-            _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
-                                            gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0, icf4_SSE0),
-                                                           _mm_mul_ps(prod_ai_SSE1, icf4_SSE1),
-                                                           _mm_mul_ps(prod_ai_SSE2, icf4_SSE2),
-                                                           _mm_mul_ps(prod_ai_SSE3, icf4_SSE3))));
-
-            gpi_SSE0           = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE, icf4_SSE0));
-            gpi_SSE1           = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE, icf4_SSE1));
-            gpi_SSE2           = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE, icf4_SSE2));
-            gpi_SSE3           = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE, icf4_SSE3));
-
-            /* Save ai->aj and aj->ai chain rule terms */
-            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE0));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE1));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE2));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE3));
-            dadx += 4;
-
-            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE0, icf6_SSE0));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE1, icf6_SSE1));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE2, icf6_SSE2));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE3, icf6_SSE3));
-            dadx += 4;
-        }
-
-        /* Main part, no exclusions */
-        for (j = nj1; j < nj2; j += UNROLLJ)
-        {
-            /* load j atom coordinates */
-            jx_SSE            = _mm_load_ps(x_align+j);
-            jy_SSE            = _mm_load_ps(y_align+j);
-            jz_SSE            = _mm_load_ps(z_align+j);
-
-            /* Calculate distance */
-            dx_SSE0            = _mm_sub_ps(ix_SSE0, jx_SSE);
-            dy_SSE0            = _mm_sub_ps(iy_SSE0, jy_SSE);
-            dz_SSE0            = _mm_sub_ps(iz_SSE0, jz_SSE);
-            dx_SSE1            = _mm_sub_ps(ix_SSE1, jx_SSE);
-            dy_SSE1            = _mm_sub_ps(iy_SSE1, jy_SSE);
-            dz_SSE1            = _mm_sub_ps(iz_SSE1, jz_SSE);
-            dx_SSE2            = _mm_sub_ps(ix_SSE2, jx_SSE);
-            dy_SSE2            = _mm_sub_ps(iy_SSE2, jy_SSE);
-            dz_SSE2            = _mm_sub_ps(iz_SSE2, jz_SSE);
-            dx_SSE3            = _mm_sub_ps(ix_SSE3, jx_SSE);
-            dy_SSE3            = _mm_sub_ps(iy_SSE3, jy_SSE);
-            dz_SSE3            = _mm_sub_ps(iz_SSE3, jz_SSE);
-
-            /* rsq = dx*dx+dy*dy+dz*dz */
-            rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
-            rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
-            rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
-            rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
-
-            /* Calculate 1/r and 1/r2 */
-            rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
-            rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
-            rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
-            rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
-
-            /* Apply mask */
-            rinv_SSE0          = _mm_and_ps(rinv_SSE0, imask_SSE0);
-            rinv_SSE1          = _mm_and_ps(rinv_SSE1, imask_SSE1);
-            rinv_SSE2          = _mm_and_ps(rinv_SSE2, imask_SSE2);
-            rinv_SSE3          = _mm_and_ps(rinv_SSE3, imask_SSE3);
-
-            irsq_SSE0          = _mm_mul_ps(rinv_SSE0, rinv_SSE0);
-            irsq_SSE1          = _mm_mul_ps(rinv_SSE1, rinv_SSE1);
-            irsq_SSE2          = _mm_mul_ps(rinv_SSE2, rinv_SSE2);
-            irsq_SSE3          = _mm_mul_ps(rinv_SSE3, rinv_SSE3);
-            idr4_SSE0          = _mm_mul_ps(irsq_SSE0, irsq_SSE0);
-            idr4_SSE1          = _mm_mul_ps(irsq_SSE1, irsq_SSE1);
-            idr4_SSE2          = _mm_mul_ps(irsq_SSE2, irsq_SSE2);
-            idr4_SSE3          = _mm_mul_ps(irsq_SSE3, irsq_SSE3);
-            idr6_SSE0          = _mm_mul_ps(idr4_SSE0, irsq_SSE0);
-            idr6_SSE1          = _mm_mul_ps(idr4_SSE1, irsq_SSE1);
-            idr6_SSE2          = _mm_mul_ps(idr4_SSE2, irsq_SSE2);
-            idr6_SSE3          = _mm_mul_ps(idr4_SSE3, irsq_SSE3);
-
-            raj_SSE            = _mm_load_ps(gb_radius+j);
-
-            rvdw_SSE0          = _mm_add_ps(rai_SSE0, raj_SSE);
-            rvdw_SSE1          = _mm_add_ps(rai_SSE1, raj_SSE);
-            rvdw_SSE2          = _mm_add_ps(rai_SSE2, raj_SSE);
-            rvdw_SSE3          = _mm_add_ps(rai_SSE3, raj_SSE);
-            vaj_SSE            = _mm_load_ps(vsolv+j);
-
-            ratio_SSE0         = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0, rvdw_SSE0)));
-            ratio_SSE1         = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1, rvdw_SSE1)));
-            ratio_SSE2         = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2, rvdw_SSE2)));
-            ratio_SSE3         = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3, rvdw_SSE3)));
-
-            ratio_SSE0         = _mm_min_ps(ratio_SSE0, still_p5inv_SSE);
-            ratio_SSE1         = _mm_min_ps(ratio_SSE1, still_p5inv_SSE);
-            ratio_SSE2         = _mm_min_ps(ratio_SSE2, still_p5inv_SSE);
-            ratio_SSE3         = _mm_min_ps(ratio_SSE3, still_p5inv_SSE);
-            theta_SSE0         = _mm_mul_ps(ratio_SSE0, still_pip5_SSE);
-            theta_SSE1         = _mm_mul_ps(ratio_SSE1, still_pip5_SSE);
-            theta_SSE2         = _mm_mul_ps(ratio_SSE2, still_pip5_SSE);
-            theta_SSE3         = _mm_mul_ps(ratio_SSE3, still_pip5_SSE);
-            gmx_mm_sincos_ps(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
-            gmx_mm_sincos_ps(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
-            gmx_mm_sincos_ps(theta_SSE2, &sinq_SSE2, &cosq_SSE2);
-            gmx_mm_sincos_ps(theta_SSE3, &sinq_SSE3, &cosq_SSE3);
-            term_SSE0          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE0));
-            term_SSE1          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE1));
-            term_SSE2          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE2));
-            term_SSE3          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE3));
-            ccf_SSE0           = _mm_mul_ps(term_SSE0, term_SSE0);
-            ccf_SSE1           = _mm_mul_ps(term_SSE1, term_SSE1);
-            ccf_SSE2           = _mm_mul_ps(term_SSE2, term_SSE2);
-            ccf_SSE3           = _mm_mul_ps(term_SSE3, term_SSE3);
-            dccf_SSE0          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE0),
-                                            _mm_mul_ps(sinq_SSE0, theta_SSE0));
-            dccf_SSE1          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE1),
-                                            _mm_mul_ps(sinq_SSE1, theta_SSE1));
-            dccf_SSE2          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE2),
-                                            _mm_mul_ps(sinq_SSE2, theta_SSE2));
-            dccf_SSE3          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE3),
-                                            _mm_mul_ps(sinq_SSE3, theta_SSE3));
-
-            prod_SSE           = _mm_mul_ps(still_p4_SSE, vaj_SSE);
-            icf4_SSE0          = _mm_mul_ps(ccf_SSE0, idr4_SSE0);
-            icf4_SSE1          = _mm_mul_ps(ccf_SSE1, idr4_SSE1);
-            icf4_SSE2          = _mm_mul_ps(ccf_SSE2, idr4_SSE2);
-            icf4_SSE3          = _mm_mul_ps(ccf_SSE3, idr4_SSE3);
-            icf6_SSE0          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
-            icf6_SSE1          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);
-            icf6_SSE2          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE2), dccf_SSE2), idr6_SSE2);
-            icf6_SSE3          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE3), dccf_SSE3), idr6_SSE3);
-
-            _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
-                                            gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0, icf4_SSE0),
-                                                           _mm_mul_ps(prod_ai_SSE1, icf4_SSE1),
-                                                           _mm_mul_ps(prod_ai_SSE2, icf4_SSE2),
-                                                           _mm_mul_ps(prod_ai_SSE3, icf4_SSE3))));
-
-            gpi_SSE0           = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE, icf4_SSE0));
-            gpi_SSE1           = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE, icf4_SSE1));
-            gpi_SSE2           = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE, icf4_SSE2));
-            gpi_SSE3           = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE, icf4_SSE3));
-
-            /* Save ai->aj and aj->ai chain rule terms */
-            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE0));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE1));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE2));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE3));
-            dadx += 4;
-
-            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE0, icf6_SSE0));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE1, icf6_SSE1));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE2, icf6_SSE2));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE3, icf6_SSE3));
-            dadx += 4;
-        }
-        /* Epilogue part, including exclusion mask */
-        for (j = nj2; j < nj3; j += UNROLLJ)
-        {
-            jmask_SSE0 = _mm_load_ps((real *)emask0);
-            jmask_SSE1 = _mm_load_ps((real *)emask1);
-            jmask_SSE2 = _mm_load_ps((real *)emask2);
-            jmask_SSE3 = _mm_load_ps((real *)emask3);
-            emask0    += UNROLLJ;
-            emask1    += UNROLLJ;
-            emask2    += UNROLLJ;
-            emask3    += UNROLLJ;
-
-            /* load j atom coordinates */
-            jx_SSE            = _mm_load_ps(x_align+j);
-            jy_SSE            = _mm_load_ps(y_align+j);
-            jz_SSE            = _mm_load_ps(z_align+j);
-
-            /* Calculate distance */
-            dx_SSE0            = _mm_sub_ps(ix_SSE0, jx_SSE);
-            dy_SSE0            = _mm_sub_ps(iy_SSE0, jy_SSE);
-            dz_SSE0            = _mm_sub_ps(iz_SSE0, jz_SSE);
-            dx_SSE1            = _mm_sub_ps(ix_SSE1, jx_SSE);
-            dy_SSE1            = _mm_sub_ps(iy_SSE1, jy_SSE);
-            dz_SSE1            = _mm_sub_ps(iz_SSE1, jz_SSE);
-            dx_SSE2            = _mm_sub_ps(ix_SSE2, jx_SSE);
-            dy_SSE2            = _mm_sub_ps(iy_SSE2, jy_SSE);
-            dz_SSE2            = _mm_sub_ps(iz_SSE2, jz_SSE);
-            dx_SSE3            = _mm_sub_ps(ix_SSE3, jx_SSE);
-            dy_SSE3            = _mm_sub_ps(iy_SSE3, jy_SSE);
-            dz_SSE3            = _mm_sub_ps(iz_SSE3, jz_SSE);
-
-            /* rsq = dx*dx+dy*dy+dz*dz */
-            rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
-            rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
-            rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
-            rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
-
-            /* Combine masks */
-            jmask_SSE0         = _mm_and_ps(jmask_SSE0, imask_SSE0);
-            jmask_SSE1         = _mm_and_ps(jmask_SSE1, imask_SSE1);
-            jmask_SSE2         = _mm_and_ps(jmask_SSE2, imask_SSE2);
-            jmask_SSE3         = _mm_and_ps(jmask_SSE3, imask_SSE3);
-
-            /* Calculate 1/r and 1/r2 */
-            rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
-            rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
-            rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
-            rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
-
-            /* Apply mask */
-            rinv_SSE0          = _mm_and_ps(rinv_SSE0, jmask_SSE0);
-            rinv_SSE1          = _mm_and_ps(rinv_SSE1, jmask_SSE1);
-            rinv_SSE2          = _mm_and_ps(rinv_SSE2, jmask_SSE2);
-            rinv_SSE3          = _mm_and_ps(rinv_SSE3, jmask_SSE3);
-
-            irsq_SSE0          = _mm_mul_ps(rinv_SSE0, rinv_SSE0);
-            irsq_SSE1          = _mm_mul_ps(rinv_SSE1, rinv_SSE1);
-            irsq_SSE2          = _mm_mul_ps(rinv_SSE2, rinv_SSE2);
-            irsq_SSE3          = _mm_mul_ps(rinv_SSE3, rinv_SSE3);
-            idr4_SSE0          = _mm_mul_ps(irsq_SSE0, irsq_SSE0);
-            idr4_SSE1          = _mm_mul_ps(irsq_SSE1, irsq_SSE1);
-            idr4_SSE2          = _mm_mul_ps(irsq_SSE2, irsq_SSE2);
-            idr4_SSE3          = _mm_mul_ps(irsq_SSE3, irsq_SSE3);
-            idr6_SSE0          = _mm_mul_ps(idr4_SSE0, irsq_SSE0);
-            idr6_SSE1          = _mm_mul_ps(idr4_SSE1, irsq_SSE1);
-            idr6_SSE2          = _mm_mul_ps(idr4_SSE2, irsq_SSE2);
-            idr6_SSE3          = _mm_mul_ps(idr4_SSE3, irsq_SSE3);
-
-            raj_SSE            = _mm_load_ps(gb_radius+j);
-            vaj_SSE            = _mm_load_ps(vsolv+j);
-
-            rvdw_SSE0          = _mm_add_ps(rai_SSE0, raj_SSE);
-            rvdw_SSE1          = _mm_add_ps(rai_SSE1, raj_SSE);
-            rvdw_SSE2          = _mm_add_ps(rai_SSE2, raj_SSE);
-            rvdw_SSE3          = _mm_add_ps(rai_SSE3, raj_SSE);
-
-            ratio_SSE0         = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0, rvdw_SSE0)));
-            ratio_SSE1         = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1, rvdw_SSE1)));
-            ratio_SSE2         = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2, rvdw_SSE2)));
-            ratio_SSE3         = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3, rvdw_SSE3)));
-
-            ratio_SSE0         = _mm_min_ps(ratio_SSE0, still_p5inv_SSE);
-            ratio_SSE1         = _mm_min_ps(ratio_SSE1, still_p5inv_SSE);
-            ratio_SSE2         = _mm_min_ps(ratio_SSE2, still_p5inv_SSE);
-            ratio_SSE3         = _mm_min_ps(ratio_SSE3, still_p5inv_SSE);
-            theta_SSE0         = _mm_mul_ps(ratio_SSE0, still_pip5_SSE);
-            theta_SSE1         = _mm_mul_ps(ratio_SSE1, still_pip5_SSE);
-            theta_SSE2         = _mm_mul_ps(ratio_SSE2, still_pip5_SSE);
-            theta_SSE3         = _mm_mul_ps(ratio_SSE3, still_pip5_SSE);
-            gmx_mm_sincos_ps(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
-            gmx_mm_sincos_ps(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
-            gmx_mm_sincos_ps(theta_SSE2, &sinq_SSE2, &cosq_SSE2);
-            gmx_mm_sincos_ps(theta_SSE3, &sinq_SSE3, &cosq_SSE3);
-            term_SSE0          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE0));
-            term_SSE1          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE1));
-            term_SSE2          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE2));
-            term_SSE3          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE3));
-            ccf_SSE0           = _mm_mul_ps(term_SSE0, term_SSE0);
-            ccf_SSE1           = _mm_mul_ps(term_SSE1, term_SSE1);
-            ccf_SSE2           = _mm_mul_ps(term_SSE2, term_SSE2);
-            ccf_SSE3           = _mm_mul_ps(term_SSE3, term_SSE3);
-            dccf_SSE0          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE0),
-                                            _mm_mul_ps(sinq_SSE0, theta_SSE0));
-            dccf_SSE1          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE1),
-                                            _mm_mul_ps(sinq_SSE1, theta_SSE1));
-            dccf_SSE2          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE2),
-                                            _mm_mul_ps(sinq_SSE2, theta_SSE2));
-            dccf_SSE3          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE3),
-                                            _mm_mul_ps(sinq_SSE3, theta_SSE3));
-
-            prod_SSE           = _mm_mul_ps(still_p4_SSE, vaj_SSE);
-            icf4_SSE0          = _mm_mul_ps(ccf_SSE0, idr4_SSE0);
-            icf4_SSE1          = _mm_mul_ps(ccf_SSE1, idr4_SSE1);
-            icf4_SSE2          = _mm_mul_ps(ccf_SSE2, idr4_SSE2);
-            icf4_SSE3          = _mm_mul_ps(ccf_SSE3, idr4_SSE3);
-            icf6_SSE0          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
-            icf6_SSE1          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);
-            icf6_SSE2          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE2), dccf_SSE2), idr6_SSE2);
-            icf6_SSE3          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE3), dccf_SSE3), idr6_SSE3);
-
-            _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
-                                            gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0, icf4_SSE0),
-                                                           _mm_mul_ps(prod_ai_SSE1, icf4_SSE1),
-                                                           _mm_mul_ps(prod_ai_SSE2, icf4_SSE2),
-                                                           _mm_mul_ps(prod_ai_SSE3, icf4_SSE3))));
-
-            gpi_SSE0           = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE, icf4_SSE0));
-            gpi_SSE1           = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE, icf4_SSE1));
-            gpi_SSE2           = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE, icf4_SSE2));
-            gpi_SSE3           = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE, icf4_SSE3));
-
-            /* Save ai->aj and aj->ai chain rule terms */
-            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE0));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE1));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE2));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE3));
-            dadx += 4;
-
-            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE0, icf6_SSE0));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE1, icf6_SSE1));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE2, icf6_SSE2));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE3, icf6_SSE3));
-            dadx += 4;
-        }
-        _MM_TRANSPOSE4_PS(gpi_SSE0, gpi_SSE1, gpi_SSE2, gpi_SSE3);
-        gpi_SSE0 = _mm_add_ps(gpi_SSE0, gpi_SSE1);
-        gpi_SSE2 = _mm_add_ps(gpi_SSE2, gpi_SSE3);
-        gpi_SSE0 = _mm_add_ps(gpi_SSE0, gpi_SSE2);
-        _mm_store_ps(work+i, _mm_add_ps(gpi_SSE0, _mm_load_ps(work+i)));
-    }
-
-    /* In case we have written anything beyond natoms, move it back.
-     * Never mind that we leave stuff above natoms; that will not
-     * be accessed later in the routine.
-     * In principle this should be a move rather than a sum, but this
-     * way we don't have to worry about even/odd offsets...
-     */
-    for (i = natoms; i < ni1+1+natoms/2; i++)
-    {
-        work[i-natoms] += work[i];
-    }
-
-    /* Parallel summations would go here if ever implemented with DD */
-
-    factor  = 0.5 * ONE_4PI_EPS0;
-    /* Calculate the radii - should we do all atoms, or just our local ones? */
-    for (i = 0; i < natoms; i++)
-    {
-        if (born->use[i] != 0)
-        {
-            gpi             = born->gpol[i]+work[i];
-            gpi2            = gpi * gpi;
-            born->bRad[i]   = factor*gmx_invsqrt(gpi2);
-            fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
-        }
-    }
-
-    return 0;
-}
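
For readers of this removed kernel: the following is a minimal scalar sketch
of the per-pair Still term that each 4x4-unrolled SSE block above evaluates.
The helper name and signature are illustrative only; p4, p5inv and pip5 stand
in for the still_p4_SSE, still_p5inv_SSE and still_pip5_SSE constants, and
the i-side factor is assumed to be p4 times the i-atom solvation volume,
mirroring prod_SSE = still_p4_SSE*vaj_SSE. The vector code clamps ratio with
min() instead of branching, which saturates to the same ccf = 1, dccf = 0
limit at theta = pi.

#include <math.h>

/* Scalar sketch of one Still pair interaction (illustrative, not from
 * the tree): accumulates the polarization sums for both atoms and
 * produces the two chain-rule terms that the SSE code streams to dadx. */
static void still_pair_term(double rsq, double rai, double raj,
                            double vai, double vaj,
                            double p4, double p5inv, double pip5,
                            double *gpi, double *workj,
                            double *dadx_ij, double *dadx_ji)
{
    double rinv  = 1.0/sqrt(rsq);
    double idr4  = (rinv*rinv)*(rinv*rinv);      /* 1/r^4 */
    double idr6  = idr4*(rinv*rinv);             /* 1/r^6 */
    double rvdw  = rai + raj;
    double ratio = rsq/(rvdw*rvdw);
    double ccf, dccf;

    if (ratio < p5inv)
    {
        /* Close contact: smooth switching of the 1/r^4 term */
        double theta = ratio*pip5;
        double term  = 0.5*(1.0 - cos(theta));
        ccf  = term*term;
        dccf = 2.0*term*sin(theta)*theta;
    }
    else
    {
        ccf  = 1.0;
        dccf = 0.0;
    }
    *gpi     += p4*vaj*ccf*idr4;                 /* gpi_SSE* accumulation  */
    *workj   += p4*vai*ccf*idr4;                 /* work[j] accumulation   */
    *dadx_ij  = p4*vaj*(4.0*ccf - dccf)*idr6;    /* ai->aj chain-rule term */
    *dadx_ji  = p4*vai*(4.0*ccf - dccf)*idr6;    /* aj->ai chain-rule term */
}

After the j-loops, the four per-lane accumulators are transposed and summed
horizontally (_MM_TRANSPOSE4_PS plus three adds) into work[i], and the Born
radius then follows as bRad[i] = 0.5*ONE_4PI_EPS0/|gpol[i]+work[i]|, exactly
as in the scalar tail loop above.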
-
-
-
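The function below computes HCT/OBC Born radii with the same all-vs-all
4x4 unrolling. For reference, each pair iteration in its j-loops evaluates
the standard HCT descreening integral; in the code lij and uij hold the
inverse bounds 1/L and 1/U, diff2 = 1/U^2 - 1/L^2, logterm = ln(L/U), and
prod = s_j^2/(4r):

    I_{ij} = \frac{1}{2}\left[\frac{1}{L} - \frac{1}{U}
             + \frac{r}{4}\left(\frac{1}{U^{2}} - \frac{1}{L^{2}}\right)
             + \frac{1}{2r}\ln\frac{L}{U}
             + \frac{s_j^{2}}{4r}\left(\frac{1}{L^{2}} - \frac{1}{U^{2}}\right)\right]

with U = r + s_j, and L = r - s_j for non-overlapping atoms or 1/L = 1/rho_i
otherwise (selected by obc_mask2); an extra 2(1/rho_i - 1/L) is added inside
the brackets when s_j - r > rho_i (obc_mask3), and obc_mask1 zeroes pairs
that do not descreen atom i at all. Here s_j is the scaled HCT radius from
obc_param and rho_i the offset GB radius; sum_ai accumulates I_{ij} over j,
while the t1/t2/t3 expressions stored to dadx are the matching dI/dr
chain-rule terms.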
-int
-genborn_allvsall_calc_hct_obc_radii_sse2_single(t_forcerec *           fr,
-                                                t_mdatoms *            mdatoms,
-                                                gmx_genborn_t *        born,
-                                                int                    gb_algorithm,
-                                                gmx_localtop_t *       top,
-                                                real *                 x,
-                                                t_commrec *            cr,
-                                                void *                 paadata)
-{
-    gmx_allvsallgb2_data_t *aadata;
-    int                     natoms;
-    int                     ni0, ni1;
-    int                     nj0, nj1, nj2, nj3;
-    int                     i, j, k, n;
-    int              *      mask;
-    int              *      pmask0;
-    int              *      pmask1;
-    int              *      pmask2;
-    int              *      pmask3;
-    int              *      emask0;
-    int              *      emask1;
-    int              *      emask2;
-    int              *      emask3;
-    real              *     gb_radius;
-    real              *     vsolv;
-    real              *     work;
-    real                    tmpsum[4];
-    real              *     x_align;
-    real              *     y_align;
-    real              *     z_align;
-    int              *      jindex;
-    real              *     dadx;
-    real              *     obc_param;
-    real                    rad, min_rad;
-    real                    rai, rai_inv, rai_inv2, sum_ai, sum_ai2, sum_ai3, tsum, tchain;
-
-    __m128                  ix_SSE0, iy_SSE0, iz_SSE0;
-    __m128                  ix_SSE1, iy_SSE1, iz_SSE1;
-    __m128                  ix_SSE2, iy_SSE2, iz_SSE2;
-    __m128                  ix_SSE3, iy_SSE3, iz_SSE3;
-    __m128                  gpi_SSE0, rai_SSE0, prod_ai_SSE0;
-    __m128                  gpi_SSE1, rai_SSE1, prod_ai_SSE1;
-    __m128                  gpi_SSE2, rai_SSE2, prod_ai_SSE2;
-    __m128                  gpi_SSE3, rai_SSE3, prod_ai_SSE3;
-    __m128                  imask_SSE0, jmask_SSE0;
-    __m128                  imask_SSE1, jmask_SSE1;
-    __m128                  imask_SSE2, jmask_SSE2;
-    __m128                  imask_SSE3, jmask_SSE3;
-    __m128                  jx_SSE, jy_SSE, jz_SSE;
-    __m128                  dx_SSE0, dy_SSE0, dz_SSE0;
-    __m128                  dx_SSE1, dy_SSE1, dz_SSE1;
-    __m128                  dx_SSE2, dy_SSE2, dz_SSE2;
-    __m128                  dx_SSE3, dy_SSE3, dz_SSE3;
-    __m128                  rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0;
-    __m128                  rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1;
-    __m128                  rsq_SSE2, rinv_SSE2, irsq_SSE2, idr4_SSE2, idr6_SSE2;
-    __m128                  rsq_SSE3, rinv_SSE3, irsq_SSE3, idr4_SSE3, idr6_SSE3;
-    __m128                  raj_SSE, raj_inv_SSE, sk_aj_SSE, sk2_aj_SSE;
-    __m128                  ccf_SSE0, dccf_SSE0, prod_SSE0;
-    __m128                  ccf_SSE1, dccf_SSE1, prod_SSE1;
-    __m128                  ccf_SSE2, dccf_SSE2, prod_SSE2;
-    __m128                  ccf_SSE3, dccf_SSE3, prod_SSE3;
-    __m128                  icf4_SSE0, icf6_SSE0;
-    __m128                  icf4_SSE1, icf6_SSE1;
-    __m128                  icf4_SSE2, icf6_SSE2;
-    __m128                  icf4_SSE3, icf6_SSE3;
-    __m128                  oneeighth_SSE, onefourth_SSE, half_SSE, one_SSE, two_SSE, four_SSE;
-    __m128                  still_p4_SSE, still_p5inv_SSE, still_pip5_SSE;
-    __m128                  rai_inv_SSE0;
-    __m128                  rai_inv_SSE1;
-    __m128                  rai_inv_SSE2;
-    __m128                  rai_inv_SSE3;
-    __m128                  sk_ai_SSE0, sk2_ai_SSE0, sum_ai_SSE0;
-    __m128                  sk_ai_SSE1, sk2_ai_SSE1, sum_ai_SSE1;
-    __m128                  sk_ai_SSE2, sk2_ai_SSE2, sum_ai_SSE2;
-    __m128                  sk_ai_SSE3, sk2_ai_SSE3, sum_ai_SSE3;
-    __m128                  lij_inv_SSE0, sk2_rinv_SSE0;
-    __m128                  lij_inv_SSE1, sk2_rinv_SSE1;
-    __m128                  lij_inv_SSE2, sk2_rinv_SSE2;
-    __m128                  lij_inv_SSE3, sk2_rinv_SSE3;
-    __m128                  dr_SSE0;
-    __m128                  dr_SSE1;
-    __m128                  dr_SSE2;
-    __m128                  dr_SSE3;
-    __m128                  t1_SSE0, t2_SSE0, t3_SSE0, t4_SSE0;
-    __m128                  t1_SSE1, t2_SSE1, t3_SSE1, t4_SSE1;
-    __m128                  t1_SSE2, t2_SSE2, t3_SSE2, t4_SSE2;
-    __m128                  t1_SSE3, t2_SSE3, t3_SSE3, t4_SSE3;
-    __m128                  obc_mask1_SSE0, obc_mask2_SSE0, obc_mask3_SSE0;
-    __m128                  obc_mask1_SSE1, obc_mask2_SSE1, obc_mask3_SSE1;
-    __m128                  obc_mask1_SSE2, obc_mask2_SSE2, obc_mask3_SSE2;
-    __m128                  obc_mask1_SSE3, obc_mask2_SSE3, obc_mask3_SSE3;
-    __m128                  uij_SSE0, uij2_SSE0, uij3_SSE0;
-    __m128                  uij_SSE1, uij2_SSE1, uij3_SSE1;
-    __m128                  uij_SSE2, uij2_SSE2, uij3_SSE2;
-    __m128                  uij_SSE3, uij2_SSE3, uij3_SSE3;
-    __m128                  lij_SSE0, lij2_SSE0, lij3_SSE0;
-    __m128                  lij_SSE1, lij2_SSE1, lij3_SSE1;
-    __m128                  lij_SSE2, lij2_SSE2, lij3_SSE2;
-    __m128                  lij_SSE3, lij2_SSE3, lij3_SSE3;
-    __m128                  dlij_SSE0, diff2_SSE0, logterm_SSE0;
-    __m128                  dlij_SSE1, diff2_SSE1, logterm_SSE1;
-    __m128                  dlij_SSE2, diff2_SSE2, logterm_SSE2;
-    __m128                  dlij_SSE3, diff2_SSE3, logterm_SSE3;
-    __m128                  doffset_SSE;
-
-    natoms              = mdatoms->nr;
-    ni0                 = 0;
-    ni1                 = mdatoms->homenr;
-
-    n = 0;
-
-    aadata = *((gmx_allvsallgb2_data_t **)paadata);
-
-
-    if (aadata == NULL)
-    {
-        genborn_allvsall_setup(&aadata, top, born, mdatoms, born->gb_doffset,
-                               egbOBC, TRUE, TRUE, TRUE);
-        *((gmx_allvsallgb2_data_t **)paadata) = aadata;
-    }
-
-    x_align = aadata->x_align;
-    y_align = aadata->y_align;
-    z_align = aadata->z_align;
-
-    gb_radius = aadata->gb_radius;
-    work      = aadata->work;
-    jindex    = aadata->jindex_gb;
-    dadx      = fr->dadx;
-    obc_param = aadata->workparam;
-
-    oneeighth_SSE   = _mm_set1_ps(0.125);
-    onefourth_SSE   = _mm_set1_ps(0.25);
-    half_SSE        = _mm_set1_ps(0.5);
-    one_SSE         = _mm_set1_ps(1.0);
-    two_SSE         = _mm_set1_ps(2.0);
-    four_SSE        = _mm_set1_ps(4.0);
-    doffset_SSE     = _mm_set1_ps(born->gb_doffset);
-
-    for (i = 0; i < natoms; i++)
-    {
-        x_align[i]  = x[3*i];
-        y_align[i]  = x[3*i+1];
-        z_align[i]  = x[3*i+2];
-    }
-
-    /* Copy again */
-    for (i = 0; i < natoms/2+1; i++)
-    {
-        x_align[natoms+i]  = x_align[i];
-        y_align[natoms+i]  = y_align[i];
-        z_align[natoms+i]  = z_align[i];
-    }
-
-    for (i = 0; i < natoms+natoms/2+1; i++)
-    {
-        work[i] = 0;
-    }
-
-    for (i = ni0; i < ni1; i += UNROLLI)
-    {
-        /* We assume shifts are NOT used for all-vs-all interactions */
-
-        /* Load i atom data */
-        ix_SSE0          = _mm_load1_ps(x_align+i);
-        iy_SSE0          = _mm_load1_ps(y_align+i);
-        iz_SSE0          = _mm_load1_ps(z_align+i);
-        ix_SSE1          = _mm_load1_ps(x_align+i+1);
-        iy_SSE1          = _mm_load1_ps(y_align+i+1);
-        iz_SSE1          = _mm_load1_ps(z_align+i+1);
-        ix_SSE2          = _mm_load1_ps(x_align+i+2);
-        iy_SSE2          = _mm_load1_ps(y_align+i+2);
-        iz_SSE2          = _mm_load1_ps(z_align+i+2);
-        ix_SSE3          = _mm_load1_ps(x_align+i+3);
-        iy_SSE3          = _mm_load1_ps(y_align+i+3);
-        iz_SSE3          = _mm_load1_ps(z_align+i+3);
-
-        rai_SSE0         = _mm_load1_ps(gb_radius+i);
-        rai_SSE1         = _mm_load1_ps(gb_radius+i+1);
-        rai_SSE2         = _mm_load1_ps(gb_radius+i+2);
-        rai_SSE3         = _mm_load1_ps(gb_radius+i+3);
-        rai_inv_SSE0     = gmx_mm_inv_ps(rai_SSE0);
-        rai_inv_SSE1     = gmx_mm_inv_ps(rai_SSE1);
-        rai_inv_SSE2     = gmx_mm_inv_ps(rai_SSE2);
-        rai_inv_SSE3     = gmx_mm_inv_ps(rai_SSE3);
-
-        sk_ai_SSE0       = _mm_load1_ps(obc_param+i);
-        sk_ai_SSE1       = _mm_load1_ps(obc_param+i+1);
-        sk_ai_SSE2       = _mm_load1_ps(obc_param+i+2);
-        sk_ai_SSE3       = _mm_load1_ps(obc_param+i+3);
-        sk2_ai_SSE0      = _mm_mul_ps(sk_ai_SSE0, sk_ai_SSE0);
-        sk2_ai_SSE1      = _mm_mul_ps(sk_ai_SSE1, sk_ai_SSE1);
-        sk2_ai_SSE2      = _mm_mul_ps(sk_ai_SSE2, sk_ai_SSE2);
-        sk2_ai_SSE3      = _mm_mul_ps(sk_ai_SSE3, sk_ai_SSE3);
-
-        sum_ai_SSE0      = _mm_setzero_ps();
-        sum_ai_SSE1      = _mm_setzero_ps();
-        sum_ai_SSE2      = _mm_setzero_ps();
-        sum_ai_SSE3      = _mm_setzero_ps();
-
-        /* Load limits for loop over neighbors */
-        nj0              = jindex[4*i];
-        nj1              = jindex[4*i+1];
-        nj2              = jindex[4*i+2];
-        nj3              = jindex[4*i+3];
-
-        pmask0           = aadata->prologue_mask_gb[i];
-        pmask1           = aadata->prologue_mask_gb[i+1];
-        pmask2           = aadata->prologue_mask_gb[i+2];
-        pmask3           = aadata->prologue_mask_gb[i+3];
-        emask0           = aadata->epilogue_mask[i];
-        emask1           = aadata->epilogue_mask[i+1];
-        emask2           = aadata->epilogue_mask[i+2];
-        emask3           = aadata->epilogue_mask[i+3];
-
-        imask_SSE0        = _mm_load1_ps((real *)(aadata->imask+i));
-        imask_SSE1        = _mm_load1_ps((real *)(aadata->imask+i+1));
-        imask_SSE2        = _mm_load1_ps((real *)(aadata->imask+i+2));
-        imask_SSE3        = _mm_load1_ps((real *)(aadata->imask+i+3));
-
-        /* Prologue part, including exclusion mask */
-        for (j = nj0; j < nj1; j += UNROLLJ)
-        {
-            jmask_SSE0 = _mm_load_ps((real *)pmask0);
-            jmask_SSE1 = _mm_load_ps((real *)pmask1);
-            jmask_SSE2 = _mm_load_ps((real *)pmask2);
-            jmask_SSE3 = _mm_load_ps((real *)pmask3);
-            pmask0    += UNROLLJ;
-            pmask1    += UNROLLJ;
-            pmask2    += UNROLLJ;
-            pmask3    += UNROLLJ;
-
-            /* load j atom coordinates */
-            jx_SSE            = _mm_load_ps(x_align+j);
-            jy_SSE            = _mm_load_ps(y_align+j);
-            jz_SSE            = _mm_load_ps(z_align+j);
-
-            /* Calculate distance */
-            dx_SSE0            = _mm_sub_ps(ix_SSE0, jx_SSE);
-            dy_SSE0            = _mm_sub_ps(iy_SSE0, jy_SSE);
-            dz_SSE0            = _mm_sub_ps(iz_SSE0, jz_SSE);
-            dx_SSE1            = _mm_sub_ps(ix_SSE1, jx_SSE);
-            dy_SSE1            = _mm_sub_ps(iy_SSE1, jy_SSE);
-            dz_SSE1            = _mm_sub_ps(iz_SSE1, jz_SSE);
-            dx_SSE2            = _mm_sub_ps(ix_SSE2, jx_SSE);
-            dy_SSE2            = _mm_sub_ps(iy_SSE2, jy_SSE);
-            dz_SSE2            = _mm_sub_ps(iz_SSE2, jz_SSE);
-            dx_SSE3            = _mm_sub_ps(ix_SSE3, jx_SSE);
-            dy_SSE3            = _mm_sub_ps(iy_SSE3, jy_SSE);
-            dz_SSE3            = _mm_sub_ps(iz_SSE3, jz_SSE);
-
-            /* rsq = dx*dx+dy*dy+dz*dz */
-            rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
-            rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
-            rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
-            rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
-
-            /* Combine masks */
-            jmask_SSE0         = _mm_and_ps(jmask_SSE0, imask_SSE0);
-            jmask_SSE1         = _mm_and_ps(jmask_SSE1, imask_SSE1);
-            jmask_SSE2         = _mm_and_ps(jmask_SSE2, imask_SSE2);
-            jmask_SSE3         = _mm_and_ps(jmask_SSE3, imask_SSE3);
-
-            /* Calculate 1/r and 1/r2 */
-            rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
-            rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
-            rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
-            rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
-
-            /* Apply mask */
-            rinv_SSE0          = _mm_and_ps(rinv_SSE0, jmask_SSE0);
-            rinv_SSE1          = _mm_and_ps(rinv_SSE1, jmask_SSE1);
-            rinv_SSE2          = _mm_and_ps(rinv_SSE2, jmask_SSE2);
-            rinv_SSE3          = _mm_and_ps(rinv_SSE3, jmask_SSE3);
-
-            dr_SSE0            = _mm_mul_ps(rsq_SSE0, rinv_SSE0);
-            dr_SSE1            = _mm_mul_ps(rsq_SSE1, rinv_SSE1);
-            dr_SSE2            = _mm_mul_ps(rsq_SSE2, rinv_SSE2);
-            dr_SSE3            = _mm_mul_ps(rsq_SSE3, rinv_SSE3);
-
-            sk_aj_SSE          = _mm_load_ps(obc_param+j);
-            raj_SSE            = _mm_load_ps(gb_radius+j);
-            raj_inv_SSE        = gmx_mm_inv_ps(raj_SSE);
-
-            /* Evaluate influence of atom aj -> ai */
-            t1_SSE0            = _mm_add_ps(dr_SSE0, sk_aj_SSE);
-            t1_SSE1            = _mm_add_ps(dr_SSE1, sk_aj_SSE);
-            t1_SSE2            = _mm_add_ps(dr_SSE2, sk_aj_SSE);
-            t1_SSE3            = _mm_add_ps(dr_SSE3, sk_aj_SSE);
-            t2_SSE0            = _mm_sub_ps(dr_SSE0, sk_aj_SSE);
-            t2_SSE1            = _mm_sub_ps(dr_SSE1, sk_aj_SSE);
-            t2_SSE2            = _mm_sub_ps(dr_SSE2, sk_aj_SSE);
-            t2_SSE3            = _mm_sub_ps(dr_SSE3, sk_aj_SSE);
-            t3_SSE0            = _mm_sub_ps(sk_aj_SSE, dr_SSE0);
-            t3_SSE1            = _mm_sub_ps(sk_aj_SSE, dr_SSE1);
-            t3_SSE2            = _mm_sub_ps(sk_aj_SSE, dr_SSE2);
-            t3_SSE3            = _mm_sub_ps(sk_aj_SSE, dr_SSE3);
-
-            obc_mask1_SSE0     = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
-            obc_mask1_SSE1     = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
-            obc_mask1_SSE2     = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
-            obc_mask1_SSE3     = _mm_cmplt_ps(rai_SSE3, t1_SSE3);
-            obc_mask2_SSE0     = _mm_cmplt_ps(rai_SSE0, t2_SSE0);
-            obc_mask2_SSE1     = _mm_cmplt_ps(rai_SSE1, t2_SSE1);
-            obc_mask2_SSE2     = _mm_cmplt_ps(rai_SSE2, t2_SSE2);
-            obc_mask2_SSE3     = _mm_cmplt_ps(rai_SSE3, t2_SSE3);
-            obc_mask3_SSE0     = _mm_cmplt_ps(rai_SSE0, t3_SSE0);
-            obc_mask3_SSE1     = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
-            obc_mask3_SSE2     = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
-            obc_mask3_SSE3     = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
-            obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0);
-            obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1);
-            obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2);
-            obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3);
-
-            uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
-            uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
-            uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
-            uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
-            lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
-                                              _mm_andnot_ps(obc_mask2_SSE0, rai_inv_SSE0));
-            lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
-                                              _mm_andnot_ps(obc_mask2_SSE1, rai_inv_SSE1));
-            lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
-                                              _mm_andnot_ps(obc_mask2_SSE2, rai_inv_SSE2));
-            lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
-                                              _mm_andnot_ps(obc_mask2_SSE3, rai_inv_SSE3));
-            dlij_SSE0          = _mm_and_ps(one_SSE, obc_mask2_SSE0);
-            dlij_SSE1          = _mm_and_ps(one_SSE, obc_mask2_SSE1);
-            dlij_SSE2          = _mm_and_ps(one_SSE, obc_mask2_SSE2);
-            dlij_SSE3          = _mm_and_ps(one_SSE, obc_mask2_SSE3);
-
-            uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
-            uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
-            uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
-            uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
-            uij3_SSE0          = _mm_mul_ps(uij2_SSE0, uij_SSE0);
-            uij3_SSE1          = _mm_mul_ps(uij2_SSE1, uij_SSE1);
-            uij3_SSE2          = _mm_mul_ps(uij2_SSE2, uij_SSE2);
-            uij3_SSE3          = _mm_mul_ps(uij2_SSE3, uij_SSE3);
-            lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
-            lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
-            lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
-            lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
-            lij3_SSE0          = _mm_mul_ps(lij2_SSE0, lij_SSE0);
-            lij3_SSE1          = _mm_mul_ps(lij2_SSE1, lij_SSE1);
-            lij3_SSE2          = _mm_mul_ps(lij2_SSE2, lij_SSE2);
-            lij3_SSE3          = _mm_mul_ps(lij2_SSE3, lij_SSE3);
-
-            diff2_SSE0         = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
-            diff2_SSE1         = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
-            diff2_SSE2         = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
-            diff2_SSE3         = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
-            lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
-            lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
-            lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
-            lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
-            sk2_aj_SSE         = _mm_mul_ps(sk_aj_SSE, sk_aj_SSE);
-            sk2_rinv_SSE0      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE0);
-            sk2_rinv_SSE1      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE1);
-            sk2_rinv_SSE2      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE2);
-            sk2_rinv_SSE3      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE3);
-            prod_SSE0          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
-            prod_SSE1          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
-            prod_SSE2          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
-            prod_SSE3          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
-
-            logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
-            logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
-            logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
-            logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
-
-            t1_SSE0            = _mm_sub_ps(lij_SSE0, uij_SSE0);
-            t1_SSE1            = _mm_sub_ps(lij_SSE1, uij_SSE1);
-            t1_SSE2            = _mm_sub_ps(lij_SSE2, uij_SSE2);
-            t1_SSE3            = _mm_sub_ps(lij_SSE3, uij_SSE3);
-            t2_SSE0            = _mm_mul_ps(diff2_SSE0,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
-                                                       prod_SSE0));
-            t2_SSE1            = _mm_mul_ps(diff2_SSE1,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
-                                                       prod_SSE1));
-            t2_SSE2            = _mm_mul_ps(diff2_SSE2,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
-                                                       prod_SSE2));
-            t2_SSE3            = _mm_mul_ps(diff2_SSE3,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
-                                                       prod_SSE3));
-
-            t3_SSE0            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
-            t3_SSE1            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
-            t3_SSE2            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
-            t3_SSE3            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
-            t1_SSE0            = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
-            t1_SSE1            = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
-            t1_SSE2            = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
-            t1_SSE3            = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
-            t4_SSE0            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE0, lij_SSE0));
-            t4_SSE1            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE1, lij_SSE1));
-            t4_SSE2            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE2, lij_SSE2));
-            t4_SSE3            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE3, lij_SSE3));
-            t4_SSE0            = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
-            t4_SSE1            = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
-            t4_SSE2            = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
-            t4_SSE3            = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
-            t1_SSE0            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
-            t1_SSE1            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
-            t1_SSE2            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
-            t1_SSE3            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
-
-            sum_ai_SSE0        = _mm_add_ps(sum_ai_SSE0, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
-            sum_ai_SSE1        = _mm_add_ps(sum_ai_SSE1, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
-            sum_ai_SSE2        = _mm_add_ps(sum_ai_SSE2, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
-            sum_ai_SSE3        = _mm_add_ps(sum_ai_SSE3, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
-
-            t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
-                                            _mm_mul_ps(prod_SSE0, lij3_SSE0));
-            t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
-                                            _mm_mul_ps(prod_SSE1, lij3_SSE1));
-            t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
-                                            _mm_mul_ps(prod_SSE2, lij3_SSE2));
-            t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
-                                            _mm_mul_ps(prod_SSE3, lij3_SSE3));
-            t1_SSE0            = _mm_sub_ps(t1_SSE0,
-                                            _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
-                                                                  _mm_mul_ps(lij3_SSE0, dr_SSE0))));
-            t1_SSE1            = _mm_sub_ps(t1_SSE1,
-                                            _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
-                                                                  _mm_mul_ps(lij3_SSE1, dr_SSE1))));
-            t1_SSE2            = _mm_sub_ps(t1_SSE2,
-                                            _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
-                                                                  _mm_mul_ps(lij3_SSE2, dr_SSE2))));
-            t1_SSE3            = _mm_sub_ps(t1_SSE3,
-                                            _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
-                                                                  _mm_mul_ps(lij3_SSE3, dr_SSE3))));
-
-            t2_SSE0            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
-                                                       _mm_mul_ps(uij3_SSE0, dr_SSE0)));
-            t2_SSE1            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
-                                                       _mm_mul_ps(uij3_SSE1, dr_SSE1)));
-            t2_SSE2            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
-                                                       _mm_mul_ps(uij3_SSE2, dr_SSE2)));
-            t2_SSE3            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
-                                                       _mm_mul_ps(uij3_SSE3, dr_SSE3)));
-            t2_SSE0            = _mm_sub_ps(t2_SSE0,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
-                                                       _mm_mul_ps(prod_SSE0, uij3_SSE0)));
-            t2_SSE1            = _mm_sub_ps(t2_SSE1,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
-                                                       _mm_mul_ps(prod_SSE1, uij3_SSE1)));
-            t2_SSE2            = _mm_sub_ps(t2_SSE2,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
-                                                       _mm_mul_ps(prod_SSE2, uij3_SSE2)));
-            t2_SSE3            = _mm_sub_ps(t2_SSE3,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
-                                                       _mm_mul_ps(prod_SSE3, uij3_SSE3)));
-            t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
-                                            _mm_mul_ps(rinv_SSE0, rinv_SSE0));
-            t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
-                                            _mm_mul_ps(rinv_SSE1, rinv_SSE1));
-            t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
-                                            _mm_mul_ps(rinv_SSE2, rinv_SSE2));
-            t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
-                                            _mm_mul_ps(rinv_SSE3, rinv_SSE3));
-            t3_SSE0            = _mm_sub_ps(t3_SSE0,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
-                                                       _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
-            t3_SSE1            = _mm_sub_ps(t3_SSE1,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
-                                                       _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
-            t3_SSE2            = _mm_sub_ps(t3_SSE2,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
-                                                       _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
-            t3_SSE3            = _mm_sub_ps(t3_SSE3,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
-                                                       _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
-
-            t1_SSE0            = _mm_mul_ps(rinv_SSE0,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
-                                                       _mm_add_ps(t2_SSE0, t3_SSE0)));
-            t1_SSE1            = _mm_mul_ps(rinv_SSE1,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
-                                                       _mm_add_ps(t2_SSE1, t3_SSE1)));
-            t1_SSE2            = _mm_mul_ps(rinv_SSE2,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
-                                                       _mm_add_ps(t2_SSE2, t3_SSE2)));
-            t1_SSE3            = _mm_mul_ps(rinv_SSE3,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
-                                                       _mm_add_ps(t2_SSE3, t3_SSE3)));
-
-            _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
-            dadx += 4;
-
-            /* Evaluate influence of atom ai -> aj */
-            t1_SSE0            = _mm_add_ps(dr_SSE0, sk_ai_SSE0);
-            t1_SSE1            = _mm_add_ps(dr_SSE1, sk_ai_SSE1);
-            t1_SSE2            = _mm_add_ps(dr_SSE2, sk_ai_SSE2);
-            t1_SSE3            = _mm_add_ps(dr_SSE3, sk_ai_SSE3);
-            t2_SSE0            = _mm_sub_ps(dr_SSE0, sk_ai_SSE0);
-            t2_SSE1            = _mm_sub_ps(dr_SSE1, sk_ai_SSE1);
-            t2_SSE2            = _mm_sub_ps(dr_SSE2, sk_ai_SSE2);
-            t2_SSE3            = _mm_sub_ps(dr_SSE3, sk_ai_SSE3);
-            t3_SSE0            = _mm_sub_ps(sk_ai_SSE0, dr_SSE0);
-            t3_SSE1            = _mm_sub_ps(sk_ai_SSE1, dr_SSE1);
-            t3_SSE2            = _mm_sub_ps(sk_ai_SSE2, dr_SSE2);
-            t3_SSE3            = _mm_sub_ps(sk_ai_SSE3, dr_SSE3);
-
-            obc_mask1_SSE0     = _mm_cmplt_ps(raj_SSE, t1_SSE0);
-            obc_mask1_SSE1     = _mm_cmplt_ps(raj_SSE, t1_SSE1);
-            obc_mask1_SSE2     = _mm_cmplt_ps(raj_SSE, t1_SSE2);
-            obc_mask1_SSE3     = _mm_cmplt_ps(raj_SSE, t1_SSE3);
-            obc_mask2_SSE0     = _mm_cmplt_ps(raj_SSE, t2_SSE0);
-            obc_mask2_SSE1     = _mm_cmplt_ps(raj_SSE, t2_SSE1);
-            obc_mask2_SSE2     = _mm_cmplt_ps(raj_SSE, t2_SSE2);
-            obc_mask2_SSE3     = _mm_cmplt_ps(raj_SSE, t2_SSE3);
-            obc_mask3_SSE0     = _mm_cmplt_ps(raj_SSE, t3_SSE0);
-            obc_mask3_SSE1     = _mm_cmplt_ps(raj_SSE, t3_SSE1);
-            obc_mask3_SSE2     = _mm_cmplt_ps(raj_SSE, t3_SSE2);
-            obc_mask3_SSE3     = _mm_cmplt_ps(raj_SSE, t3_SSE3);
-            obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0);
-            obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1);
-            obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2);
-            obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3);
-
-            uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
-            uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
-            uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
-            uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
-            lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
-                                              _mm_andnot_ps(obc_mask2_SSE0, raj_inv_SSE));
-            lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
-                                              _mm_andnot_ps(obc_mask2_SSE1, raj_inv_SSE));
-            lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
-                                              _mm_andnot_ps(obc_mask2_SSE2, raj_inv_SSE));
-            lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
-                                              _mm_andnot_ps(obc_mask2_SSE3, raj_inv_SSE));
-            dlij_SSE0          = _mm_and_ps(one_SSE, obc_mask2_SSE0);
-            dlij_SSE1          = _mm_and_ps(one_SSE, obc_mask2_SSE1);
-            dlij_SSE2          = _mm_and_ps(one_SSE, obc_mask2_SSE2);
-            dlij_SSE3          = _mm_and_ps(one_SSE, obc_mask2_SSE3);
-
-            uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
-            uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
-            uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
-            uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
-            uij3_SSE0          = _mm_mul_ps(uij2_SSE0, uij_SSE0);
-            uij3_SSE1          = _mm_mul_ps(uij2_SSE1, uij_SSE1);
-            uij3_SSE2          = _mm_mul_ps(uij2_SSE2, uij_SSE2);
-            uij3_SSE3          = _mm_mul_ps(uij2_SSE3, uij_SSE3);
-            lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
-            lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
-            lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
-            lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
-            lij3_SSE0          = _mm_mul_ps(lij2_SSE0, lij_SSE0);
-            lij3_SSE1          = _mm_mul_ps(lij2_SSE1, lij_SSE1);
-            lij3_SSE2          = _mm_mul_ps(lij2_SSE2, lij_SSE2);
-            lij3_SSE3          = _mm_mul_ps(lij2_SSE3, lij_SSE3);
-
-            diff2_SSE0         = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
-            diff2_SSE1         = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
-            diff2_SSE2         = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
-            diff2_SSE3         = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
-            lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
-            lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
-            lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
-            lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
-            sk2_rinv_SSE0      = _mm_mul_ps(sk2_ai_SSE0, rinv_SSE0);
-            sk2_rinv_SSE1      = _mm_mul_ps(sk2_ai_SSE1, rinv_SSE1);
-            sk2_rinv_SSE2      = _mm_mul_ps(sk2_ai_SSE2, rinv_SSE2);
-            sk2_rinv_SSE3      = _mm_mul_ps(sk2_ai_SSE3, rinv_SSE3);
-            prod_SSE0          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
-            prod_SSE1          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
-            prod_SSE2          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
-            prod_SSE3          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
-
-            logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
-            logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
-            logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
-            logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
-            t1_SSE0            = _mm_sub_ps(lij_SSE0, uij_SSE0);
-            t1_SSE1            = _mm_sub_ps(lij_SSE1, uij_SSE1);
-            t1_SSE2            = _mm_sub_ps(lij_SSE2, uij_SSE2);
-            t1_SSE3            = _mm_sub_ps(lij_SSE3, uij_SSE3);
-            t2_SSE0            = _mm_mul_ps(diff2_SSE0,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
-                                                       prod_SSE0));
-            t2_SSE1            = _mm_mul_ps(diff2_SSE1,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
-                                                       prod_SSE1));
-            t2_SSE2            = _mm_mul_ps(diff2_SSE2,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
-                                                       prod_SSE2));
-            t2_SSE3            = _mm_mul_ps(diff2_SSE3,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
-                                                       prod_SSE3));
-            t3_SSE0            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
-            t3_SSE1            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
-            t3_SSE2            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
-            t3_SSE3            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
-            t1_SSE0            = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
-            t1_SSE1            = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
-            t1_SSE2            = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
-            t1_SSE3            = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
-            t4_SSE0            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE0));
-            t4_SSE1            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE1));
-            t4_SSE2            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE2));
-            t4_SSE3            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE3));
-            t4_SSE0            = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
-            t4_SSE1            = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
-            t4_SSE2            = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
-            t4_SSE3            = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
-            t1_SSE0            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
-            t1_SSE1            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
-            t1_SSE2            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
-            t1_SSE3            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
-
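-            /* Add the four masked i-row contributions to atom j's Born sum */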
-            _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
-                                            gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0, obc_mask1_SSE0),
-                                                           _mm_and_ps(t1_SSE1, obc_mask1_SSE1),
-                                                           _mm_and_ps(t1_SSE2, obc_mask1_SSE2),
-                                                           _mm_and_ps(t1_SSE3, obc_mask1_SSE3))));
-
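-            /* Chain-rule pieces for the ai->aj derivative; combined below as
-             * rinv*(dlij*t1 + t2 + t3) and stored in dadx.
-             */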
-            t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
-                                            _mm_mul_ps(prod_SSE0, lij3_SSE0));
-            t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
-                                            _mm_mul_ps(prod_SSE1, lij3_SSE1));
-            t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
-                                            _mm_mul_ps(prod_SSE2, lij3_SSE2));
-            t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
-                                            _mm_mul_ps(prod_SSE3, lij3_SSE3));
-            t1_SSE0            = _mm_sub_ps(t1_SSE0,
-                                            _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
-                                                                  _mm_mul_ps(lij3_SSE0, dr_SSE0))));
-            t1_SSE1            = _mm_sub_ps(t1_SSE1,
-                                            _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
-                                                                  _mm_mul_ps(lij3_SSE1, dr_SSE1))));
-            t1_SSE2            = _mm_sub_ps(t1_SSE2,
-                                            _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
-                                                                  _mm_mul_ps(lij3_SSE2, dr_SSE2))));
-            t1_SSE3            = _mm_sub_ps(t1_SSE3,
-                                            _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
-                                                                  _mm_mul_ps(lij3_SSE3, dr_SSE3))));
-            t2_SSE0            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
-                                                       _mm_mul_ps(uij3_SSE0, dr_SSE0)));
-            t2_SSE1            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
-                                                       _mm_mul_ps(uij3_SSE1, dr_SSE1)));
-            t2_SSE2            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
-                                                       _mm_mul_ps(uij3_SSE2, dr_SSE2)));
-            t2_SSE3            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
-                                                       _mm_mul_ps(uij3_SSE3, dr_SSE3)));
-            t2_SSE0            = _mm_sub_ps(t2_SSE0,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
-                                                       _mm_mul_ps(prod_SSE0, uij3_SSE0)));
-            t2_SSE1            = _mm_sub_ps(t2_SSE1,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
-                                                       _mm_mul_ps(prod_SSE1, uij3_SSE1)));
-            t2_SSE2            = _mm_sub_ps(t2_SSE2,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
-                                                       _mm_mul_ps(prod_SSE2, uij3_SSE2)));
-            t2_SSE3            = _mm_sub_ps(t2_SSE3,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
-                                                       _mm_mul_ps(prod_SSE3, uij3_SSE3)));
-
-            t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
-                                            _mm_mul_ps(rinv_SSE0, rinv_SSE0));
-            t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
-                                            _mm_mul_ps(rinv_SSE1, rinv_SSE1));
-            t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
-                                            _mm_mul_ps(rinv_SSE2, rinv_SSE2));
-            t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
-                                            _mm_mul_ps(rinv_SSE3, rinv_SSE3));
-
-            t3_SSE0            = _mm_sub_ps(t3_SSE0,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
-                                                       _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
-            t3_SSE1            = _mm_sub_ps(t3_SSE1,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
-                                                       _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
-            t3_SSE2            = _mm_sub_ps(t3_SSE2,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
-                                                       _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
-            t3_SSE3            = _mm_sub_ps(t3_SSE3,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
-                                                       _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
-
-            t1_SSE0            = _mm_mul_ps(rinv_SSE0,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
-                                                       _mm_add_ps(t2_SSE0, t3_SSE0)));
-            t1_SSE1            = _mm_mul_ps(rinv_SSE1,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
-                                                       _mm_add_ps(t2_SSE1, t3_SSE1)));
-            t1_SSE2            = _mm_mul_ps(rinv_SSE2,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
-                                                       _mm_add_ps(t2_SSE2, t3_SSE2)));
-            t1_SSE3            = _mm_mul_ps(rinv_SSE3,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
-                                                       _mm_add_ps(t2_SSE3, t3_SSE3)));
-
-            _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
-            dadx += 4;
-
-        }
-
-        /* Main part, no exclusions */
-        for (j = nj1; j < nj2; j += UNROLLJ)
-        {
-            /* load j atom coordinates */
-            jx_SSE            = _mm_load_ps(x_align+j);
-            jy_SSE            = _mm_load_ps(y_align+j);
-            jz_SSE            = _mm_load_ps(z_align+j);
-
-            /* Calculate distance */
-            dx_SSE0            = _mm_sub_ps(ix_SSE0, jx_SSE);
-            dy_SSE0            = _mm_sub_ps(iy_SSE0, jy_SSE);
-            dz_SSE0            = _mm_sub_ps(iz_SSE0, jz_SSE);
-            dx_SSE1            = _mm_sub_ps(ix_SSE1, jx_SSE);
-            dy_SSE1            = _mm_sub_ps(iy_SSE1, jy_SSE);
-            dz_SSE1            = _mm_sub_ps(iz_SSE1, jz_SSE);
-            dx_SSE2            = _mm_sub_ps(ix_SSE2, jx_SSE);
-            dy_SSE2            = _mm_sub_ps(iy_SSE2, jy_SSE);
-            dz_SSE2            = _mm_sub_ps(iz_SSE2, jz_SSE);
-            dx_SSE3            = _mm_sub_ps(ix_SSE3, jx_SSE);
-            dy_SSE3            = _mm_sub_ps(iy_SSE3, jy_SSE);
-            dz_SSE3            = _mm_sub_ps(iz_SSE3, jz_SSE);
-
-            /* rsq = dx*dx+dy*dy+dz*dz */
-            rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
-            rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
-            rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
-            rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
-
-            /* Calculate 1/r */
-            rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
-            rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
-            rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
-            rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
-
-            /* Apply mask */
-            rinv_SSE0          = _mm_and_ps(rinv_SSE0, imask_SSE0);
-            rinv_SSE1          = _mm_and_ps(rinv_SSE1, imask_SSE1);
-            rinv_SSE2          = _mm_and_ps(rinv_SSE2, imask_SSE2);
-            rinv_SSE3          = _mm_and_ps(rinv_SSE3, imask_SSE3);
-
-            dr_SSE0            = _mm_mul_ps(rsq_SSE0, rinv_SSE0);
-            dr_SSE1            = _mm_mul_ps(rsq_SSE1, rinv_SSE1);
-            dr_SSE2            = _mm_mul_ps(rsq_SSE2, rinv_SSE2);
-            dr_SSE3            = _mm_mul_ps(rsq_SSE3, rinv_SSE3);
-
-            sk_aj_SSE          = _mm_load_ps(obc_param+j);
-            raj_SSE            = _mm_load_ps(gb_radius+j);
-
-            raj_inv_SSE        = gmx_mm_inv_ps(raj_SSE);
-
-            /* Evaluate influence of atom aj -> ai */
-            t1_SSE0            = _mm_add_ps(dr_SSE0, sk_aj_SSE);
-            t1_SSE1            = _mm_add_ps(dr_SSE1, sk_aj_SSE);
-            t1_SSE2            = _mm_add_ps(dr_SSE2, sk_aj_SSE);
-            t1_SSE3            = _mm_add_ps(dr_SSE3, sk_aj_SSE);
-            t2_SSE0            = _mm_sub_ps(dr_SSE0, sk_aj_SSE);
-            t2_SSE1            = _mm_sub_ps(dr_SSE1, sk_aj_SSE);
-            t2_SSE2            = _mm_sub_ps(dr_SSE2, sk_aj_SSE);
-            t2_SSE3            = _mm_sub_ps(dr_SSE3, sk_aj_SSE);
-            t3_SSE0            = _mm_sub_ps(sk_aj_SSE, dr_SSE0);
-            t3_SSE1            = _mm_sub_ps(sk_aj_SSE, dr_SSE1);
-            t3_SSE2            = _mm_sub_ps(sk_aj_SSE, dr_SSE2);
-            t3_SSE3            = _mm_sub_ps(sk_aj_SSE, dr_SSE3);
-
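-            /* obc_mask1: the pair descreens ai at all (rai < dr+sk_aj);
-             * obc_mask2: rai < dr-sk_aj, so the lower integration bound is
-             * taken at the sphere edge; obc_mask3: rai < sk_aj-dr, i.e. ai
-             * is buried inside the aj descreening sphere.
-             */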
-            obc_mask1_SSE0     = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
-            obc_mask1_SSE1     = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
-            obc_mask1_SSE2     = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
-            obc_mask1_SSE3     = _mm_cmplt_ps(rai_SSE3, t1_SSE3);
-            obc_mask2_SSE0     = _mm_cmplt_ps(rai_SSE0, t2_SSE0);
-            obc_mask2_SSE1     = _mm_cmplt_ps(rai_SSE1, t2_SSE1);
-            obc_mask2_SSE2     = _mm_cmplt_ps(rai_SSE2, t2_SSE2);
-            obc_mask2_SSE3     = _mm_cmplt_ps(rai_SSE3, t2_SSE3);
-            obc_mask3_SSE0     = _mm_cmplt_ps(rai_SSE0, t3_SSE0);
-            obc_mask3_SSE1     = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
-            obc_mask3_SSE2     = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
-            obc_mask3_SSE3     = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
-            obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0, imask_SSE0);
-            obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1, imask_SSE1);
-            obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2, imask_SSE2);
-            obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3, imask_SSE3);
-
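-            /* Inverse integration bounds: uij = 1/(dr+sk_aj), and
-             * lij = 1/(dr-sk_aj) where obc_mask2 holds, 1/rai otherwise;
-             * dlij = 1 only where lij depends on r.
-             */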
-            uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
-            uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
-            uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
-            uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
-            lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
-                                              _mm_andnot_ps(obc_mask2_SSE0, rai_inv_SSE0));
-            lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
-                                              _mm_andnot_ps(obc_mask2_SSE1, rai_inv_SSE1));
-            lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
-                                              _mm_andnot_ps(obc_mask2_SSE2, rai_inv_SSE2));
-            lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
-                                              _mm_andnot_ps(obc_mask2_SSE3, rai_inv_SSE3));
-            dlij_SSE0          = _mm_and_ps(one_SSE, obc_mask2_SSE0);
-            dlij_SSE1          = _mm_and_ps(one_SSE, obc_mask2_SSE1);
-            dlij_SSE2          = _mm_and_ps(one_SSE, obc_mask2_SSE2);
-            dlij_SSE3          = _mm_and_ps(one_SSE, obc_mask2_SSE3);
-
-            uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
-            uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
-            uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
-            uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
-            uij3_SSE0          = _mm_mul_ps(uij2_SSE0, uij_SSE0);
-            uij3_SSE1          = _mm_mul_ps(uij2_SSE1, uij_SSE1);
-            uij3_SSE2          = _mm_mul_ps(uij2_SSE2, uij_SSE2);
-            uij3_SSE3          = _mm_mul_ps(uij2_SSE3, uij_SSE3);
-            lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
-            lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
-            lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
-            lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
-            lij3_SSE0          = _mm_mul_ps(lij2_SSE0, lij_SSE0);
-            lij3_SSE1          = _mm_mul_ps(lij2_SSE1, lij_SSE1);
-            lij3_SSE2          = _mm_mul_ps(lij2_SSE2, lij_SSE2);
-            lij3_SSE3          = _mm_mul_ps(lij2_SSE3, lij_SSE3);
-
-            diff2_SSE0         = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
-            diff2_SSE1         = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
-            diff2_SSE2         = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
-            diff2_SSE3         = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
-            lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
-            lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
-            lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
-            lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
-            sk2_aj_SSE         = _mm_mul_ps(sk_aj_SSE, sk_aj_SSE);
-            sk2_rinv_SSE0      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE0);
-            sk2_rinv_SSE1      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE1);
-            sk2_rinv_SSE2      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE2);
-            sk2_rinv_SSE3      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE3);
-            prod_SSE0          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
-            prod_SSE1          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
-            prod_SSE2          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
-            prod_SSE3          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
-
-            logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
-            logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
-            logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
-            logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
-
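-            /* HCT pair integral: 0.5*[ (lij-uij)
-             *   + 0.25*(uij^2-lij^2)*(dr - sk_aj^2/dr)
-             *   + 0.5*ln(uij/lij)/dr + t4 ], with t4 = 2*(1/rai - lij)
-             * applied only when ai is buried (obc_mask3).
-             */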
-            t1_SSE0            = _mm_sub_ps(lij_SSE0, uij_SSE0);
-            t1_SSE1            = _mm_sub_ps(lij_SSE1, uij_SSE1);
-            t1_SSE2            = _mm_sub_ps(lij_SSE2, uij_SSE2);
-            t1_SSE3            = _mm_sub_ps(lij_SSE3, uij_SSE3);
-            t2_SSE0            = _mm_mul_ps(diff2_SSE0,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
-                                                       prod_SSE0));
-            t2_SSE1            = _mm_mul_ps(diff2_SSE1,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
-                                                       prod_SSE1));
-            t2_SSE2            = _mm_mul_ps(diff2_SSE2,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
-                                                       prod_SSE2));
-            t2_SSE3            = _mm_mul_ps(diff2_SSE3,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
-                                                       prod_SSE3));
-
-            t3_SSE0            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
-            t3_SSE1            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
-            t3_SSE2            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
-            t3_SSE3            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
-            t1_SSE0            = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
-            t1_SSE1            = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
-            t1_SSE2            = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
-            t1_SSE3            = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
-            t4_SSE0            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE0, lij_SSE0));
-            t4_SSE1            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE1, lij_SSE1));
-            t4_SSE2            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE2, lij_SSE2));
-            t4_SSE3            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE3, lij_SSE3));
-            t4_SSE0            = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
-            t4_SSE1            = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
-            t4_SSE2            = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
-            t4_SSE3            = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
-            t1_SSE0            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
-            t1_SSE1            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
-            t1_SSE2            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
-            t1_SSE3            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
-
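-            /* Accumulate the masked pair term into each i atom's Born sum */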
-            sum_ai_SSE0        = _mm_add_ps(sum_ai_SSE0, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
-            sum_ai_SSE1        = _mm_add_ps(sum_ai_SSE1, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
-            sum_ai_SSE2        = _mm_add_ps(sum_ai_SSE2, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
-            sum_ai_SSE3        = _mm_add_ps(sum_ai_SSE3, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
-
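-            /* Derivative of the integral w.r.t. r, assembled below as
-             * rinv*(dlij*t1 + t2 + t3) and stored in dadx.
-             */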
-            t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
-                                            _mm_mul_ps(prod_SSE0, lij3_SSE0));
-            t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
-                                            _mm_mul_ps(prod_SSE1, lij3_SSE1));
-            t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
-                                            _mm_mul_ps(prod_SSE2, lij3_SSE2));
-            t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
-                                            _mm_mul_ps(prod_SSE3, lij3_SSE3));
-            t1_SSE0            = _mm_sub_ps(t1_SSE0,
-                                            _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
-                                                                  _mm_mul_ps(lij3_SSE0, dr_SSE0))));
-            t1_SSE1            = _mm_sub_ps(t1_SSE1,
-                                            _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
-                                                                  _mm_mul_ps(lij3_SSE1, dr_SSE1))));
-            t1_SSE2            = _mm_sub_ps(t1_SSE2,
-                                            _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
-                                                                  _mm_mul_ps(lij3_SSE2, dr_SSE2))));
-            t1_SSE3            = _mm_sub_ps(t1_SSE3,
-                                            _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
-                                                                  _mm_mul_ps(lij3_SSE3, dr_SSE3))));
-
-            t2_SSE0            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
-                                                       _mm_mul_ps(uij3_SSE0, dr_SSE0)));
-            t2_SSE1            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
-                                                       _mm_mul_ps(uij3_SSE1, dr_SSE1)));
-            t2_SSE2            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
-                                                       _mm_mul_ps(uij3_SSE2, dr_SSE2)));
-            t2_SSE3            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
-                                                       _mm_mul_ps(uij3_SSE3, dr_SSE3)));
-            t2_SSE0            = _mm_sub_ps(t2_SSE0,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
-                                                       _mm_mul_ps(prod_SSE0, uij3_SSE0)));
-            t2_SSE1            = _mm_sub_ps(t2_SSE1,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
-                                                       _mm_mul_ps(prod_SSE1, uij3_SSE1)));
-            t2_SSE2            = _mm_sub_ps(t2_SSE2,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
-                                                       _mm_mul_ps(prod_SSE2, uij3_SSE2)));
-            t2_SSE3            = _mm_sub_ps(t2_SSE3,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
-                                                       _mm_mul_ps(prod_SSE3, uij3_SSE3)));
-            t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
-                                            _mm_mul_ps(rinv_SSE0, rinv_SSE0));
-            t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
-                                            _mm_mul_ps(rinv_SSE1, rinv_SSE1));
-            t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
-                                            _mm_mul_ps(rinv_SSE2, rinv_SSE2));
-            t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
-                                            _mm_mul_ps(rinv_SSE3, rinv_SSE3));
-            t3_SSE0            = _mm_sub_ps(t3_SSE0,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
-                                                       _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
-            t3_SSE1            = _mm_sub_ps(t3_SSE1,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
-                                                       _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
-            t3_SSE2            = _mm_sub_ps(t3_SSE2,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
-                                                       _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
-            t3_SSE3            = _mm_sub_ps(t3_SSE3,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
-                                                       _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
-
-            t1_SSE0            = _mm_mul_ps(rinv_SSE0,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
-                                                       _mm_add_ps(t2_SSE0, t3_SSE0)));
-            t1_SSE1            = _mm_mul_ps(rinv_SSE1,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
-                                                       _mm_add_ps(t2_SSE1, t3_SSE1)));
-            t1_SSE2            = _mm_mul_ps(rinv_SSE2,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
-                                                       _mm_add_ps(t2_SSE2, t3_SSE2)));
-            t1_SSE3            = _mm_mul_ps(rinv_SSE3,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
-                                                       _mm_add_ps(t2_SSE3, t3_SSE3)));
-
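-            /* Store the masked derivative factors for the force loop */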
-            _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
-            dadx += 4;
-
-            /* Evaluate influence of atom ai -> aj */
-            t1_SSE0            = _mm_add_ps(dr_SSE0, sk_ai_SSE0);
-            t1_SSE1            = _mm_add_ps(dr_SSE1, sk_ai_SSE1);
-            t1_SSE2            = _mm_add_ps(dr_SSE2, sk_ai_SSE2);
-            t1_SSE3            = _mm_add_ps(dr_SSE3, sk_ai_SSE3);
-            t2_SSE0            = _mm_sub_ps(dr_SSE0, sk_ai_SSE0);
-            t2_SSE1            = _mm_sub_ps(dr_SSE1, sk_ai_SSE1);
-            t2_SSE2            = _mm_sub_ps(dr_SSE2, sk_ai_SSE2);
-            t2_SSE3            = _mm_sub_ps(dr_SSE3, sk_ai_SSE3);
-            t3_SSE0            = _mm_sub_ps(sk_ai_SSE0, dr_SSE0);
-            t3_SSE1            = _mm_sub_ps(sk_ai_SSE1, dr_SSE1);
-            t3_SSE2            = _mm_sub_ps(sk_ai_SSE2, dr_SSE2);
-            t3_SSE3            = _mm_sub_ps(sk_ai_SSE3, dr_SSE3);
-
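-            /* Same mask construction as aj->ai, with raj tested against
-             * dr +/- sk_ai.
-             */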
-            obc_mask1_SSE0     = _mm_cmplt_ps(raj_SSE, t1_SSE0);
-            obc_mask1_SSE1     = _mm_cmplt_ps(raj_SSE, t1_SSE1);
-            obc_mask1_SSE2     = _mm_cmplt_ps(raj_SSE, t1_SSE2);
-            obc_mask1_SSE3     = _mm_cmplt_ps(raj_SSE, t1_SSE3);
-            obc_mask2_SSE0     = _mm_cmplt_ps(raj_SSE, t2_SSE0);
-            obc_mask2_SSE1     = _mm_cmplt_ps(raj_SSE, t2_SSE1);
-            obc_mask2_SSE2     = _mm_cmplt_ps(raj_SSE, t2_SSE2);
-            obc_mask2_SSE3     = _mm_cmplt_ps(raj_SSE, t2_SSE3);
-            obc_mask3_SSE0     = _mm_cmplt_ps(raj_SSE, t3_SSE0);
-            obc_mask3_SSE1     = _mm_cmplt_ps(raj_SSE, t3_SSE1);
-            obc_mask3_SSE2     = _mm_cmplt_ps(raj_SSE, t3_SSE2);
-            obc_mask3_SSE3     = _mm_cmplt_ps(raj_SSE, t3_SSE3);
-            obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0, imask_SSE0);
-            obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1, imask_SSE1);
-            obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2, imask_SSE2);
-            obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3, imask_SSE3);
-
-            uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
-            uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
-            uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
-            uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
-            lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
-                                              _mm_andnot_ps(obc_mask2_SSE0, raj_inv_SSE));
-            lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
-                                              _mm_andnot_ps(obc_mask2_SSE1, raj_inv_SSE));
-            lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
-                                              _mm_andnot_ps(obc_mask2_SSE2, raj_inv_SSE));
-            lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
-                                              _mm_andnot_ps(obc_mask2_SSE3, raj_inv_SSE));
-            dlij_SSE0          = _mm_and_ps(one_SSE, obc_mask2_SSE0);
-            dlij_SSE1          = _mm_and_ps(one_SSE, obc_mask2_SSE1);
-            dlij_SSE2          = _mm_and_ps(one_SSE, obc_mask2_SSE2);
-            dlij_SSE3          = _mm_and_ps(one_SSE, obc_mask2_SSE3);
-
-            uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
-            uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
-            uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
-            uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
-            uij3_SSE0          = _mm_mul_ps(uij2_SSE0, uij_SSE0);
-            uij3_SSE1          = _mm_mul_ps(uij2_SSE1, uij_SSE1);
-            uij3_SSE2          = _mm_mul_ps(uij2_SSE2, uij_SSE2);
-            uij3_SSE3          = _mm_mul_ps(uij2_SSE3, uij_SSE3);
-            lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
-            lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
-            lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
-            lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
-            lij3_SSE0          = _mm_mul_ps(lij2_SSE0, lij_SSE0);
-            lij3_SSE1          = _mm_mul_ps(lij2_SSE1, lij_SSE1);
-            lij3_SSE2          = _mm_mul_ps(lij2_SSE2, lij_SSE2);
-            lij3_SSE3          = _mm_mul_ps(lij2_SSE3, lij_SSE3);
-
-            diff2_SSE0         = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
-            diff2_SSE1         = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
-            diff2_SSE2         = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
-            diff2_SSE3         = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
-            lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
-            lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
-            lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
-            lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
-            sk2_rinv_SSE0      = _mm_mul_ps(sk2_ai_SSE0, rinv_SSE0);
-            sk2_rinv_SSE1      = _mm_mul_ps(sk2_ai_SSE1, rinv_SSE1);
-            sk2_rinv_SSE2      = _mm_mul_ps(sk2_ai_SSE2, rinv_SSE2);
-            sk2_rinv_SSE3      = _mm_mul_ps(sk2_ai_SSE3, rinv_SSE3);
-            prod_SSE0          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
-            prod_SSE1          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
-            prod_SSE2          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
-            prod_SSE3          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
-
-            logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
-            logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
-            logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
-            logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
-            t1_SSE0            = _mm_sub_ps(lij_SSE0, uij_SSE0);
-            t1_SSE1            = _mm_sub_ps(lij_SSE1, uij_SSE1);
-            t1_SSE2            = _mm_sub_ps(lij_SSE2, uij_SSE2);
-            t1_SSE3            = _mm_sub_ps(lij_SSE3, uij_SSE3);
-            t2_SSE0            = _mm_mul_ps(diff2_SSE0,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
-                                                       prod_SSE0));
-            t2_SSE1            = _mm_mul_ps(diff2_SSE1,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
-                                                       prod_SSE1));
-            t2_SSE2            = _mm_mul_ps(diff2_SSE2,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
-                                                       prod_SSE2));
-            t2_SSE3            = _mm_mul_ps(diff2_SSE3,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
-                                                       prod_SSE3));
-            t3_SSE0            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
-            t3_SSE1            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
-            t3_SSE2            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
-            t3_SSE3            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
-            t1_SSE0            = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
-            t1_SSE1            = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
-            t1_SSE2            = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
-            t1_SSE3            = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
-            t4_SSE0            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE0));
-            t4_SSE1            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE1));
-            t4_SSE2            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE2));
-            t4_SSE3            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE3));
-            t4_SSE0            = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
-            t4_SSE1            = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
-            t4_SSE2            = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
-            t4_SSE3            = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
-            t1_SSE0            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
-            t1_SSE1            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
-            t1_SSE2            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
-            t1_SSE3            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
-
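-            /* Add the four masked i-row contributions to atom j's Born sum */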
-            _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
-                                            gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0, obc_mask1_SSE0),
-                                                           _mm_and_ps(t1_SSE1, obc_mask1_SSE1),
-                                                           _mm_and_ps(t1_SSE2, obc_mask1_SSE2),
-                                                           _mm_and_ps(t1_SSE3, obc_mask1_SSE3))));
-
-            t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
-                                            _mm_mul_ps(prod_SSE0, lij3_SSE0));
-            t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
-                                            _mm_mul_ps(prod_SSE1, lij3_SSE1));
-            t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
-                                            _mm_mul_ps(prod_SSE2, lij3_SSE2));
-            t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
-                                            _mm_mul_ps(prod_SSE3, lij3_SSE3));
-            t1_SSE0            = _mm_sub_ps(t1_SSE0,
-                                            _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
-                                                                  _mm_mul_ps(lij3_SSE0, dr_SSE0))));
-            t1_SSE1            = _mm_sub_ps(t1_SSE1,
-                                            _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
-                                                                  _mm_mul_ps(lij3_SSE1, dr_SSE1))));
-            t1_SSE2            = _mm_sub_ps(t1_SSE2,
-                                            _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
-                                                                  _mm_mul_ps(lij3_SSE2, dr_SSE2))));
-            t1_SSE3            = _mm_sub_ps(t1_SSE3,
-                                            _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
-                                                                  _mm_mul_ps(lij3_SSE3, dr_SSE3))));
-            t2_SSE0            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
-                                                       _mm_mul_ps(uij3_SSE0, dr_SSE0)));
-            t2_SSE1            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
-                                                       _mm_mul_ps(uij3_SSE1, dr_SSE1)));
-            t2_SSE2            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
-                                                       _mm_mul_ps(uij3_SSE2, dr_SSE2)));
-            t2_SSE3            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
-                                                       _mm_mul_ps(uij3_SSE3, dr_SSE3)));
-            t2_SSE0            = _mm_sub_ps(t2_SSE0,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
-                                                       _mm_mul_ps(prod_SSE0, uij3_SSE0)));
-            t2_SSE1            = _mm_sub_ps(t2_SSE1,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
-                                                       _mm_mul_ps(prod_SSE1, uij3_SSE1)));
-            t2_SSE2            = _mm_sub_ps(t2_SSE2,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
-                                                       _mm_mul_ps(prod_SSE2, uij3_SSE2)));
-            t2_SSE3            = _mm_sub_ps(t2_SSE3,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
-                                                       _mm_mul_ps(prod_SSE3, uij3_SSE3)));
-
-            t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
-                                            _mm_mul_ps(rinv_SSE0, rinv_SSE0));
-            t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
-                                            _mm_mul_ps(rinv_SSE1, rinv_SSE1));
-            t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
-                                            _mm_mul_ps(rinv_SSE2, rinv_SSE2));
-            t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
-                                            _mm_mul_ps(rinv_SSE3, rinv_SSE3));
-
-            t3_SSE0            = _mm_sub_ps(t3_SSE0,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
-                                                       _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
-            t3_SSE1            = _mm_sub_ps(t3_SSE1,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
-                                                       _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
-            t3_SSE2            = _mm_sub_ps(t3_SSE2,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
-                                                       _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
-            t3_SSE3            = _mm_sub_ps(t3_SSE3,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
-                                                       _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
-
-            t1_SSE0            = _mm_mul_ps(rinv_SSE0,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
-                                                       _mm_add_ps(t2_SSE0, t3_SSE0)));
-            t1_SSE1            = _mm_mul_ps(rinv_SSE1,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
-                                                       _mm_add_ps(t2_SSE1, t3_SSE1)));
-            t1_SSE2            = _mm_mul_ps(rinv_SSE2,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
-                                                       _mm_add_ps(t2_SSE2, t3_SSE2)));
-            t1_SSE3            = _mm_mul_ps(rinv_SSE3,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
-                                                       _mm_add_ps(t2_SSE3, t3_SSE3)));
-
-            _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
-            dadx += 4;
-        }
-
-        /* Epilogue part, including exclusion mask */
-        for (j = nj2; j < nj3; j += UNROLLJ)
-        {
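-            /* Per-i-row exclusion masks for this j tile; excluded pairs get
-             * zero lanes and drop out of all terms below.
-             */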
-            jmask_SSE0 = _mm_load_ps((real *)emask0);
-            jmask_SSE1 = _mm_load_ps((real *)emask1);
-            jmask_SSE2 = _mm_load_ps((real *)emask2);
-            jmask_SSE3 = _mm_load_ps((real *)emask3);
-            emask0    += UNROLLJ;
-            emask1    += UNROLLJ;
-            emask2    += UNROLLJ;
-            emask3    += UNROLLJ;
-
-            /* load j atom coordinates */
-            jx_SSE            = _mm_load_ps(x_align+j);
-            jy_SSE            = _mm_load_ps(y_align+j);
-            jz_SSE            = _mm_load_ps(z_align+j);
-
-            /* Calculate distance */
-            dx_SSE0            = _mm_sub_ps(ix_SSE0, jx_SSE);
-            dy_SSE0            = _mm_sub_ps(iy_SSE0, jy_SSE);
-            dz_SSE0            = _mm_sub_ps(iz_SSE0, jz_SSE);
-            dx_SSE1            = _mm_sub_ps(ix_SSE1, jx_SSE);
-            dy_SSE1            = _mm_sub_ps(iy_SSE1, jy_SSE);
-            dz_SSE1            = _mm_sub_ps(iz_SSE1, jz_SSE);
-            dx_SSE2            = _mm_sub_ps(ix_SSE2, jx_SSE);
-            dy_SSE2            = _mm_sub_ps(iy_SSE2, jy_SSE);
-            dz_SSE2            = _mm_sub_ps(iz_SSE2, jz_SSE);
-            dx_SSE3            = _mm_sub_ps(ix_SSE3, jx_SSE);
-            dy_SSE3            = _mm_sub_ps(iy_SSE3, jy_SSE);
-            dz_SSE3            = _mm_sub_ps(iz_SSE3, jz_SSE);
-
-            /* rsq = dx*dx+dy*dy+dz*dz */
-            rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
-            rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
-            rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
-            rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
-
-            /* Combine masks */
-            jmask_SSE0         = _mm_and_ps(jmask_SSE0, imask_SSE0);
-            jmask_SSE1         = _mm_and_ps(jmask_SSE1, imask_SSE1);
-            jmask_SSE2         = _mm_and_ps(jmask_SSE2, imask_SSE2);
-            jmask_SSE3         = _mm_and_ps(jmask_SSE3, imask_SSE3);
-
-            /* Calculate 1/r */
-            rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
-            rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
-            rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
-            rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
-
-            /* Apply mask */
-            rinv_SSE0          = _mm_and_ps(rinv_SSE0, jmask_SSE0);
-            rinv_SSE1          = _mm_and_ps(rinv_SSE1, jmask_SSE1);
-            rinv_SSE2          = _mm_and_ps(rinv_SSE2, jmask_SSE2);
-            rinv_SSE3          = _mm_and_ps(rinv_SSE3, jmask_SSE3);
-
-            dr_SSE0            = _mm_mul_ps(rsq_SSE0, rinv_SSE0);
-            dr_SSE1            = _mm_mul_ps(rsq_SSE1, rinv_SSE1);
-            dr_SSE2            = _mm_mul_ps(rsq_SSE2, rinv_SSE2);
-            dr_SSE3            = _mm_mul_ps(rsq_SSE3, rinv_SSE3);
-
-            sk_aj_SSE          = _mm_load_ps(obc_param+j);
-            raj_SSE            = _mm_load_ps(gb_radius+j);
-
-            raj_inv_SSE        = gmx_mm_inv_ps(raj_SSE);
-
-            /* Evaluate influence of atom aj -> ai */
-            t1_SSE0            = _mm_add_ps(dr_SSE0, sk_aj_SSE);
-            t1_SSE1            = _mm_add_ps(dr_SSE1, sk_aj_SSE);
-            t1_SSE2            = _mm_add_ps(dr_SSE2, sk_aj_SSE);
-            t1_SSE3            = _mm_add_ps(dr_SSE3, sk_aj_SSE);
-            t2_SSE0            = _mm_sub_ps(dr_SSE0, sk_aj_SSE);
-            t2_SSE1            = _mm_sub_ps(dr_SSE1, sk_aj_SSE);
-            t2_SSE2            = _mm_sub_ps(dr_SSE2, sk_aj_SSE);
-            t2_SSE3            = _mm_sub_ps(dr_SSE3, sk_aj_SSE);
-            t3_SSE0            = _mm_sub_ps(sk_aj_SSE, dr_SSE0);
-            t3_SSE1            = _mm_sub_ps(sk_aj_SSE, dr_SSE1);
-            t3_SSE2            = _mm_sub_ps(sk_aj_SSE, dr_SSE2);
-            t3_SSE3            = _mm_sub_ps(sk_aj_SSE, dr_SSE3);
-
-            obc_mask1_SSE0     = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
-            obc_mask1_SSE1     = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
-            obc_mask1_SSE2     = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
-            obc_mask1_SSE3     = _mm_cmplt_ps(rai_SSE3, t1_SSE3);
-            obc_mask2_SSE0     = _mm_cmplt_ps(rai_SSE0, t2_SSE0);
-            obc_mask2_SSE1     = _mm_cmplt_ps(rai_SSE1, t2_SSE1);
-            obc_mask2_SSE2     = _mm_cmplt_ps(rai_SSE2, t2_SSE2);
-            obc_mask2_SSE3     = _mm_cmplt_ps(rai_SSE3, t2_SSE3);
-            obc_mask3_SSE0     = _mm_cmplt_ps(rai_SSE0, t3_SSE0);
-            obc_mask3_SSE1     = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
-            obc_mask3_SSE2     = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
-            obc_mask3_SSE3     = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
-            obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0);
-            obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1);
-            obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2);
-            obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3);
-
-            uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
-            uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
-            uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
-            uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
-            lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
-                                              _mm_andnot_ps(obc_mask2_SSE0, rai_inv_SSE0));
-            lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
-                                              _mm_andnot_ps(obc_mask2_SSE1, rai_inv_SSE1));
-            lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
-                                              _mm_andnot_ps(obc_mask2_SSE2, rai_inv_SSE2));
-            lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
-                                              _mm_andnot_ps(obc_mask2_SSE3, rai_inv_SSE3));
-            dlij_SSE0          = _mm_and_ps(one_SSE, obc_mask2_SSE0);
-            dlij_SSE1          = _mm_and_ps(one_SSE, obc_mask2_SSE1);
-            dlij_SSE2          = _mm_and_ps(one_SSE, obc_mask2_SSE2);
-            dlij_SSE3          = _mm_and_ps(one_SSE, obc_mask2_SSE3);
-
-            uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
-            uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
-            uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
-            uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
-            uij3_SSE0          = _mm_mul_ps(uij2_SSE0, uij_SSE0);
-            uij3_SSE1          = _mm_mul_ps(uij2_SSE1, uij_SSE1);
-            uij3_SSE2          = _mm_mul_ps(uij2_SSE2, uij_SSE2);
-            uij3_SSE3          = _mm_mul_ps(uij2_SSE3, uij_SSE3);
-            lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
-            lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
-            lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
-            lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
-            lij3_SSE0          = _mm_mul_ps(lij2_SSE0, lij_SSE0);
-            lij3_SSE1          = _mm_mul_ps(lij2_SSE1, lij_SSE1);
-            lij3_SSE2          = _mm_mul_ps(lij2_SSE2, lij_SSE2);
-            lij3_SSE3          = _mm_mul_ps(lij2_SSE3, lij_SSE3);
-
-            diff2_SSE0         = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
-            diff2_SSE1         = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
-            diff2_SSE2         = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
-            diff2_SSE3         = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
-            lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
-            lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
-            lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
-            lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
-            sk2_aj_SSE         = _mm_mul_ps(sk_aj_SSE, sk_aj_SSE);
-            sk2_rinv_SSE0      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE0);
-            sk2_rinv_SSE1      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE1);
-            sk2_rinv_SSE2      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE2);
-            sk2_rinv_SSE3      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE3);
-            prod_SSE0          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
-            prod_SSE1          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
-            prod_SSE2          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
-            prod_SSE3          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
-
-            logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
-            logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
-            logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
-            logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
-
-            t1_SSE0            = _mm_sub_ps(lij_SSE0, uij_SSE0);
-            t1_SSE1            = _mm_sub_ps(lij_SSE1, uij_SSE1);
-            t1_SSE2            = _mm_sub_ps(lij_SSE2, uij_SSE2);
-            t1_SSE3            = _mm_sub_ps(lij_SSE3, uij_SSE3);
-            t2_SSE0            = _mm_mul_ps(diff2_SSE0,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
-                                                       prod_SSE0));
-            t2_SSE1            = _mm_mul_ps(diff2_SSE1,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
-                                                       prod_SSE1));
-            t2_SSE2            = _mm_mul_ps(diff2_SSE2,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
-                                                       prod_SSE2));
-            t2_SSE3            = _mm_mul_ps(diff2_SSE3,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
-                                                       prod_SSE3));
-
-            t3_SSE0            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
-            t3_SSE1            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
-            t3_SSE2            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
-            t3_SSE3            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
-            t1_SSE0            = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
-            t1_SSE1            = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
-            t1_SSE2            = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
-            t1_SSE3            = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
-            t4_SSE0            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE0, lij_SSE0));
-            t4_SSE1            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE1, lij_SSE1));
-            t4_SSE2            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE2, lij_SSE2));
-            t4_SSE3            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE3, lij_SSE3));
-            t4_SSE0            = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
-            t4_SSE1            = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
-            t4_SSE2            = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
-            t4_SSE3            = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
-            t1_SSE0            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
-            t1_SSE1            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
-            t1_SSE2            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
-            t1_SSE3            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
-
-            sum_ai_SSE0        = _mm_add_ps(sum_ai_SSE0, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
-            sum_ai_SSE1        = _mm_add_ps(sum_ai_SSE1, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
-            sum_ai_SSE2        = _mm_add_ps(sum_ai_SSE2, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
-            sum_ai_SSE3        = _mm_add_ps(sum_ai_SSE3, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
-
-            t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
-                                            _mm_mul_ps(prod_SSE0, lij3_SSE0));
-            t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
-                                            _mm_mul_ps(prod_SSE1, lij3_SSE1));
-            t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
-                                            _mm_mul_ps(prod_SSE2, lij3_SSE2));
-            t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
-                                            _mm_mul_ps(prod_SSE3, lij3_SSE3));
-            t1_SSE0            = _mm_sub_ps(t1_SSE0,
-                                            _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
-                                                                  _mm_mul_ps(lij3_SSE0, dr_SSE0))));
-            t1_SSE1            = _mm_sub_ps(t1_SSE1,
-                                            _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
-                                                                  _mm_mul_ps(lij3_SSE1, dr_SSE1))));
-            t1_SSE2            = _mm_sub_ps(t1_SSE2,
-                                            _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
-                                                                  _mm_mul_ps(lij3_SSE2, dr_SSE2))));
-            t1_SSE3            = _mm_sub_ps(t1_SSE3,
-                                            _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
-                                                                  _mm_mul_ps(lij3_SSE3, dr_SSE3))));
-
-            t2_SSE0            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
-                                                       _mm_mul_ps(uij3_SSE0, dr_SSE0)));
-            t2_SSE1            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
-                                                       _mm_mul_ps(uij3_SSE1, dr_SSE1)));
-            t2_SSE2            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
-                                                       _mm_mul_ps(uij3_SSE2, dr_SSE2)));
-            t2_SSE3            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
-                                                       _mm_mul_ps(uij3_SSE3, dr_SSE3)));
-            t2_SSE0            = _mm_sub_ps(t2_SSE0,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
-                                                       _mm_mul_ps(prod_SSE0, uij3_SSE0)));
-            t2_SSE1            = _mm_sub_ps(t2_SSE1,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
-                                                       _mm_mul_ps(prod_SSE1, uij3_SSE1)));
-            t2_SSE2            = _mm_sub_ps(t2_SSE2,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
-                                                       _mm_mul_ps(prod_SSE2, uij3_SSE2)));
-            t2_SSE3            = _mm_sub_ps(t2_SSE3,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
-                                                       _mm_mul_ps(prod_SSE3, uij3_SSE3)));
-            t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
-                                            _mm_mul_ps(rinv_SSE0, rinv_SSE0));
-            t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
-                                            _mm_mul_ps(rinv_SSE1, rinv_SSE1));
-            t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
-                                            _mm_mul_ps(rinv_SSE2, rinv_SSE2));
-            t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
-                                            _mm_mul_ps(rinv_SSE3, rinv_SSE3));
-            t3_SSE0            = _mm_sub_ps(t3_SSE0,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
-                                                       _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
-            t3_SSE1            = _mm_sub_ps(t3_SSE1,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
-                                                       _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
-            t3_SSE2            = _mm_sub_ps(t3_SSE2,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
-                                                       _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
-            t3_SSE3            = _mm_sub_ps(t3_SSE3,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
-                                                       _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
-
-            t1_SSE0            = _mm_mul_ps(rinv_SSE0,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
-                                                       _mm_add_ps(t2_SSE0, t3_SSE0)));
-            t1_SSE1            = _mm_mul_ps(rinv_SSE1,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
-                                                       _mm_add_ps(t2_SSE1, t3_SSE1)));
-            t1_SSE2            = _mm_mul_ps(rinv_SSE2,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
-                                                       _mm_add_ps(t2_SSE2, t3_SSE2)));
-            t1_SSE3            = _mm_mul_ps(rinv_SSE3,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
-                                                       _mm_add_ps(t2_SSE3, t3_SSE3)));
-
-            _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
-            dadx += 4;
-
-            /* Evaluate influence of atom ai -> aj */
-            t1_SSE0            = _mm_add_ps(dr_SSE0, sk_ai_SSE0);
-            t1_SSE1            = _mm_add_ps(dr_SSE1, sk_ai_SSE1);
-            t1_SSE2            = _mm_add_ps(dr_SSE2, sk_ai_SSE2);
-            t1_SSE3            = _mm_add_ps(dr_SSE3, sk_ai_SSE3);
-            t2_SSE0            = _mm_sub_ps(dr_SSE0, sk_ai_SSE0);
-            t2_SSE1            = _mm_sub_ps(dr_SSE1, sk_ai_SSE1);
-            t2_SSE2            = _mm_sub_ps(dr_SSE2, sk_ai_SSE2);
-            t2_SSE3            = _mm_sub_ps(dr_SSE3, sk_ai_SSE3);
-            t3_SSE0            = _mm_sub_ps(sk_ai_SSE0, dr_SSE0);
-            t3_SSE1            = _mm_sub_ps(sk_ai_SSE1, dr_SSE1);
-            t3_SSE2            = _mm_sub_ps(sk_ai_SSE2, dr_SSE2);
-            t3_SSE3            = _mm_sub_ps(sk_ai_SSE3, dr_SSE3);
-
-            obc_mask1_SSE0     = _mm_cmplt_ps(raj_SSE, t1_SSE0);
-            obc_mask1_SSE1     = _mm_cmplt_ps(raj_SSE, t1_SSE1);
-            obc_mask1_SSE2     = _mm_cmplt_ps(raj_SSE, t1_SSE2);
-            obc_mask1_SSE3     = _mm_cmplt_ps(raj_SSE, t1_SSE3);
-            obc_mask2_SSE0     = _mm_cmplt_ps(raj_SSE, t2_SSE0);
-            obc_mask2_SSE1     = _mm_cmplt_ps(raj_SSE, t2_SSE1);
-            obc_mask2_SSE2     = _mm_cmplt_ps(raj_SSE, t2_SSE2);
-            obc_mask2_SSE3     = _mm_cmplt_ps(raj_SSE, t2_SSE3);
-            obc_mask3_SSE0     = _mm_cmplt_ps(raj_SSE, t3_SSE0);
-            obc_mask3_SSE1     = _mm_cmplt_ps(raj_SSE, t3_SSE1);
-            obc_mask3_SSE2     = _mm_cmplt_ps(raj_SSE, t3_SSE2);
-            obc_mask3_SSE3     = _mm_cmplt_ps(raj_SSE, t3_SSE3);
-            obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0);
-            obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1);
-            obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2);
-            obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3);
-
-            uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
-            uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
-            uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
-            uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
-            lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
-                                              _mm_andnot_ps(obc_mask2_SSE0, raj_inv_SSE));
-            lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
-                                              _mm_andnot_ps(obc_mask2_SSE1, raj_inv_SSE));
-            lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
-                                              _mm_andnot_ps(obc_mask2_SSE2, raj_inv_SSE));
-            lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
-                                              _mm_andnot_ps(obc_mask2_SSE3, raj_inv_SSE));
-            dlij_SSE0          = _mm_and_ps(one_SSE, obc_mask2_SSE0);
-            dlij_SSE1          = _mm_and_ps(one_SSE, obc_mask2_SSE1);
-            dlij_SSE2          = _mm_and_ps(one_SSE, obc_mask2_SSE2);
-            dlij_SSE3          = _mm_and_ps(one_SSE, obc_mask2_SSE3);
-
-            uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
-            uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
-            uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
-            uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
-            uij3_SSE0          = _mm_mul_ps(uij2_SSE0, uij_SSE0);
-            uij3_SSE1          = _mm_mul_ps(uij2_SSE1, uij_SSE1);
-            uij3_SSE2          = _mm_mul_ps(uij2_SSE2, uij_SSE2);
-            uij3_SSE3          = _mm_mul_ps(uij2_SSE3, uij_SSE3);
-            lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
-            lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
-            lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
-            lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
-            lij3_SSE0          = _mm_mul_ps(lij2_SSE0, lij_SSE0);
-            lij3_SSE1          = _mm_mul_ps(lij2_SSE1, lij_SSE1);
-            lij3_SSE2          = _mm_mul_ps(lij2_SSE2, lij_SSE2);
-            lij3_SSE3          = _mm_mul_ps(lij2_SSE3, lij_SSE3);
-
-            diff2_SSE0         = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
-            diff2_SSE1         = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
-            diff2_SSE2         = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
-            diff2_SSE3         = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
-            lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
-            lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
-            lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
-            lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
-            sk2_rinv_SSE0      = _mm_mul_ps(sk2_ai_SSE0, rinv_SSE0);
-            sk2_rinv_SSE1      = _mm_mul_ps(sk2_ai_SSE1, rinv_SSE1);
-            sk2_rinv_SSE2      = _mm_mul_ps(sk2_ai_SSE2, rinv_SSE2);
-            sk2_rinv_SSE3      = _mm_mul_ps(sk2_ai_SSE3, rinv_SSE3);
-            prod_SSE0          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
-            prod_SSE1          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
-            prod_SSE2          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
-            prod_SSE3          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
-
-            logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
-            logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
-            logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
-            logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
-            t1_SSE0            = _mm_sub_ps(lij_SSE0, uij_SSE0);
-            t1_SSE1            = _mm_sub_ps(lij_SSE1, uij_SSE1);
-            t1_SSE2            = _mm_sub_ps(lij_SSE2, uij_SSE2);
-            t1_SSE3            = _mm_sub_ps(lij_SSE3, uij_SSE3);
-            t2_SSE0            = _mm_mul_ps(diff2_SSE0,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
-                                                       prod_SSE0));
-            t2_SSE1            = _mm_mul_ps(diff2_SSE1,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
-                                                       prod_SSE1));
-            t2_SSE2            = _mm_mul_ps(diff2_SSE2,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
-                                                       prod_SSE2));
-            t2_SSE3            = _mm_mul_ps(diff2_SSE3,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
-                                                       prod_SSE3));
-            t3_SSE0            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
-            t3_SSE1            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
-            t3_SSE2            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
-            t3_SSE3            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
-            t1_SSE0            = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
-            t1_SSE1            = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
-            t1_SSE2            = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
-            t1_SSE3            = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
-            t4_SSE0            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE0));
-            t4_SSE1            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE1));
-            t4_SSE2            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE2));
-            t4_SSE3            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE3));
-            t4_SSE0            = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
-            t4_SSE1            = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
-            t4_SSE2            = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
-            t4_SSE3            = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
-            t1_SSE0            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
-            t1_SSE1            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
-            t1_SSE2            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
-            t1_SSE3            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
-
-            _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
-                                            gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0, obc_mask1_SSE0),
-                                                           _mm_and_ps(t1_SSE1, obc_mask1_SSE1),
-                                                           _mm_and_ps(t1_SSE2, obc_mask1_SSE2),
-                                                           _mm_and_ps(t1_SSE3, obc_mask1_SSE3))));
-
-            t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
-                                            _mm_mul_ps(prod_SSE0, lij3_SSE0));
-            t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
-                                            _mm_mul_ps(prod_SSE1, lij3_SSE1));
-            t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
-                                            _mm_mul_ps(prod_SSE2, lij3_SSE2));
-            t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
-                                            _mm_mul_ps(prod_SSE3, lij3_SSE3));
-            t1_SSE0            = _mm_sub_ps(t1_SSE0,
-                                            _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
-                                                                  _mm_mul_ps(lij3_SSE0, dr_SSE0))));
-            t1_SSE1            = _mm_sub_ps(t1_SSE1,
-                                            _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
-                                                                  _mm_mul_ps(lij3_SSE1, dr_SSE1))));
-            t1_SSE2            = _mm_sub_ps(t1_SSE2,
-                                            _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
-                                                                  _mm_mul_ps(lij3_SSE2, dr_SSE2))));
-            t1_SSE3            = _mm_sub_ps(t1_SSE3,
-                                            _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
-                                                                  _mm_mul_ps(lij3_SSE3, dr_SSE3))));
-            t2_SSE0            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
-                                                       _mm_mul_ps(uij3_SSE0, dr_SSE0)));
-            t2_SSE1            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
-                                                       _mm_mul_ps(uij3_SSE1, dr_SSE1)));
-            t2_SSE2            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
-                                                       _mm_mul_ps(uij3_SSE2, dr_SSE2)));
-            t2_SSE3            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
-                                                       _mm_mul_ps(uij3_SSE3, dr_SSE3)));
-            t2_SSE0            = _mm_sub_ps(t2_SSE0,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
-                                                       _mm_mul_ps(prod_SSE0, uij3_SSE0)));
-            t2_SSE1            = _mm_sub_ps(t2_SSE1,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
-                                                       _mm_mul_ps(prod_SSE1, uij3_SSE1)));
-            t2_SSE2            = _mm_sub_ps(t2_SSE2,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
-                                                       _mm_mul_ps(prod_SSE2, uij3_SSE2)));
-            t2_SSE3            = _mm_sub_ps(t2_SSE3,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
-                                                       _mm_mul_ps(prod_SSE3, uij3_SSE3)));
-
-            t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
-                                            _mm_mul_ps(rinv_SSE0, rinv_SSE0));
-            t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
-                                            _mm_mul_ps(rinv_SSE1, rinv_SSE1));
-            t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
-                                            _mm_mul_ps(rinv_SSE2, rinv_SSE2));
-            t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
-                                            _mm_mul_ps(rinv_SSE3, rinv_SSE3));
-
-            t3_SSE0            = _mm_sub_ps(t3_SSE0,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
-                                                       _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
-            t3_SSE1            = _mm_sub_ps(t3_SSE1,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
-                                                       _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
-            t3_SSE2            = _mm_sub_ps(t3_SSE2,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
-                                                       _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
-            t3_SSE3            = _mm_sub_ps(t3_SSE3,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
-                                                       _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
-
-
-            t1_SSE0            = _mm_mul_ps(rinv_SSE0,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
-                                                       _mm_add_ps(t2_SSE0, t3_SSE0)));
-            t1_SSE1            = _mm_mul_ps(rinv_SSE1,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
-                                                       _mm_add_ps(t2_SSE1, t3_SSE1)));
-            t1_SSE2            = _mm_mul_ps(rinv_SSE2,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
-                                                       _mm_add_ps(t2_SSE2, t3_SSE2)));
-            t1_SSE3            = _mm_mul_ps(rinv_SSE3,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
-                                                       _mm_add_ps(t2_SSE3, t3_SSE3)));
-
-            _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
-            dadx += 4;
-        }
-        _MM_TRANSPOSE4_PS(sum_ai_SSE0, sum_ai_SSE1, sum_ai_SSE2, sum_ai_SSE3);
-        sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0, sum_ai_SSE1);
-        sum_ai_SSE2 = _mm_add_ps(sum_ai_SSE2, sum_ai_SSE3);
-        sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0, sum_ai_SSE2);
-        _mm_store_ps(work+i, _mm_add_ps(sum_ai_SSE0, _mm_load_ps(work+i)));
-    }
-
-
-    for (i = 0; i < natoms/2+1; i++)
-    {
-        work[i] += work[natoms+i];
-    }
-
-    /* Parallel summations would go here if ever implemented with DD */
-
-    if (gb_algorithm == egbHCT)
-    {
-        /* HCT */
-        for (i = 0; i < natoms; i++)
-        {
-            if (born->use[i] != 0)
-            {
-                rai     = top->atomtypes.gb_radius[mdatoms->typeA[i]]-born->gb_doffset;
-                sum_ai  = 1.0/rai - work[i];
-                min_rad = rai + born->gb_doffset;
-                rad     = 1.0/sum_ai;
-
-                born->bRad[i]   = rad > min_rad ? rad : min_rad;
-                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
-            }
-        }
-
-    }
-    else
-    {
-        /* OBC */
-
-        /* Calculate the radii */
-        for (i = 0; i < natoms; i++)
-        {
-
-            if (born->use[i] != 0)
-            {
-                rai        = top->atomtypes.gb_radius[mdatoms->typeA[i]];
-                rai_inv2   = 1.0/rai;
-                rai        = rai-born->gb_doffset;
-                rai_inv    = 1.0/rai;
-                sum_ai     = rai * work[i];
-                sum_ai2    = sum_ai  * sum_ai;
-                sum_ai3    = sum_ai2 * sum_ai;
-
-                tsum          = tanh(born->obc_alpha*sum_ai-born->obc_beta*sum_ai2+born->obc_gamma*sum_ai3);
-                born->bRad[i] = rai_inv - tsum*rai_inv2;
-                born->bRad[i] = 1.0 / born->bRad[i];
-
-                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
-
-                tchain         = rai * (born->obc_alpha-2*born->obc_beta*sum_ai+3*born->obc_gamma*sum_ai2);
-                born->drobc[i] = (1.0-tsum*tsum)*tchain*rai_inv2;
-            }
-        }
-    }
-
-    return 0;
-}
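
The vectorized loop above reproduces, lane by lane, the scalar OBC update in the
tail of the function. A minimal sketch of that update, assuming the same
quantities as the kernel (intrinsic radius, dielectric offset, the pairwise
integral sum reduced into work[], and the obc_alpha/beta/gamma parameters):

    #include <math.h>

    /* Sketch of the OBC effective Born radius; the names mirror the kernel
     * above but are assumptions here, not an exact excerpt. */
    static double obc_radius(double rho_i, double doffset, double work_i,
                             double alpha, double beta, double gamma)
    {
        double rinv2 = 1.0/rho_i;           /* 1/rho_i                        */
        double roff  = rho_i - doffset;     /* offset radius rho_i - d        */
        double rinv  = 1.0/roff;
        double psi   = roff*work_i;         /* Psi_i = (rho_i - d)*sum_j I_ij */
        double tsum  = tanh(alpha*psi - beta*psi*psi + gamma*psi*psi*psi);
        return 1.0/(rinv - tsum*rinv2);     /* b_i, as stored in born->bRad   */
    }
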
-
-
-
-
-
-
-
-
-int
-genborn_allvsall_calc_chainrule_sse2_single(t_forcerec *           fr,
-                                            t_mdatoms *            mdatoms,
-                                            gmx_genborn_t *        born,
-                                            real *                 x,
-                                            real *                 f,
-                                            int                    gb_algorithm,
-                                            void *                 paadata)
-{
-    gmx_allvsallgb2_data_t *aadata;
-    int                     natoms;
-    int                     ni0, ni1;
-    int                     nj0, nj1, nj2, nj3;
-    int                     i, j, k, n;
-    int                     idx;
-    int              *      mask;
-    int              *      pmask0;
-    int              *      emask0;
-    int              *      jindex;
-
-    real                    ix, iy, iz;
-    real                    fix, fiy, fiz;
-    real                    jx, jy, jz;
-    real                    dx, dy, dz;
-    real                    tx, ty, tz;
-    real                    rbai, rbaj, fgb, fgb_ai, rbi;
-    real              *     rb;
-    real              *     dadx;
-    real              *     x_align;
-    real              *     y_align;
-    real              *     z_align;
-    real              *     fx_align;
-    real              *     fy_align;
-    real              *     fz_align;
-    real                    tmpsum[4];
-
-    __m128                  jmask_SSE0, jmask_SSE1, jmask_SSE2, jmask_SSE3;
-    __m128                  ix_SSE0, iy_SSE0, iz_SSE0;
-    __m128                  ix_SSE1, iy_SSE1, iz_SSE1;
-    __m128                  ix_SSE2, iy_SSE2, iz_SSE2;
-    __m128                  ix_SSE3, iy_SSE3, iz_SSE3;
-    __m128                  fix_SSE0, fiy_SSE0, fiz_SSE0;
-    __m128                  fix_SSE1, fiy_SSE1, fiz_SSE1;
-    __m128                  fix_SSE2, fiy_SSE2, fiz_SSE2;
-    __m128                  fix_SSE3, fiy_SSE3, fiz_SSE3;
-    __m128                  rbai_SSE0, rbai_SSE1, rbai_SSE2, rbai_SSE3;
-    __m128                  imask_SSE0, imask_SSE1, imask_SSE2, imask_SSE3;
-    __m128                  jx_SSE, jy_SSE, jz_SSE, rbaj_SSE;
-    __m128                  dx_SSE0, dy_SSE0, dz_SSE0;
-    __m128                  dx_SSE1, dy_SSE1, dz_SSE1;
-    __m128                  dx_SSE2, dy_SSE2, dz_SSE2;
-    __m128                  dx_SSE3, dy_SSE3, dz_SSE3;
-    __m128                  fgb_SSE0, fgb_ai_SSE0;
-    __m128                  fgb_SSE1, fgb_ai_SSE1;
-    __m128                  fgb_SSE2, fgb_ai_SSE2;
-    __m128                  fgb_SSE3, fgb_ai_SSE3;
-    __m128                  tx_SSE0, ty_SSE0, tz_SSE0;
-    __m128                  tx_SSE1, ty_SSE1, tz_SSE1;
-    __m128                  tx_SSE2, ty_SSE2, tz_SSE2;
-    __m128                  tx_SSE3, ty_SSE3, tz_SSE3;
-    __m128                  t1, t2;
-
-    natoms              = mdatoms->nr;
-    ni0                 = 0;
-    ni1                 = mdatoms->homenr;
-    dadx                = fr->dadx;
-
-    aadata = (gmx_allvsallgb2_data_t *)paadata;
-
-    x_align  = aadata->x_align;
-    y_align  = aadata->y_align;
-    z_align  = aadata->z_align;
-    fx_align = aadata->fx_align;
-    fy_align = aadata->fy_align;
-    fz_align = aadata->fz_align;
-
-    jindex    = aadata->jindex_gb;
-    dadx      = fr->dadx;
-
-    n  = 0;
-    rb = aadata->work;
-
-    /* Precompute the algorithm-specific prefactor rb for the Born radius chain-rule term */
-    if (gb_algorithm == egbSTILL)
-    {
-        for (i = 0; i < natoms; i++)
-        {
-            rbi   = born->bRad[i];
-            rb[i] = (2 * rbi * rbi * fr->dvda[i])/ONE_4PI_EPS0;
-        }
-    }
-    else if (gb_algorithm == egbHCT)
-    {
-        for (i = 0; i < natoms; i++)
-        {
-            rbi   = born->bRad[i];
-            rb[i] = rbi * rbi * fr->dvda[i];
-        }
-    }
-    else if (gb_algorithm == egbOBC)
-    {
-        for (idx = 0; idx < natoms; idx++)
-        {
-            rbi     = born->bRad[idx];
-            rb[idx] = rbi * rbi * born->drobc[idx] * fr->dvda[idx];
-        }
-    }
-
-    for (i = 0; i < 2*natoms; i++)
-    {
-        fx_align[i]       = 0;
-        fy_align[i]       = 0;
-        fz_align[i]       = 0;
-    }
-
-
-    for (i = 0; i < natoms; i++)
-    {
-        rb[i+natoms] = rb[i];
-    }
-
-    for (i = ni0; i < ni1; i += UNROLLI)
-    {
-        /* We assume shifts are NOT used for all-vs-all interactions */
-
-        /* Load i atom data */
-        ix_SSE0          = _mm_load1_ps(x_align+i);
-        iy_SSE0          = _mm_load1_ps(y_align+i);
-        iz_SSE0          = _mm_load1_ps(z_align+i);
-        ix_SSE1          = _mm_load1_ps(x_align+i+1);
-        iy_SSE1          = _mm_load1_ps(y_align+i+1);
-        iz_SSE1          = _mm_load1_ps(z_align+i+1);
-        ix_SSE2          = _mm_load1_ps(x_align+i+2);
-        iy_SSE2          = _mm_load1_ps(y_align+i+2);
-        iz_SSE2          = _mm_load1_ps(z_align+i+2);
-        ix_SSE3          = _mm_load1_ps(x_align+i+3);
-        iy_SSE3          = _mm_load1_ps(y_align+i+3);
-        iz_SSE3          = _mm_load1_ps(z_align+i+3);
-
-        fix_SSE0         = _mm_setzero_ps();
-        fiy_SSE0         = _mm_setzero_ps();
-        fiz_SSE0         = _mm_setzero_ps();
-        fix_SSE1         = _mm_setzero_ps();
-        fiy_SSE1         = _mm_setzero_ps();
-        fiz_SSE1         = _mm_setzero_ps();
-        fix_SSE2         = _mm_setzero_ps();
-        fiy_SSE2         = _mm_setzero_ps();
-        fiz_SSE2         = _mm_setzero_ps();
-        fix_SSE3         = _mm_setzero_ps();
-        fiy_SSE3         = _mm_setzero_ps();
-        fiz_SSE3         = _mm_setzero_ps();
-
-        rbai_SSE0        = _mm_load1_ps(rb+i);
-        rbai_SSE1        = _mm_load1_ps(rb+i+1);
-        rbai_SSE2        = _mm_load1_ps(rb+i+2);
-        rbai_SSE3        = _mm_load1_ps(rb+i+3);
-
-        /* Load limits for loop over neighbors */
-        nj0              = jindex[4*i];
-        nj3              = jindex[4*i+3];
-
-        /* No masks necessary, since the stored chain rule derivatives will be zero in those cases! */
-        for (j = nj0; j < nj3; j += UNROLLJ)
-        {
-            /* load j atom coordinates */
-            jx_SSE           = _mm_load_ps(x_align+j);
-            jy_SSE           = _mm_load_ps(y_align+j);
-            jz_SSE           = _mm_load_ps(z_align+j);
-
-            /* Calculate distance */
-            dx_SSE0          = _mm_sub_ps(ix_SSE0, jx_SSE);
-            dy_SSE0          = _mm_sub_ps(iy_SSE0, jy_SSE);
-            dz_SSE0          = _mm_sub_ps(iz_SSE0, jz_SSE);
-            dx_SSE1          = _mm_sub_ps(ix_SSE1, jx_SSE);
-            dy_SSE1          = _mm_sub_ps(iy_SSE1, jy_SSE);
-            dz_SSE1          = _mm_sub_ps(iz_SSE1, jz_SSE);
-            dx_SSE2          = _mm_sub_ps(ix_SSE2, jx_SSE);
-            dy_SSE2          = _mm_sub_ps(iy_SSE2, jy_SSE);
-            dz_SSE2          = _mm_sub_ps(iz_SSE2, jz_SSE);
-            dx_SSE3          = _mm_sub_ps(ix_SSE3, jx_SSE);
-            dy_SSE3          = _mm_sub_ps(iy_SSE3, jy_SSE);
-            dz_SSE3          = _mm_sub_ps(iz_SSE3, jz_SSE);
-
-            rbaj_SSE         = _mm_load_ps(rb+j);
-
-            fgb_SSE0         = _mm_mul_ps(rbai_SSE0, _mm_load_ps(dadx));
-            dadx            += 4;
-            fgb_SSE1         = _mm_mul_ps(rbai_SSE1, _mm_load_ps(dadx));
-            dadx            += 4;
-            fgb_SSE2         = _mm_mul_ps(rbai_SSE2, _mm_load_ps(dadx));
-            dadx            += 4;
-            fgb_SSE3         = _mm_mul_ps(rbai_SSE3, _mm_load_ps(dadx));
-            dadx            += 4;
-
-            fgb_ai_SSE0      = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx));
-            dadx            += 4;
-            fgb_ai_SSE1      = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx));
-            dadx            += 4;
-            fgb_ai_SSE2      = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx));
-            dadx            += 4;
-            fgb_ai_SSE3      = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx));
-            dadx            += 4;
-
-            /* Total force between ai and aj is the sum of ai->aj and aj->ai */
-            fgb_SSE0         = _mm_add_ps(fgb_SSE0, fgb_ai_SSE0);
-            fgb_SSE1         = _mm_add_ps(fgb_SSE1, fgb_ai_SSE1);
-            fgb_SSE2         = _mm_add_ps(fgb_SSE2, fgb_ai_SSE2);
-            fgb_SSE3         = _mm_add_ps(fgb_SSE3, fgb_ai_SSE3);
-
-            /* Calculate temporary vectorial force */
-            tx_SSE0            = _mm_mul_ps(fgb_SSE0, dx_SSE0);
-            ty_SSE0            = _mm_mul_ps(fgb_SSE0, dy_SSE0);
-            tz_SSE0            = _mm_mul_ps(fgb_SSE0, dz_SSE0);
-            tx_SSE1            = _mm_mul_ps(fgb_SSE1, dx_SSE1);
-            ty_SSE1            = _mm_mul_ps(fgb_SSE1, dy_SSE1);
-            tz_SSE1            = _mm_mul_ps(fgb_SSE1, dz_SSE1);
-            tx_SSE2            = _mm_mul_ps(fgb_SSE2, dx_SSE2);
-            ty_SSE2            = _mm_mul_ps(fgb_SSE2, dy_SSE2);
-            tz_SSE2            = _mm_mul_ps(fgb_SSE2, dz_SSE2);
-            tx_SSE3            = _mm_mul_ps(fgb_SSE3, dx_SSE3);
-            ty_SSE3            = _mm_mul_ps(fgb_SSE3, dy_SSE3);
-            tz_SSE3            = _mm_mul_ps(fgb_SSE3, dz_SSE3);
-
-            /* Increment i atom force */
-            fix_SSE0          = _mm_add_ps(fix_SSE0, tx_SSE0);
-            fiy_SSE0          = _mm_add_ps(fiy_SSE0, ty_SSE0);
-            fiz_SSE0          = _mm_add_ps(fiz_SSE0, tz_SSE0);
-            fix_SSE1          = _mm_add_ps(fix_SSE1, tx_SSE1);
-            fiy_SSE1          = _mm_add_ps(fiy_SSE1, ty_SSE1);
-            fiz_SSE1          = _mm_add_ps(fiz_SSE1, tz_SSE1);
-            fix_SSE2          = _mm_add_ps(fix_SSE2, tx_SSE2);
-            fiy_SSE2          = _mm_add_ps(fiy_SSE2, ty_SSE2);
-            fiz_SSE2          = _mm_add_ps(fiz_SSE2, tz_SSE2);
-            fix_SSE3          = _mm_add_ps(fix_SSE3, tx_SSE3);
-            fiy_SSE3          = _mm_add_ps(fiy_SSE3, ty_SSE3);
-            fiz_SSE3          = _mm_add_ps(fiz_SSE3, tz_SSE3);
-
-            /* Decrement j atom force */
-            _mm_store_ps(fx_align+j,
-                         _mm_sub_ps( _mm_load_ps(fx_align+j), gmx_mm_sum4_ps(tx_SSE0, tx_SSE1, tx_SSE2, tx_SSE3) ));
-            _mm_store_ps(fy_align+j,
-                         _mm_sub_ps( _mm_load_ps(fy_align+j), gmx_mm_sum4_ps(ty_SSE0, ty_SSE1, ty_SSE2, ty_SSE3) ));
-            _mm_store_ps(fz_align+j,
-                         _mm_sub_ps( _mm_load_ps(fz_align+j), gmx_mm_sum4_ps(tz_SSE0, tz_SSE1, tz_SSE2, tz_SSE3) ));
-        }
-        /* Reduce and add i forces to memory (no shift forces in the all-vs-all kernel) */
-        _MM_TRANSPOSE4_PS(fix_SSE0, fix_SSE1, fix_SSE2, fix_SSE3);
-        fix_SSE0 = _mm_add_ps(fix_SSE0, fix_SSE1);
-        fix_SSE2 = _mm_add_ps(fix_SSE2, fix_SSE3);
-        fix_SSE0 = _mm_add_ps(fix_SSE0, fix_SSE2);
-        _mm_store_ps(fx_align+i, _mm_add_ps(fix_SSE0, _mm_load_ps(fx_align+i)));
-
-        _MM_TRANSPOSE4_PS(fiy_SSE0, fiy_SSE1, fiy_SSE2, fiy_SSE3);
-        fiy_SSE0 = _mm_add_ps(fiy_SSE0, fiy_SSE1);
-        fiy_SSE2 = _mm_add_ps(fiy_SSE2, fiy_SSE3);
-        fiy_SSE0 = _mm_add_ps(fiy_SSE0, fiy_SSE2);
-        _mm_store_ps(fy_align+i, _mm_add_ps(fiy_SSE0, _mm_load_ps(fy_align+i)));
-
-        _MM_TRANSPOSE4_PS(fiz_SSE0, fiz_SSE1, fiz_SSE2, fiz_SSE3);
-        fiz_SSE0 = _mm_add_ps(fiz_SSE0, fiz_SSE1);
-        fiz_SSE2 = _mm_add_ps(fiz_SSE2, fiz_SSE3);
-        fiz_SSE0 = _mm_add_ps(fiz_SSE0, fiz_SSE2);
-        _mm_store_ps(fz_align+i, _mm_add_ps(fiz_SSE0, _mm_load_ps(fz_align+i)));
-    }
-
-    for (i = 0; i < natoms; i++)
-    {
-        f[3*i]       += fx_align[i] + fx_align[natoms+i];
-        f[3*i+1]     += fy_align[i] + fy_align[natoms+i];
-        f[3*i+2]     += fz_align[i] + fz_align[natoms+i];
-    }
-
-    return 0;
-}
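
Per ij pair, the chain-rule contraction above reduces to a scalar update of the
following shape; a sketch assuming the prefactor rb from the prologue
(2*b^2*dvda/ONE_4PI_EPS0 for Still, b^2*dvda for HCT, b^2*drobc*dvda for OBC)
and the stored radius derivatives dadx:

    /* Sketch of one pair's GB chain-rule force; names are assumptions
     * mirroring the kernel above, not an exact excerpt. */
    static void gb_chainrule_pair(double rb_i, double rb_j,
                                  double dadx_ij, double dadx_ji,
                                  const double dx[3], double fi[3], double fj[3])
    {
        /* total scalar force is the sum of the ai->aj and aj->ai terms */
        double fgb = rb_i*dadx_ij + rb_j*dadx_ji;
        int    d;

        for (d = 0; d < 3; d++)
        {
            fi[d] += fgb*dx[d];   /* increment i force */
            fj[d] -= fgb*dx[d];   /* decrement j force */
        }
    }
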
-
-#else
-/* dummy variable when not using SSE */
-int genborn_allvsall_sse2_single_dummy;
-
-
-#endif
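
Throughout these kernels, conditional selects are built from plain SSE2 logic
operations, since blend instructions were not assumed to be available. The
recurring idiom, as a standalone sketch:

    #include <emmintrin.h>

    /* Branchless select: returns a where mask bits are set, b elsewhere.
     * This is the _mm_or_ps/_mm_and_ps/_mm_andnot_ps pattern used above. */
    static __m128 sse2_select_ps(__m128 mask, __m128 a, __m128 b)
    {
        return _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b));
    }
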
diff --git a/src/gromacs/mdlib/genborn_allvsall_sse2_single.h b/src/gromacs/mdlib/genborn_allvsall_sse2_single.h
deleted file mode 100644 (file)
index d1e908a..0000000
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2009, The GROMACS Development Team.
- * Copyright (c) 2010,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#ifndef _GENBORN_ALLVSALL_SSE2_SINGLE_H
-#define _GENBORN_ALLVSALL_SSE2_SINGLE_H
-
-#include "gromacs/legacyheaders/typedefs.h"
-#include "gromacs/legacyheaders/types/simple.h"
-
-int
-genborn_allvsall_calc_still_radii_sse2_single(t_forcerec *           fr,
-                                              t_mdatoms *            mdatoms,
-                                              gmx_genborn_t *        born,
-                                              gmx_localtop_t *       top,
-                                              real *                 x,
-                                              t_commrec *            cr,
-                                              void *                 work);
-
-int
-genborn_allvsall_calc_hct_obc_radii_sse2_single(t_forcerec *           fr,
-                                                t_mdatoms *            mdatoms,
-                                                gmx_genborn_t *        born,
-                                                int                    gb_algorithm,
-                                                gmx_localtop_t *       top,
-                                                real *                 x,
-                                                t_commrec *            cr,
-                                                void *                 work);
-
-int
-genborn_allvsall_calc_chainrule_sse2_single(t_forcerec *           fr,
-                                            t_mdatoms *            mdatoms,
-                                            gmx_genborn_t *        born,
-                                            real *                 x,
-                                            real *                 f,
-                                            int                    gb_algorithm,
-                                            void *                 work);
-
-#endif
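
The three entry points declared above map onto the usual GB sequence within a
step; a data-flow sketch (an assumption from the kernels, not an exact call
site):

    /*
     * genborn_allvsall_calc_*_radii_sse2_single()   -> born->bRad, fr->invsqrta, dadx
     * (GB nonbonded kernels then accumulate)        -> fr->dvda
     * genborn_allvsall_calc_chainrule_sse2_single() -> forces f[], using dvda and dadx
     */
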
diff --git a/src/gromacs/mdlib/genborn_sse2_double.c b/src/gromacs/mdlib/genborn_sse2_double.c
deleted file mode 100644 (file)
index 62cab4b..0000000
+++ /dev/null
@@ -1,918 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2008, The GROMACS development team.
- * Copyright (c) 2013,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#include "gmxpre.h"
-
-#include <math.h>
-#include <string.h>
-
-#include "gromacs/domdec/domdec.h"
-#include "gromacs/fileio/pdbio.h"
-#include "gromacs/legacyheaders/genborn.h"
-#include "gromacs/legacyheaders/names.h"
-#include "gromacs/legacyheaders/network.h"
-#include "gromacs/legacyheaders/typedefs.h"
-#include "gromacs/math/units.h"
-#include "gromacs/math/vec.h"
-#include "gromacs/utility/fatalerror.h"
-#include "gromacs/utility/gmxmpi.h"
-#include "gromacs/utility/smalloc.h"
-
-/* Only compile this file if SSE2 intrinsics are available */
-#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
-#include "genborn_sse2_double.h"
-
-#include <emmintrin.h>
-#include <gmx_sse2_double.h>
-
-int
-calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,
-                              int natoms, gmx_localtop_t *top,
-                              double *x, t_nblist *nl,
-                              gmx_genborn_t *born)
-{
-    int           i, k, n, ii, is3, ii3, nj0, nj1, offset;
-    int           jnrA, jnrB, j3A, j3B;
-    int          *mdtype;
-    double        shX, shY, shZ;
-    int          *jjnr;
-    double       *shiftvec;
-
-    double        gpi_ai, gpi2;
-    double        factor;
-    double       *gb_radius;
-    double       *vsolv;
-    double       *work;
-    double       *dadx;
-
-    __m128d       ix, iy, iz;
-    __m128d       jx, jy, jz;
-    __m128d       dx, dy, dz;
-    __m128d       tx, ty, tz;
-    __m128d       rsq, rinv, rinv2, rinv4, rinv6;
-    __m128d       ratio, gpi, rai, raj, vai, vaj, rvdw;
-    __m128d       ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
-    __m128d       mask, icf4, icf6, mask_cmp;
-
-    const __m128d half   = _mm_set1_pd(0.5);
-    const __m128d three  = _mm_set1_pd(3.0);
-    const __m128d one    = _mm_set1_pd(1.0);
-    const __m128d two    = _mm_set1_pd(2.0);
-    const __m128d zero   = _mm_set1_pd(0.0);
-    const __m128d four   = _mm_set1_pd(4.0);
-
-    const __m128d still_p5inv  = _mm_set1_pd(STILL_P5INV);
-    const __m128d still_pip5   = _mm_set1_pd(STILL_PIP5);
-    const __m128d still_p4     = _mm_set1_pd(STILL_P4);
-
-    factor  = 0.5 * ONE_4PI_EPS0;
-
-    gb_radius = born->gb_radius;
-    vsolv     = born->vsolv;
-    work      = born->gpol_still_work;
-    jjnr      = nl->jjnr;
-    shiftvec  = fr->shift_vec[0];
-    dadx      = fr->dadx;
-
-    jnrA = jnrB = 0;
-    jx   = _mm_setzero_pd();
-    jy   = _mm_setzero_pd();
-    jz   = _mm_setzero_pd();
-
-    n = 0;
-
-    for (i = 0; i < natoms; i++)
-    {
-        work[i] = 0;
-    }
-
-    for (i = 0; i < nl->nri; i++)
-    {
-        ii     = nl->iinr[i];
-        ii3    = ii*3;
-        is3    = 3*nl->shift[i];
-        shX    = shiftvec[is3];
-        shY    = shiftvec[is3+1];
-        shZ    = shiftvec[is3+2];
-        nj0    = nl->jindex[i];
-        nj1    = nl->jindex[i+1];
-
-        ix     = _mm_set1_pd(shX+x[ii3+0]);
-        iy     = _mm_set1_pd(shY+x[ii3+1]);
-        iz     = _mm_set1_pd(shZ+x[ii3+2]);
-
-
-        /* Polarization energy for atom ai */
-        gpi    = _mm_setzero_pd();
-
-        rai     = _mm_load1_pd(gb_radius+ii);
-        prod_ai = _mm_set1_pd(STILL_P4*vsolv[ii]);
-
-        for (k = nj0; k < nj1-1; k += 2)
-        {
-            jnrA        = jjnr[k];
-            jnrB        = jjnr[k+1];
-
-            j3A         = 3*jnrA;
-            j3B         = 3*jnrB;
-
-            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
-
-            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
-            GMX_MM_LOAD_2VALUES_PD(vsolv+jnrA, vsolv+jnrB, vaj);
-
-            dx          = _mm_sub_pd(ix, jx);
-            dy          = _mm_sub_pd(iy, jy);
-            dz          = _mm_sub_pd(iz, jz);
-
-            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
-            rinv        = gmx_mm_invsqrt_pd(rsq);
-            rinv2       = _mm_mul_pd(rinv, rinv);
-            rinv4       = _mm_mul_pd(rinv2, rinv2);
-            rinv6       = _mm_mul_pd(rinv4, rinv2);
-
-            rvdw        = _mm_add_pd(rai, raj);
-            ratio       = _mm_mul_pd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));
-
-            mask_cmp    = _mm_cmple_pd(ratio, still_p5inv);
-
-            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
-            if (0 == _mm_movemask_pd(mask_cmp) )
-            {
-                /* if ratio>still_p5inv for ALL elements */
-                ccf         = one;
-                dccf        = _mm_setzero_pd();
-            }
-            else
-            {
-                ratio       = _mm_min_pd(ratio, still_p5inv);
-                theta       = _mm_mul_pd(ratio, still_pip5);
-                gmx_mm_sincos_pd(theta, &sinq, &cosq);
-                term        = _mm_mul_pd(half, _mm_sub_pd(one, cosq));
-                ccf         = _mm_mul_pd(term, term);
-                dccf        = _mm_mul_pd(_mm_mul_pd(two, term),
-                                         _mm_mul_pd(sinq, theta));
-            }
-
-            prod        = _mm_mul_pd(still_p4, vaj);
-            icf4        = _mm_mul_pd(ccf, rinv4);
-            icf6        = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four, ccf), dccf), rinv6);
-
-            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_mul_pd(prod_ai, icf4));
-
-            gpi           = _mm_add_pd(gpi, _mm_mul_pd(prod, icf4) );
-
-            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
-            dadx += 2;
-            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
-            dadx += 2;
-        }
-
-        if (k < nj1)
-        {
-            jnrA        = jjnr[k];
-
-            j3A         = 3*jnrA;
-
-            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
-
-            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
-            GMX_MM_LOAD_1VALUE_PD(vsolv+jnrA, vaj);
-
-            dx          = _mm_sub_sd(ix, jx);
-            dy          = _mm_sub_sd(iy, jy);
-            dz          = _mm_sub_sd(iz, jz);
-
-            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
-            rinv        = gmx_mm_invsqrt_pd(rsq);
-            rinv2       = _mm_mul_sd(rinv, rinv);
-            rinv4       = _mm_mul_sd(rinv2, rinv2);
-            rinv6       = _mm_mul_sd(rinv4, rinv2);
-
-            rvdw        = _mm_add_sd(rai, raj);
-            ratio       = _mm_mul_sd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));
-
-            mask_cmp    = _mm_cmple_sd(ratio, still_p5inv);
-
-            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
-            if (0 == _mm_movemask_pd(mask_cmp) )
-            {
-                /* if ratio>still_p5inv for ALL elements */
-                ccf         = one;
-                dccf        = _mm_setzero_pd();
-            }
-            else
-            {
-                ratio       = _mm_min_sd(ratio, still_p5inv);
-                theta       = _mm_mul_sd(ratio, still_pip5);
-                gmx_mm_sincos_pd(theta, &sinq, &cosq);
-                term        = _mm_mul_sd(half, _mm_sub_sd(one, cosq));
-                ccf         = _mm_mul_sd(term, term);
-                dccf        = _mm_mul_sd(_mm_mul_sd(two, term),
-                                         _mm_mul_sd(sinq, theta));
-            }
-
-            prod        = _mm_mul_sd(still_p4, vaj);
-            icf4        = _mm_mul_sd(ccf, rinv4);
-            icf6        = _mm_mul_sd( _mm_sub_sd( _mm_mul_sd(four, ccf), dccf), rinv6);
-
-            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_mul_sd(prod_ai, icf4));
-
-            gpi           = _mm_add_sd(gpi, _mm_mul_sd(prod, icf4) );
-
-            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
-            dadx += 2;
-            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
-            dadx += 2;
-        }
-        gmx_mm_update_1pot_pd(gpi, work+ii);
-    }
-
-    /* Sum up the polarization energy from other nodes */
-    if (DOMAINDECOMP(cr))
-    {
-        dd_atom_sum_real(cr->dd, work);
-    }
-
-    /* Compute the radii */
-    for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
-    {
-        if (born->use[i] != 0)
-        {
-            gpi_ai           = born->gpol[i] + work[i]; /* add the pairwise sums to the self polarization term */
-            gpi2             = gpi_ai * gpi_ai;
-            born->bRad[i]    = factor*gmx_invsqrt(gpi2);
-            fr->invsqrta[i]  = gmx_invsqrt(born->bRad[i]);
-        }
-    }
-
-    /* Extra (local) communication required for DD */
-    if (DOMAINDECOMP(cr))
-    {
-        dd_atom_spread_real(cr->dd, born->bRad);
-        dd_atom_spread_real(cr->dd, fr->invsqrta);
-    }
-
-    return 0;
-}
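
In scalar form, the Still radius evaluated above is b_i = factor/|G_pol,i| with
factor = ONE_4PI_EPS0/2, where G_pol,i is the self term gpol[i] plus the
pairwise sums reduced into work[]; a sketch with names mirroring the kernel:

    #include <math.h>

    /* Sketch of the Still effective Born radius computed above. */
    static double still_radius(double gpol_i, double work_i, double one4pieps0)
    {
        double gpi = gpol_i + work_i;       /* total polarization energy     */
        return 0.5*one4pieps0/fabs(gpi);    /* i.e. factor*invsqrt(gpi*gpi)  */
    }
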
-
-
-int
-calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
-                                double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
-{
-    int           i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
-    int           jnrA, jnrB;
-    int           j3A, j3B;
-    double        shX, shY, shZ;
-    double        rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
-    double        sum_ai2, sum_ai3, tsum, tchain, doffset;
-    double       *obc_param;
-    double       *gb_radius;
-    double       *work;
-    int        *  jjnr;
-    double       *dadx;
-    double       *shiftvec;
-    double        min_rad, rad;
-
-    __m128d       ix, iy, iz, jx, jy, jz;
-    __m128d       dx, dy, dz, t1, t2, t3, t4;
-    __m128d       rsq, rinv, r;
-    __m128d       rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
-    __m128d       uij, lij2, uij2, lij3, uij3, diff2;
-    __m128d       lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum;
-    __m128d       sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
-    __m128d       dadx1, dadx2;
-    __m128d       logterm;
-    __m128d       mask;
-    __m128d       obc_mask1, obc_mask2, obc_mask3;
-
-    __m128d       oneeighth   = _mm_set1_pd(0.125);
-    __m128d       onefourth   = _mm_set1_pd(0.25);
-
-    const __m128d half  = _mm_set1_pd(0.5);
-    const __m128d three = _mm_set1_pd(3.0);
-    const __m128d one   = _mm_set1_pd(1.0);
-    const __m128d two   = _mm_set1_pd(2.0);
-    const __m128d zero  = _mm_set1_pd(0.0);
-    const __m128d neg   = _mm_set1_pd(-1.0);
-
-    /* Set the dielectric offset */
-    doffset   = born->gb_doffset;
-    gb_radius = born->gb_radius;
-    obc_param = born->param;
-    work      = born->gpol_hct_work;
-    jjnr      = nl->jjnr;
-    dadx      = fr->dadx;
-    shiftvec  = fr->shift_vec[0];
-
-    jx        = _mm_setzero_pd();
-    jy        = _mm_setzero_pd();
-    jz        = _mm_setzero_pd();
-
-    jnrA = jnrB = 0;
-
-    for (i = 0; i < born->nr; i++)
-    {
-        work[i] = 0;
-    }
-
-    for (i = 0; i < nl->nri; i++)
-    {
-        ii     = nl->iinr[i];
-        ii3    = ii*3;
-        is3    = 3*nl->shift[i];
-        shX    = shiftvec[is3];
-        shY    = shiftvec[is3+1];
-        shZ    = shiftvec[is3+2];
-        nj0    = nl->jindex[i];
-        nj1    = nl->jindex[i+1];
-
-        ix     = _mm_set1_pd(shX+x[ii3+0]);
-        iy     = _mm_set1_pd(shY+x[ii3+1]);
-        iz     = _mm_set1_pd(shZ+x[ii3+2]);
-
-        rai     = _mm_load1_pd(gb_radius+ii);
-        rai_inv = gmx_mm_inv_pd(rai);
-
-        sum_ai = _mm_setzero_pd();
-
-        sk_ai  = _mm_load1_pd(born->param+ii);
-        sk2_ai = _mm_mul_pd(sk_ai, sk_ai);
-
-        for (k = nj0; k < nj1-1; k += 2)
-        {
-            jnrA        = jjnr[k];
-            jnrB        = jjnr[k+1];
-
-            j3A         = 3*jnrA;
-            j3B         = 3*jnrB;
-
-            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
-            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
-            GMX_MM_LOAD_2VALUES_PD(obc_param+jnrA, obc_param+jnrB, sk_aj);
-
-            dx    = _mm_sub_pd(ix, jx);
-            dy    = _mm_sub_pd(iy, jy);
-            dz    = _mm_sub_pd(iz, jz);
-
-            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
-
-            rinv        = gmx_mm_invsqrt_pd(rsq);
-            r           = _mm_mul_pd(rsq, rinv);
-
-            /* Compute raj_inv for atoms aj1-2 (two j atoms per iteration in double) */
-            raj_inv     = gmx_mm_inv_pd(raj);
-
-            /* Evaluate influence of atom aj -> ai */
-            t1            = _mm_add_pd(r, sk_aj);
-            t2            = _mm_sub_pd(r, sk_aj);
-            t3            = _mm_sub_pd(sk_aj, r);
-            obc_mask1     = _mm_cmplt_pd(rai, t1);
-            obc_mask2     = _mm_cmplt_pd(rai, t2);
-            obc_mask3     = _mm_cmplt_pd(rai, t3);
-
-            uij           = gmx_mm_inv_pd(t1);
-            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
-                                         _mm_andnot_pd(obc_mask2, rai_inv));
-            dlij          = _mm_and_pd(one, obc_mask2);
-            uij2          = _mm_mul_pd(uij, uij);
-            uij3          = _mm_mul_pd(uij2, uij);
-            lij2          = _mm_mul_pd(lij, lij);
-            lij3          = _mm_mul_pd(lij2, lij);
-
-            diff2         = _mm_sub_pd(uij2, lij2);
-            lij_inv       = gmx_mm_invsqrt_pd(lij2);
-            sk2_aj        = _mm_mul_pd(sk_aj, sk_aj);
-            sk2_rinv      = _mm_mul_pd(sk2_aj, rinv);
-            prod          = _mm_mul_pd(onefourth, sk2_rinv);
-
-            logterm       = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));
-
-            t1            = _mm_sub_pd(lij, uij);
-            t2            = _mm_mul_pd(diff2,
-                                       _mm_sub_pd(_mm_mul_pd(onefourth, r),
-                                                  prod));
-            t3            = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
-            t1            = _mm_add_pd(t1, _mm_add_pd(t2, t3));
-            t4            = _mm_mul_pd(two, _mm_sub_pd(rai_inv, lij));
-            t4            = _mm_and_pd(t4, obc_mask3);
-            t1            = _mm_mul_pd(half, _mm_add_pd(t1, t4));
-
-            sum_ai        = _mm_add_pd(sum_ai, _mm_and_pd(t1, obc_mask1) );
-
-            t1            = _mm_add_pd(_mm_mul_pd(half, lij2),
-                                       _mm_mul_pd(prod, lij3));
-            t1            = _mm_sub_pd(t1,
-                                       _mm_mul_pd(onefourth,
-                                                  _mm_add_pd(_mm_mul_pd(lij, rinv),
-                                                             _mm_mul_pd(lij3, r))));
-            t2            = _mm_mul_pd(onefourth,
-                                       _mm_add_pd(_mm_mul_pd(uij, rinv),
-                                                  _mm_mul_pd(uij3, r)));
-            t2            = _mm_sub_pd(t2,
-                                       _mm_add_pd(_mm_mul_pd(half, uij2),
-                                                  _mm_mul_pd(prod, uij3)));
-            t3            = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
-                                       _mm_mul_pd(rinv, rinv));
-            t3            = _mm_sub_pd(t3,
-                                       _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
-                                                  _mm_add_pd(one,
-                                                             _mm_mul_pd(sk2_rinv, rinv))));
-            t1            = _mm_mul_pd(rinv,
-                                       _mm_add_pd(_mm_mul_pd(dlij, t1),
-                                                  _mm_add_pd(t2, t3)));
-
-            dadx1         = _mm_and_pd(t1, obc_mask1);
-
-            /* Evaluate influence of atom ai -> aj */
-            t1            = _mm_add_pd(r, sk_ai);
-            t2            = _mm_sub_pd(r, sk_ai);
-            t3            = _mm_sub_pd(sk_ai, r);
-            obc_mask1     = _mm_cmplt_pd(raj, t1);
-            obc_mask2     = _mm_cmplt_pd(raj, t2);
-            obc_mask3     = _mm_cmplt_pd(raj, t3);
-
-            uij           = gmx_mm_inv_pd(t1);
-            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
-                                         _mm_andnot_pd(obc_mask2, raj_inv));
-            dlij          = _mm_and_pd(one, obc_mask2);
-            uij2          = _mm_mul_pd(uij, uij);
-            uij3          = _mm_mul_pd(uij2, uij);
-            lij2          = _mm_mul_pd(lij, lij);
-            lij3          = _mm_mul_pd(lij2, lij);
-
-            diff2         = _mm_sub_pd(uij2, lij2);
-            lij_inv       = gmx_mm_invsqrt_pd(lij2);
-            sk2_rinv      = _mm_mul_pd(sk2_ai, rinv);
-            prod          = _mm_mul_pd(onefourth, sk2_rinv);
-
-            logterm       = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));
-
-            t1            = _mm_sub_pd(lij, uij);
-            t2            = _mm_mul_pd(diff2,
-                                       _mm_sub_pd(_mm_mul_pd(onefourth, r),
-                                                  prod));
-            t3            = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
-            t1            = _mm_add_pd(t1, _mm_add_pd(t2, t3));
-            t4            = _mm_mul_pd(two, _mm_sub_pd(raj_inv, lij));
-            t4            = _mm_and_pd(t4, obc_mask3);
-            t1            = _mm_mul_pd(half, _mm_add_pd(t1, t4));
-
-            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_and_pd(t1, obc_mask1));
-
-            t1            = _mm_add_pd(_mm_mul_pd(half, lij2),
-                                       _mm_mul_pd(prod, lij3));
-            t1            = _mm_sub_pd(t1,
-                                       _mm_mul_pd(onefourth,
-                                                  _mm_add_pd(_mm_mul_pd(lij, rinv),
-                                                             _mm_mul_pd(lij3, r))));
-            t2            = _mm_mul_pd(onefourth,
-                                       _mm_add_pd(_mm_mul_pd(uij, rinv),
-                                                  _mm_mul_pd(uij3, r)));
-            t2            = _mm_sub_pd(t2,
-                                       _mm_add_pd(_mm_mul_pd(half, uij2),
-                                                  _mm_mul_pd(prod, uij3)));
-            t3            = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
-                                       _mm_mul_pd(rinv, rinv));
-            t3            = _mm_sub_pd(t3,
-                                       _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
-                                                  _mm_add_pd(one,
-                                                             _mm_mul_pd(sk2_rinv, rinv))));
-            t1            = _mm_mul_pd(rinv,
-                                       _mm_add_pd(_mm_mul_pd(dlij, t1),
-                                                  _mm_add_pd(t2, t3)));
-
-            dadx2         = _mm_and_pd(t1, obc_mask1);
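-            /* dadx1/dadx2 hold d(sum)/dr for the aj->ai and ai->aj
-             * contributions; they are streamed into fr->dadx here and
-             * consumed later by calc_gb_chainrule_sse2_double().
-             */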
-
-            _mm_store_pd(dadx, dadx1);
-            dadx += 2;
-            _mm_store_pd(dadx, dadx2);
-            dadx += 2;
-        } /* end normal inner loop */
-
-        if (k < nj1)
-        {
-            jnrA        = jjnr[k];
-
-            j3A         = 3*jnrA;
-
-            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
-            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
-            GMX_MM_LOAD_1VALUE_PD(obc_param+jnrA, sk_aj);
-
-            dx    = _mm_sub_sd(ix, jx);
-            dy    = _mm_sub_sd(iy, jy);
-            dz    = _mm_sub_sd(iz, jz);
-
-            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
-
-            rinv        = gmx_mm_invsqrt_pd(rsq);
-            r           = _mm_mul_sd(rsq, rinv);
-
-            /* Compute raj_inv for the single remaining j atom */
-            raj_inv     = gmx_mm_inv_pd(raj);
-
-            /* Evaluate influence of atom aj -> ai */
-            t1            = _mm_add_sd(r, sk_aj);
-            t2            = _mm_sub_sd(r, sk_aj);
-            t3            = _mm_sub_sd(sk_aj, r);
-            obc_mask1     = _mm_cmplt_sd(rai, t1);
-            obc_mask2     = _mm_cmplt_sd(rai, t2);
-            obc_mask3     = _mm_cmplt_sd(rai, t3);
-
-            uij           = gmx_mm_inv_pd(t1);
-            lij           = _mm_or_pd(_mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
-                                      _mm_andnot_pd(obc_mask2, rai_inv));
-            dlij          = _mm_and_pd(one, obc_mask2);
-            uij2          = _mm_mul_sd(uij, uij);
-            uij3          = _mm_mul_sd(uij2, uij);
-            lij2          = _mm_mul_sd(lij, lij);
-            lij3          = _mm_mul_sd(lij2, lij);
-
-            diff2         = _mm_sub_sd(uij2, lij2);
-            lij_inv       = gmx_mm_invsqrt_pd(lij2);
-            sk2_aj        = _mm_mul_sd(sk_aj, sk_aj);
-            sk2_rinv      = _mm_mul_sd(sk2_aj, rinv);
-            prod          = _mm_mul_sd(onefourth, sk2_rinv);
-
-            logterm       = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));
-
-            t1            = _mm_sub_sd(lij, uij);
-            t2            = _mm_mul_sd(diff2,
-                                       _mm_sub_sd(_mm_mul_sd(onefourth, r),
-                                                  prod));
-            t3            = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
-            t1            = _mm_add_sd(t1, _mm_add_sd(t2, t3));
-            t4            = _mm_mul_sd(two, _mm_sub_sd(rai_inv, lij));
-            t4            = _mm_and_pd(t4, obc_mask3);
-            t1            = _mm_mul_sd(half, _mm_add_sd(t1, t4));
-
-            sum_ai        = _mm_add_sd(sum_ai, _mm_and_pd(t1, obc_mask1) );
-
-            t1            = _mm_add_sd(_mm_mul_sd(half, lij2),
-                                       _mm_mul_sd(prod, lij3));
-            t1            = _mm_sub_sd(t1,
-                                       _mm_mul_sd(onefourth,
-                                                  _mm_add_sd(_mm_mul_sd(lij, rinv),
-                                                             _mm_mul_sd(lij3, r))));
-            t2            = _mm_mul_sd(onefourth,
-                                       _mm_add_sd(_mm_mul_sd(uij, rinv),
-                                                  _mm_mul_sd(uij3, r)));
-            t2            = _mm_sub_sd(t2,
-                                       _mm_add_sd(_mm_mul_sd(half, uij2),
-                                                  _mm_mul_sd(prod, uij3)));
-            t3            = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
-                                       _mm_mul_sd(rinv, rinv));
-            t3            = _mm_sub_sd(t3,
-                                       _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
-                                                  _mm_add_sd(one,
-                                                             _mm_mul_sd(sk2_rinv, rinv))));
-            t1            = _mm_mul_sd(rinv,
-                                       _mm_add_sd(_mm_mul_sd(dlij, t1),
-                                                  _mm_add_sd(t2, t3)));
-
-            dadx1         = _mm_and_pd(t1, obc_mask1);
-
-            /* Evaluate influence of atom ai -> aj */
-            t1            = _mm_add_sd(r, sk_ai);
-            t2            = _mm_sub_sd(r, sk_ai);
-            t3            = _mm_sub_sd(sk_ai, r);
-            obc_mask1     = _mm_cmplt_sd(raj, t1);
-            obc_mask2     = _mm_cmplt_sd(raj, t2);
-            obc_mask3     = _mm_cmplt_sd(raj, t3);
-
-            uij           = gmx_mm_inv_pd(t1);
-            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
-                                         _mm_andnot_pd(obc_mask2, raj_inv));
-            dlij          = _mm_and_pd(one, obc_mask2);
-            uij2          = _mm_mul_sd(uij, uij);
-            uij3          = _mm_mul_sd(uij2, uij);
-            lij2          = _mm_mul_sd(lij, lij);
-            lij3          = _mm_mul_sd(lij2, lij);
-
-            diff2         = _mm_sub_sd(uij2, lij2);
-            lij_inv       = gmx_mm_invsqrt_pd(lij2);
-            sk2_rinv      = _mm_mul_sd(sk2_ai, rinv);
-            prod          = _mm_mul_sd(onefourth, sk2_rinv);
-
-            logterm       = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));
-
-            t1            = _mm_sub_sd(lij, uij);
-            t2            = _mm_mul_sd(diff2,
-                                       _mm_sub_sd(_mm_mul_sd(onefourth, r),
-                                                  prod));
-            t3            = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
-            t1            = _mm_add_sd(t1, _mm_add_sd(t2, t3));
-            t4            = _mm_mul_sd(two, _mm_sub_sd(raj_inv, lij));
-            t4            = _mm_and_pd(t4, obc_mask3);
-            t1            = _mm_mul_sd(half, _mm_add_sd(t1, t4));
-
-            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_and_pd(t1, obc_mask1));
-
-            t1            = _mm_add_sd(_mm_mul_sd(half, lij2),
-                                       _mm_mul_sd(prod, lij3));
-            t1            = _mm_sub_sd(t1,
-                                       _mm_mul_sd(onefourth,
-                                                  _mm_add_sd(_mm_mul_sd(lij, rinv),
-                                                             _mm_mul_sd(lij3, r))));
-            t2            = _mm_mul_sd(onefourth,
-                                       _mm_add_sd(_mm_mul_sd(uij, rinv),
-                                                  _mm_mul_sd(uij3, r)));
-            t2            = _mm_sub_sd(t2,
-                                       _mm_add_sd(_mm_mul_sd(half, uij2),
-                                                  _mm_mul_sd(prod, uij3)));
-            t3            = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
-                                       _mm_mul_sd(rinv, rinv));
-            t3            = _mm_sub_sd(t3,
-                                       _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
-                                                  _mm_add_sd(one,
-                                                             _mm_mul_sd(sk2_rinv, rinv))));
-            t1            = _mm_mul_sd(rinv,
-                                       _mm_add_sd(_mm_mul_sd(dlij, t1),
-                                                  _mm_add_sd(t2, t3)));
-
-            dadx2         = _mm_and_pd(t1, obc_mask1);
-
-            _mm_store_pd(dadx, dadx1);
-            dadx += 2;
-            _mm_store_pd(dadx, dadx2);
-            dadx += 2;
-        }
-        gmx_mm_update_1pot_pd(sum_ai, work+ii);
-
-    }
-
-    /* Parallel summations */
-    if (DOMAINDECOMP(cr))
-    {
-        dd_atom_sum_real(cr->dd, work);
-    }
-
-    if (gb_algorithm == egbHCT)
-    {
-        /* HCT */
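-        /* HCT: 1/R_i = 1/(rho_i - doffset) - sum_i, with sum_i taken
-         * from work[]; the radius is clamped from below at the full
-         * van der Waals radius rho_i.
-         */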
-        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
-        {
-            if (born->use[i] != 0)
-            {
-                rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
-                sum     = 1.0/rr - work[i];
-                min_rad = rr + doffset;
-                rad     = 1.0/sum;
-
-                born->bRad[i]   = rad > min_rad ? rad : min_rad;
-                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
-            }
-        }
-
-        /* Extra communication required for DD */
-        if (DOMAINDECOMP(cr))
-        {
-            dd_atom_spread_real(cr->dd, born->bRad);
-            dd_atom_spread_real(cr->dd, fr->invsqrta);
-        }
-    }
-    else
-    {
-        /* OBC */
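-        /* OBC: with psi_i = (rho_i - doffset)*sum_i, the radius is
-         * 1/R_i = 1/(rho_i - doffset)
-         *         - tanh(alpha*psi - beta*psi^2 + gamma*psi^3)/rho_i;
-         * drobc[] keeps the tanh chain-rule factor for the forces.
-         */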
-        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
-        {
-            if (born->use[i] != 0)
-            {
-                rr      = top->atomtypes.gb_radius[md->typeA[i]];
-                rr_inv2 = 1.0/rr;
-                rr      = rr-doffset;
-                rr_inv  = 1.0/rr;
-                sum     = rr * work[i];
-                sum2    = sum  * sum;
-                sum3    = sum2 * sum;
-
-                tsum          = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
-                born->bRad[i] = rr_inv - tsum*rr_inv2;
-                born->bRad[i] = 1.0 / born->bRad[i];
-
-                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
-
-                tchain         = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
-                born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
-            }
-        }
-        /* Extra (local) communication required for DD */
-        if (DOMAINDECOMP(cr))
-        {
-            dd_atom_spread_real(cr->dd, born->bRad);
-            dd_atom_spread_real(cr->dd, fr->invsqrta);
-            dd_atom_spread_real(cr->dd, born->drobc);
-        }
-    }
-
-
-
-    return 0;
-}
-
-
-int
-calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda,
-                              double *x, double *f, double *fshift, double *shiftvec,
-                              int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
-{
-    int           i, k, n, ii, jnr, ii3, is3, nj0, nj1, n0, n1;
-    int           jnrA, jnrB;
-    int           j3A, j3B;
-    int        *  jjnr;
-
-    double        rbi, shX, shY, shZ;
-    double       *rb;
-
-    __m128d       ix, iy, iz;
-    __m128d       jx, jy, jz;
-    __m128d       fix, fiy, fiz;
-    __m128d       dx, dy, dz;
-    __m128d       tx, ty, tz;
-
-    __m128d       rbai, rbaj, f_gb, f_gb_ai;
-    __m128d       xmm1, xmm2, xmm3;
-
-    const __m128d two = _mm_set1_pd(2.0);
-
-    rb     = born->work;
-
-    jjnr   = nl->jjnr;
-
-    /* Loop to get the proper form for the Born radius term, SSE style */
-    n0 = 0;
-    n1 = natoms;
-
-    if (gb_algorithm == egbSTILL)
-    {
-        for (i = n0; i < n1; i++)
-        {
-            rbi   = born->bRad[i];
-            rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
-        }
-    }
-    else if (gb_algorithm == egbHCT)
-    {
-        for (i = n0; i < n1; i++)
-        {
-            rbi   = born->bRad[i];
-            rb[i] = rbi * rbi * dvda[i];
-        }
-    }
-    else if (gb_algorithm == egbOBC)
-    {
-        for (i = n0; i < n1; i++)
-        {
-            rbi   = born->bRad[i];
-            rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
-        }
-    }
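-    /* rb[i] now combines dvda (dV/dR_i) with the algorithm-specific
-     * dR_i/d(sum_i) factor, so the pair loop below only needs the
-     * stored d(sum)/dr terms from dadx to assemble the forces.
-     */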
-
-    jz = _mm_setzero_pd();
-
-    n = j3A = j3B = 0;
-
-    for (i = 0; i < nl->nri; i++)
-    {
-        ii     = nl->iinr[i];
-        ii3    = ii*3;
-        is3    = 3*nl->shift[i];
-        shX    = shiftvec[is3];
-        shY    = shiftvec[is3+1];
-        shZ    = shiftvec[is3+2];
-        nj0    = nl->jindex[i];
-        nj1    = nl->jindex[i+1];
-
-        ix     = _mm_set1_pd(shX+x[ii3+0]);
-        iy     = _mm_set1_pd(shY+x[ii3+1]);
-        iz     = _mm_set1_pd(shZ+x[ii3+2]);
-
-        rbai   = _mm_load1_pd(rb+ii);
-        fix    = _mm_setzero_pd();
-        fiy    = _mm_setzero_pd();
-        fiz    = _mm_setzero_pd();
-
-
-        for (k = nj0; k < nj1-1; k += 2)
-        {
-            jnrA        = jjnr[k];
-            jnrB        = jjnr[k+1];
-
-            j3A         = 3*jnrA;
-            j3B         = 3*jnrB;
-
-            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
-
-            dx          = _mm_sub_pd(ix, jx);
-            dy          = _mm_sub_pd(iy, jy);
-            dz          = _mm_sub_pd(iz, jz);
-
-            GMX_MM_LOAD_2VALUES_PD(rb+jnrA, rb+jnrB, rbaj);
-
-            /* load chain rule terms for j1-2 */
-            f_gb        = _mm_load_pd(dadx);
-            dadx       += 2;
-            f_gb_ai     = _mm_load_pd(dadx);
-            dadx       += 2;
-
-            /* calculate scalar force */
-            f_gb    = _mm_mul_pd(f_gb, rbai);
-            f_gb_ai = _mm_mul_pd(f_gb_ai, rbaj);
-            f_gb    = _mm_add_pd(f_gb, f_gb_ai);
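-            /* f_gb combines the stored d(sum)/dr terms with the rb
-             * factors of both atoms; dadx already carries a 1/r
-             * factor, so multiplying by dx/dy/dz below directly
-             * yields the Cartesian force components.
-             */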
-
-            tx     = _mm_mul_pd(f_gb, dx);
-            ty     = _mm_mul_pd(f_gb, dy);
-            tz     = _mm_mul_pd(f_gb, dz);
-
-            fix    = _mm_add_pd(fix, tx);
-            fiy    = _mm_add_pd(fiy, ty);
-            fiz    = _mm_add_pd(fiz, tz);
-
-            GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A, f+j3B, tx, ty, tz);
-        }
-
-        /* deal with the odd element */
-        if (k < nj1)
-        {
-            jnrA        = jjnr[k];
-            j3A         = 3*jnrA;
-
-            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
-
-            dx          = _mm_sub_sd(ix, jx);
-            dy          = _mm_sub_sd(iy, jy);
-            dz          = _mm_sub_sd(iz, jz);
-
-            GMX_MM_LOAD_1VALUE_PD(rb+jnrA, rbaj);
-
-            /* load chain rule terms */
-            f_gb        = _mm_load_pd(dadx);
-            dadx       += 2;
-            f_gb_ai     = _mm_load_pd(dadx);
-            dadx       += 2;
-
-            /* calculate scalar force */
-            f_gb    = _mm_mul_sd(f_gb, rbai);
-            f_gb_ai = _mm_mul_sd(f_gb_ai, rbaj);
-            f_gb    = _mm_add_sd(f_gb, f_gb_ai);
-
-            tx     = _mm_mul_sd(f_gb, dx);
-            ty     = _mm_mul_sd(f_gb, dy);
-            tz     = _mm_mul_sd(f_gb, dz);
-
-            fix    = _mm_add_sd(fix, tx);
-            fiy    = _mm_add_sd(fiy, ty);
-            fiz    = _mm_add_sd(fiz, tz);
-
-            GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A, tx, ty, tz);
-        }
-
-        /* fix/fiy/fiz now contain two partial force terms each, which
-         * should all be added to the i particle forces and shift forces.
-         */
-        gmx_mm_update_iforce_1atom_pd(&fix, &fiy, &fiz, f+ii3, fshift+is3);
-    }
-
-    return 0;
-}
-
-#else
-/* keep compiler happy */
-int genborn_sse2_dummy;
-
-#endif /* SSE2 intrinsics available */
diff --git a/src/gromacs/mdlib/genborn_sse2_double.h b/src/gromacs/mdlib/genborn_sse2_double.h
deleted file mode 100644 (file)
index 0bf4ea9..0000000
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2008, The GROMACS development team.
- * Copyright (c) 2013,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#ifndef _genborn_sse2_double_h
-#define _genborn_sse2_double_h
-
-#include "gromacs/legacyheaders/typedefs.h"
-
-int
-calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr, int natoms, gmx_localtop_t *top,
-                              double *x, t_nblist *nl, gmx_genborn_t *born);
-
-int
-calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda, double *xd, double *f,
-                              double *fshift, double *shift_vec, int gb_algorithm,
-                              gmx_genborn_t *born, t_mdatoms *md);
-
-int
-calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec *fr, int natoms, gmx_localtop_t *top,
-                                double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm);
-
-#endif /* _genborn_sse2_double_h */
diff --git a/src/gromacs/mdlib/genborn_sse2_single.c b/src/gromacs/mdlib/genborn_sse2_single.c
deleted file mode 100644 (file)
index accbb6e..0000000
+++ /dev/null
@@ -1,1510 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2008, The GROMACS development team.
- * Copyright (c) 2013,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#include "gmxpre.h"
-
-#include <math.h>
-#include <string.h>
-
-#include "gromacs/domdec/domdec.h"
-#include "gromacs/fileio/pdbio.h"
-#include "gromacs/legacyheaders/genborn.h"
-#include "gromacs/legacyheaders/names.h"
-#include "gromacs/legacyheaders/network.h"
-#include "gromacs/legacyheaders/typedefs.h"
-#include "gromacs/math/units.h"
-#include "gromacs/math/vec.h"
-#include "gromacs/utility/fatalerror.h"
-#include "gromacs/utility/gmxmpi.h"
-#include "gromacs/utility/smalloc.h"
-
-
-/* Only compile this file if SSE intrinsics are available */
-#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
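-/* The "0 &&" above disables these kernels unconditionally, so (as in
- * the double-precision file) only a dummy symbol should end up being
- * compiled from this file.
- */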
-
-#include "genborn_sse2_single.h"
-
-#include <emmintrin.h>
-#include <gmx_sse2_single.h>
-
-
-int
-calc_gb_rad_still_sse2_single(t_commrec *cr, t_forcerec *fr,
-                              int natoms, gmx_localtop_t *top,
-                              float *x, t_nblist *nl,
-                              gmx_genborn_t *born)
-{
-    int          i, k, n, ii, is3, ii3, nj0, nj1, offset;
-    int          jnrA, jnrB, jnrC, jnrD, j3A, j3B, j3C, j3D;
-    int          jnrE, jnrF, jnrG, jnrH, j3E, j3F, j3G, j3H;
-    int          shift;
-    int         *mdtype;
-    real         shX, shY, shZ;
-    int         *jjnr;
-    real        *shiftvec;
-
-    float        gpi_ai, gpi2;
-    float        factor;
-    float       *gb_radius;
-    float       *vsolv;
-    float       *work;
-    float       *dadx;
-
-    __m128       ix, iy, iz;
-    __m128       jx, jy, jz;
-    __m128       dx, dy, dz;
-    __m128       tx, ty, tz;
-    __m128       jxB, jyB, jzB;
-    __m128       dxB, dyB, dzB;
-    __m128       txB, tyB, tzB;
-    __m128       rsq, rinv, rinv2, rinv4, rinv6;
-    __m128       rsqB, rinvB, rinv2B, rinv4B, rinv6B;
-    __m128       ratio, gpi, rai, raj, vai, vaj, rvdw;
-    __m128       ratioB, rajB, vajB, rvdwB;
-    __m128       ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
-    __m128       ccfB, dccfB, thetaB, cosqB, termB, sinqB, resB, prodB;
-    __m128       mask, icf4, icf6, mask_cmp;
-    __m128       icf4B, icf6B, mask_cmpB;
-
-    __m128       mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
-    __m128       mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
-    __m128       mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
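-    /* Partial-register masks for the epilogue: mask1/2/3 select the
-     * lowest 1, 2 or 3 of the four single-precision SSE lanes.
-     */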
-
-    const __m128 half   = _mm_set1_ps(0.5f);
-    const __m128 three  = _mm_set1_ps(3.0f);
-    const __m128 one    = _mm_set1_ps(1.0f);
-    const __m128 two    = _mm_set1_ps(2.0f);
-    const __m128 zero   = _mm_set1_ps(0.0f);
-    const __m128 four   = _mm_set1_ps(4.0f);
-
-    const __m128 still_p5inv  = _mm_set1_ps(STILL_P5INV);
-    const __m128 still_pip5   = _mm_set1_ps(STILL_PIP5);
-    const __m128 still_p4     = _mm_set1_ps(STILL_P4);
-
-    factor  = 0.5 * ONE_4PI_EPS0;
-
-    gb_radius = born->gb_radius;
-    vsolv     = born->vsolv;
-    work      = born->gpol_still_work;
-    jjnr      = nl->jjnr;
-    shiftvec  = fr->shift_vec[0];
-    dadx      = fr->dadx;
-
-    jnrA = jnrB = jnrC = jnrD = 0;
-    jx   = _mm_setzero_ps();
-    jy   = _mm_setzero_ps();
-    jz   = _mm_setzero_ps();
-
-    n = 0;
-
-    for (i = 0; i < natoms; i++)
-    {
-        work[i] = 0;
-    }
-
-    for (i = 0; i < nl->nri; i++)
-    {
-        ii     = nl->iinr[i];
-        ii3    = ii*3;
-        is3    = 3*nl->shift[i];
-        shX    = shiftvec[is3];
-        shY    = shiftvec[is3+1];
-        shZ    = shiftvec[is3+2];
-        nj0    = nl->jindex[i];
-        nj1    = nl->jindex[i+1];
-
-        ix     = _mm_set1_ps(shX+x[ii3+0]);
-        iy     = _mm_set1_ps(shY+x[ii3+1]);
-        iz     = _mm_set1_ps(shZ+x[ii3+2]);
-
-        offset = (nj1-nj0)%4;
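-        /* The neighbor list is processed 8 atoms at a time in the
-         * main loop, then 4 at a time, and the last offset (0-3)
-         * atoms with the partial-register masks defined above.
-         */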
-
-        /* Polarization energy for atom ai */
-        gpi    = _mm_setzero_ps();
-
-        rai     = _mm_load1_ps(gb_radius+ii);
-        prod_ai = _mm_set1_ps(STILL_P4*vsolv[ii]);
-
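-        /* Still: each pair adds ~ STILL_P4*vsolv_j*ccf/r^4 to the
-         * polarization term gpi of atom i (and symmetrically
-         * prod_ai*icf4 to work[j]); ccf is the cosine switching
-         * function evaluated below for close pairs.
-         */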
-        for (k = nj0; k < nj1-4-offset; k += 8)
-        {
-            jnrA        = jjnr[k];
-            jnrB        = jjnr[k+1];
-            jnrC        = jjnr[k+2];
-            jnrD        = jjnr[k+3];
-            jnrE        = jjnr[k+4];
-            jnrF        = jjnr[k+5];
-            jnrG        = jjnr[k+6];
-            jnrH        = jjnr[k+7];
-
-            j3A         = 3*jnrA;
-            j3B         = 3*jnrB;
-            j3C         = 3*jnrC;
-            j3D         = 3*jnrD;
-            j3E         = 3*jnrE;
-            j3F         = 3*jnrF;
-            j3G         = 3*jnrG;
-            j3H         = 3*jnrH;
-
-            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
-            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E, x+j3F, x+j3G, x+j3H, jxB, jyB, jzB);
-
-            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
-            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE, gb_radius+jnrF, gb_radius+jnrG, gb_radius+jnrH, rajB);
-            GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vsolv+jnrD, vaj);
-            GMX_MM_LOAD_4VALUES_PS(vsolv+jnrE, vsolv+jnrF, vsolv+jnrG, vsolv+jnrH, vajB);
-
-            dx          = _mm_sub_ps(ix, jx);
-            dy          = _mm_sub_ps(iy, jy);
-            dz          = _mm_sub_ps(iz, jz);
-            dxB         = _mm_sub_ps(ix, jxB);
-            dyB         = _mm_sub_ps(iy, jyB);
-            dzB         = _mm_sub_ps(iz, jzB);
-
-            rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
-            rsqB        = gmx_mm_calc_rsq_ps(dxB, dyB, dzB);
-            rinv        = gmx_mm_invsqrt_ps(rsq);
-            rinvB       = gmx_mm_invsqrt_ps(rsqB);
-            rinv2       = _mm_mul_ps(rinv, rinv);
-            rinv2B      = _mm_mul_ps(rinvB, rinvB);
-            rinv4       = _mm_mul_ps(rinv2, rinv2);
-            rinv4B      = _mm_mul_ps(rinv2B, rinv2B);
-            rinv6       = _mm_mul_ps(rinv4, rinv2);
-            rinv6B      = _mm_mul_ps(rinv4B, rinv2B);
-
-            rvdw        = _mm_add_ps(rai, raj);
-            rvdwB       = _mm_add_ps(rai, rajB);
-            ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
-            ratioB      = _mm_mul_ps(rsqB, gmx_mm_inv_ps( _mm_mul_ps(rvdwB, rvdwB)));
-
-            mask_cmp    = _mm_cmple_ps(ratio, still_p5inv);
-            mask_cmpB   = _mm_cmple_ps(ratioB, still_p5inv);
-
-            /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
-            if (0 == _mm_movemask_ps(mask_cmp) )
-            {
-                /* if ratio>still_p5inv for ALL elements */
-                ccf         = one;
-                dccf        = _mm_setzero_ps();
-            }
-            else
-            {
-                ratio       = _mm_min_ps(ratio, still_p5inv);
-                theta       = _mm_mul_ps(ratio, still_pip5);
-                gmx_mm_sincos_ps(theta, &sinq, &cosq);
-                term        = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
-                ccf         = _mm_mul_ps(term, term);
-                dccf        = _mm_mul_ps(_mm_mul_ps(two, term),
-                                         _mm_mul_ps(sinq, theta));
-            }
-            if (0 == _mm_movemask_ps(mask_cmpB) )
-            {
-                /* if ratio>still_p5inv for ALL elements */
-                ccfB        = one;
-                dccfB       = _mm_setzero_ps();
-            }
-            else
-            {
-                ratioB      = _mm_min_ps(ratioB, still_p5inv);
-                thetaB      = _mm_mul_ps(ratioB, still_pip5);
-                gmx_mm_sincos_ps(thetaB, &sinqB, &cosqB);
-                termB       = _mm_mul_ps(half, _mm_sub_ps(one, cosqB));
-                ccfB        = _mm_mul_ps(termB, termB);
-                dccfB       = _mm_mul_ps(_mm_mul_ps(two, termB),
-                                         _mm_mul_ps(sinqB, thetaB));
-            }
-
-            prod        = _mm_mul_ps(still_p4, vaj);
-            prodB       = _mm_mul_ps(still_p4, vajB);
-            icf4        = _mm_mul_ps(ccf, rinv4);
-            icf4B       = _mm_mul_ps(ccfB, rinv4B);
-            icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
-            icf6B       = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccfB), dccfB), rinv6B);
-
-            GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_mul_ps(prod_ai, icf4));
-            GMX_MM_INCREMENT_4VALUES_PS(work+jnrE, work+jnrF, work+jnrG, work+jnrH, _mm_mul_ps(prod_ai, icf4B));
-
-            gpi           = _mm_add_ps(gpi, _mm_add_ps( _mm_mul_ps(prod, icf4), _mm_mul_ps(prodB, icf4B) ) );
-
-            _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_mul_ps(prodB, icf6B));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6B));
-            dadx += 4;
-        }
-
-        for (; k < nj1-offset; k += 4)
-        {
-            jnrA        = jjnr[k];
-            jnrB        = jjnr[k+1];
-            jnrC        = jjnr[k+2];
-            jnrD        = jjnr[k+3];
-
-            j3A         = 3*jnrA;
-            j3B         = 3*jnrB;
-            j3C         = 3*jnrC;
-            j3D         = 3*jnrD;
-
-            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
-
-            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
-            GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vsolv+jnrD, vaj);
-
-            dx          = _mm_sub_ps(ix, jx);
-            dy          = _mm_sub_ps(iy, jy);
-            dz          = _mm_sub_ps(iz, jz);
-
-            rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
-            rinv        = gmx_mm_invsqrt_ps(rsq);
-            rinv2       = _mm_mul_ps(rinv, rinv);
-            rinv4       = _mm_mul_ps(rinv2, rinv2);
-            rinv6       = _mm_mul_ps(rinv4, rinv2);
-
-            rvdw        = _mm_add_ps(rai, raj);
-            ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
-
-            mask_cmp    = _mm_cmple_ps(ratio, still_p5inv);
-
-            /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
-            if (0 == _mm_movemask_ps(mask_cmp))
-            {
-                /* if ratio>still_p5inv for ALL elements */
-                ccf         = one;
-                dccf        = _mm_setzero_ps();
-            }
-            else
-            {
-                ratio       = _mm_min_ps(ratio, still_p5inv);
-                theta       = _mm_mul_ps(ratio, still_pip5);
-                gmx_mm_sincos_ps(theta, &sinq, &cosq);
-                term        = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
-                ccf         = _mm_mul_ps(term, term);
-                dccf        = _mm_mul_ps(_mm_mul_ps(two, term),
-                                         _mm_mul_ps(sinq, theta));
-            }
-
-            prod        = _mm_mul_ps(still_p4, vaj);
-            icf4        = _mm_mul_ps(ccf, rinv4);
-            icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
-
-            GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_mul_ps(prod_ai, icf4));
-
-            gpi           = _mm_add_ps(gpi, _mm_mul_ps(prod, icf4));
-
-            _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
-            dadx += 4;
-        }
-
-        if (offset != 0)
-        {
-            if (offset == 1)
-            {
-                jnrA        = jjnr[k];
-                j3A         = 3*jnrA;
-                GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
-                GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA, raj);
-                GMX_MM_LOAD_1VALUE_PS(vsolv+jnrA, vaj);
-                mask        = mask1;
-            }
-            else if (offset == 2)
-            {
-                jnrA        = jjnr[k];
-                jnrB        = jjnr[k+1];
-                j3A         = 3*jnrA;
-                j3B         = 3*jnrB;
-                GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
-                GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, raj);
-                GMX_MM_LOAD_2VALUES_PS(vsolv+jnrA, vsolv+jnrB, vaj);
-                mask        = mask2;
-            }
-            else
-            {
-                /* offset must be 3 */
-                jnrA        = jjnr[k];
-                jnrB        = jjnr[k+1];
-                jnrC        = jjnr[k+2];
-                j3A         = 3*jnrA;
-                j3B         = 3*jnrB;
-                j3C         = 3*jnrC;
-                GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
-                GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, raj);
-                GMX_MM_LOAD_3VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vaj);
-                mask        = mask3;
-            }
-
-            dx          = _mm_sub_ps(ix, jx);
-            dy          = _mm_sub_ps(iy, jy);
-            dz          = _mm_sub_ps(iz, jz);
-
-            rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
-            rinv        = gmx_mm_invsqrt_ps(rsq);
-            rinv2       = _mm_mul_ps(rinv, rinv);
-            rinv4       = _mm_mul_ps(rinv2, rinv2);
-            rinv6       = _mm_mul_ps(rinv4, rinv2);
-
-            rvdw        = _mm_add_ps(rai, raj);
-            ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
-
-            mask_cmp    = _mm_cmple_ps(ratio, still_p5inv);
-
-            if (0 == _mm_movemask_ps(mask_cmp))
-            {
-                /* if ratio>still_p5inv for ALL elements */
-                ccf         = one;
-                dccf        = _mm_setzero_ps();
-            }
-            else
-            {
-                ratio       = _mm_min_ps(ratio, still_p5inv);
-                theta       = _mm_mul_ps(ratio, still_pip5);
-                gmx_mm_sincos_ps(theta, &sinq, &cosq);
-                term        = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
-                ccf         = _mm_mul_ps(term, term);
-                dccf        = _mm_mul_ps(_mm_mul_ps(two, term),
-                                         _mm_mul_ps(sinq, theta));
-            }
-
-            prod        = _mm_mul_ps(still_p4, vaj);
-            icf4        = _mm_mul_ps(ccf, rinv4);
-            icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
-
-            gpi           = _mm_add_ps(gpi, _mm_mul_ps(prod, icf4));
-
-            _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
-            dadx += 4;
-            _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
-            dadx += 4;
-
-            tmp = _mm_mul_ps(prod_ai, icf4);
-
-            if (offset == 1)
-            {
-                GMX_MM_INCREMENT_1VALUE_PS(work+jnrA, tmp);
-            }
-            else if (offset == 2)
-            {
-                GMX_MM_INCREMENT_2VALUES_PS(work+jnrA, work+jnrB, tmp);
-            }
-            else
-            {
-                /* offset must be 3 */
-                GMX_MM_INCREMENT_3VALUES_PS(work+jnrA, work+jnrB, work+jnrC, tmp);
-            }
-        }
-        GMX_MM_UPDATE_1POT_PS(gpi, work+ii);
-    }
-
-    /* Sum up the polarization energy from other nodes */
-    if (DOMAINDECOMP(cr))
-    {
-        dd_atom_sum_real(cr->dd, work);
-    }
-
-    /* Compute the radii */
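-    /* Still: the Born radius is (ONE_4PI_EPS0/2)/|gpol_i|, where
-     * gpol_i is the precomputed self term plus the pairwise sum
-     * accumulated in work[].
-     */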
-    for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
-    {
-        if (born->use[i] != 0)
-        {
-            gpi_ai           = born->gpol[i] + work[i]; /* add the pairwise sum to the initial polarization energy */
-            gpi2             = gpi_ai * gpi_ai;
-            born->bRad[i]    = factor*gmx_invsqrt(gpi2);
-            fr->invsqrta[i]  = gmx_invsqrt(born->bRad[i]);
-        }
-    }
-
-    /* Extra (local) communication required for DD */
-    if (DOMAINDECOMP(cr))
-    {
-        dd_atom_spread_real(cr->dd, born->bRad);
-        dd_atom_spread_real(cr->dd, fr->invsqrta);
-    }
-
-    return 0;
-}
-
-
-int
-calc_gb_rad_hct_obc_sse2_single(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
-                                float *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
-{
-    int          i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
-    int          jnrA, jnrB, jnrC, jnrD;
-    int          j3A, j3B, j3C, j3D;
-    int          jnrE, jnrF, jnrG, jnrH;
-    int          j3E, j3F, j3G, j3H;
-    float        shX, shY, shZ;
-    float        rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
-    float        sum_ai2, sum_ai3, tsum, tchain, doffset;
-    float       *obc_param;
-    float       *gb_radius;
-    float       *work;
-    int       *  jjnr;
-    float       *dadx;
-    float       *shiftvec;
-    float        min_rad, rad;
-
-    __m128       ix, iy, iz, jx, jy, jz;
-    __m128       dx, dy, dz, t1, t2, t3, t4;
-    __m128       rsq, rinv, r;
-    __m128       rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
-    __m128       uij, lij2, uij2, lij3, uij3, diff2;
-    __m128       lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum;
-    __m128       sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
-    __m128       dadx1, dadx2;
-    __m128       logterm;
-    __m128       mask;
-    __m128       obc_mask1, obc_mask2, obc_mask3;
-    __m128       jxB, jyB, jzB, t1B, t2B, t3B, t4B;
-    __m128       dxB, dyB, dzB, rsqB, rinvB, rB;
-    __m128       rajB, raj_invB, rai_inv2B, sk2B, lijB, dlijB, duijB;
-    __m128       uijB, lij2B, uij2B, lij3B, uij3B, diff2B;
-    __m128       lij_invB, sk2_invB, prodB;
-    __m128       sk_ajB, sk2_ajB, sk2_rinvB;
-    __m128       dadx1B, dadx2B;
-    __m128       logtermB;
-    __m128       obc_mask1B, obc_mask2B, obc_mask3B;
-
-    __m128       mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
-    __m128       mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
-    __m128       mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
-
-    __m128       oneeighth   = _mm_set1_ps(0.125);
-    __m128       onefourth   = _mm_set1_ps(0.25);
-
-    const __m128 half  = _mm_set1_ps(0.5f);
-    const __m128 three = _mm_set1_ps(3.0f);
-    const __m128 one   = _mm_set1_ps(1.0f);
-    const __m128 two   = _mm_set1_ps(2.0f);
-    const __m128 zero  = _mm_set1_ps(0.0f);
-    const __m128 neg   = _mm_set1_ps(-1.0f);
-
-    /* Set the dielectric offset */
-    doffset   = born->gb_doffset;
-    gb_radius = born->gb_radius;
-    obc_param = born->param;
-    work      = born->gpol_hct_work;
-    jjnr      = nl->jjnr;
-    dadx      = fr->dadx;
-    shiftvec  = fr->shift_vec[0];
-
-    jx        = _mm_setzero_ps();
-    jy        = _mm_setzero_ps();
-    jz        = _mm_setzero_ps();
-
-    jnrA = jnrB = jnrC = jnrD = 0;
-
-    for (i = 0; i < born->nr; i++)
-    {
-        work[i] = 0;
-    }
-
-    for (i = 0; i < nl->nri; i++)
-    {
-        ii     = nl->iinr[i];
-        ii3    = ii*3;
-        is3    = 3*nl->shift[i];
-        shX    = shiftvec[is3];
-        shY    = shiftvec[is3+1];
-        shZ    = shiftvec[is3+2];
-        nj0    = nl->jindex[i];
-        nj1    = nl->jindex[i+1];
-
-        ix     = _mm_set1_ps(shX+x[ii3+0]);
-        iy     = _mm_set1_ps(shY+x[ii3+1]);
-        iz     = _mm_set1_ps(shZ+x[ii3+2]);
-
-        offset = (nj1-nj0)%4;
-
-        rai     = _mm_load1_ps(gb_radius+ii);
-        rai_inv = gmx_mm_inv_ps(rai);
-
-        sum_ai = _mm_setzero_ps();
-
-        sk_ai  = _mm_load1_ps(born->param+ii);
-        sk2_ai = _mm_mul_ps(sk_ai, sk_ai);
-
-        for (k = nj0; k < nj1-4-offset; k += 8)
-        {
-            jnrA        = jjnr[k];
-            jnrB        = jjnr[k+1];
-            jnrC        = jjnr[k+2];
-            jnrD        = jjnr[k+3];
-            jnrE        = jjnr[k+4];
-            jnrF        = jjnr[k+5];
-            jnrG        = jjnr[k+6];
-            jnrH        = jjnr[k+7];
-
-            j3A         = 3*jnrA;
-            j3B         = 3*jnrB;
-            j3C         = 3*jnrC;
-            j3D         = 3*jnrD;
-            j3E         = 3*jnrE;
-            j3F         = 3*jnrF;
-            j3G         = 3*jnrG;
-            j3H         = 3*jnrH;
-
-            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
-            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E, x+j3F, x+j3G, x+j3H, jxB, jyB, jzB);
-            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
-            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE, gb_radius+jnrF, gb_radius+jnrG, gb_radius+jnrH, rajB);
-            GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, obc_param+jnrD, sk_aj);
-            GMX_MM_LOAD_4VALUES_PS(obc_param+jnrE, obc_param+jnrF, obc_param+jnrG, obc_param+jnrH, sk_ajB);
-
-            dx    = _mm_sub_ps(ix, jx);
-            dy    = _mm_sub_ps(iy, jy);
-            dz    = _mm_sub_ps(iz, jz);
-            dxB   = _mm_sub_ps(ix, jxB);
-            dyB   = _mm_sub_ps(iy, jyB);
-            dzB   = _mm_sub_ps(iz, jzB);
-
-            rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
-            rsqB        = gmx_mm_calc_rsq_ps(dxB, dyB, dzB);
-
-            rinv        = gmx_mm_invsqrt_ps(rsq);
-            r           = _mm_mul_ps(rsq, rinv);
-            rinvB       = gmx_mm_invsqrt_ps(rsqB);
-            rB          = _mm_mul_ps(rsqB, rinvB);
-
-            /* Compute raj_inv for aj1-4 and aj5-8 */
-            raj_inv     = gmx_mm_inv_ps(raj);
-            raj_invB    = gmx_mm_inv_ps(rajB);
-
-            /* Evaluate influence of atom aj -> ai */
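-            /* Same descreening-integral evaluation as in the
-             * double-precision kernel above, unrolled over two SSE
-             * registers (A: aj1-4, B: aj5-8).
-             */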
-            t1            = _mm_add_ps(r, sk_aj);
-            t2            = _mm_sub_ps(r, sk_aj);
-            t3            = _mm_sub_ps(sk_aj, r);
-            t1B           = _mm_add_ps(rB, sk_ajB);
-            t2B           = _mm_sub_ps(rB, sk_ajB);
-            t3B           = _mm_sub_ps(sk_ajB, rB);
-            obc_mask1     = _mm_cmplt_ps(rai, t1);
-            obc_mask2     = _mm_cmplt_ps(rai, t2);
-            obc_mask3     = _mm_cmplt_ps(rai, t3);
-            obc_mask1B    = _mm_cmplt_ps(rai, t1B);
-            obc_mask2B    = _mm_cmplt_ps(rai, t2B);
-            obc_mask3B    = _mm_cmplt_ps(rai, t3B);
-
-            uij           = gmx_mm_inv_ps(t1);
-            lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
-                                         _mm_andnot_ps(obc_mask2, rai_inv));
-            dlij          = _mm_and_ps(one, obc_mask2);
-            uij2          = _mm_mul_ps(uij, uij);
-            uij3          = _mm_mul_ps(uij2, uij);
-            lij2          = _mm_mul_ps(lij, lij);
-            lij3          = _mm_mul_ps(lij2, lij);
-
-            uijB          = gmx_mm_inv_ps(t1B);
-            lijB          = _mm_or_ps(   _mm_and_ps(obc_mask2B, gmx_mm_inv_ps(t2B)),
-                                         _mm_andnot_ps(obc_mask2B, rai_inv));
-            dlijB         = _mm_and_ps(one, obc_mask2B);
-            uij2B         = _mm_mul_ps(uijB, uijB);
-            uij3B         = _mm_mul_ps(uij2B, uijB);
-            lij2B         = _mm_mul_ps(lijB, lijB);
-            lij3B         = _mm_mul_ps(lij2B, lijB);
-
-            diff2         = _mm_sub_ps(uij2, lij2);
-            lij_inv       = gmx_mm_invsqrt_ps(lij2);
-            sk2_aj        = _mm_mul_ps(sk_aj, sk_aj);
-            sk2_rinv      = _mm_mul_ps(sk2_aj, rinv);
-            prod          = _mm_mul_ps(onefourth, sk2_rinv);
-
-            diff2B        = _mm_sub_ps(uij2B, lij2B);
-            lij_invB      = gmx_mm_invsqrt_ps(lij2B);
-            sk2_ajB       = _mm_mul_ps(sk_ajB, sk_ajB);
-            sk2_rinvB     = _mm_mul_ps(sk2_ajB, rinvB);
-            prodB         = _mm_mul_ps(onefourth, sk2_rinvB);
-
-            logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
-            logtermB      = gmx_mm_log_ps(_mm_mul_ps(uijB, lij_invB));
-
-            t1            = _mm_sub_ps(lij, uij);
-            t2            = _mm_mul_ps(diff2,
-                                       _mm_sub_ps(_mm_mul_ps(onefourth, r),
-                                                  prod));
-            t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
-            t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
-            t4            = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
-            t4            = _mm_and_ps(t4, obc_mask3);
-            t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
-
-            t1B           = _mm_sub_ps(lijB, uijB);
-            t2B           = _mm_mul_ps(diff2B,
-                                       _mm_sub_ps(_mm_mul_ps(onefourth, rB),
-                                                  prodB));
-            t3B           = _mm_mul_ps(half, _mm_mul_ps(rinvB, logtermB));
-            t1B           = _mm_add_ps(t1B, _mm_add_ps(t2B, t3B));
-            t4B           = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lijB));
-            t4B           = _mm_and_ps(t4B, obc_mask3B);
-            t1B           = _mm_mul_ps(half, _mm_add_ps(t1B, t4B));
-
-            sum_ai        = _mm_add_ps(sum_ai, _mm_add_ps( _mm_and_ps(t1, obc_mask1), _mm_and_ps(t1B, obc_mask1B) ));
-
-            t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
-                                       _mm_mul_ps(prod, lij3));
-            t1            = _mm_sub_ps(t1,
-                                       _mm_mul_ps(onefourth,
-                                                  _mm_add_ps(_mm_mul_ps(lij, rinv),
-                                                             _mm_mul_ps(lij3, r))));
-            t2            = _mm_mul_ps(onefourth,
-                                       _mm_add_ps(_mm_mul_ps(uij, rinv),
-                                                  _mm_mul_ps(uij3, r)));
-            t2            = _mm_sub_ps(t2,
-                                       _mm_add_ps(_mm_mul_ps(half, uij2),
-                                                  _mm_mul_ps(prod, uij3)));
-            t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
-                                       _mm_mul_ps(rinv, rinv));
-            t3            = _mm_sub_ps(t3,
-                                       _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
-                                                  _mm_add_ps(one,
-                                                             _mm_mul_ps(sk2_rinv, rinv))));
-            t1            = _mm_mul_ps(rinv,
-                                       _mm_add_ps(_mm_mul_ps(dlij, t1),
-                                                  _mm_add_ps(t2, t3)));
-
-
-
-            t1B           = _mm_add_ps(_mm_mul_ps(half, lij2B),
-                                       _mm_mul_ps(prodB, lij3B));
-            t1B           = _mm_sub_ps(t1B,
-                                       _mm_mul_ps(onefourth,
-                                                  _mm_add_ps(_mm_mul_ps(lijB, rinvB),
-                                                             _mm_mul_ps(lij3B, rB))));
-            t2B           = _mm_mul_ps(onefourth,
-                                       _mm_add_ps(_mm_mul_ps(uijB, rinvB),
-                                                  _mm_mul_ps(uij3B, rB)));
-            t2B           = _mm_sub_ps(t2B,
-                                       _mm_add_ps(_mm_mul_ps(half, uij2B),
-                                                  _mm_mul_ps(prodB, uij3B)));
-            t3B           = _mm_mul_ps(_mm_mul_ps(onefourth, logtermB),
-                                       _mm_mul_ps(rinvB, rinvB));
-            t3B           = _mm_sub_ps(t3B,
-                                       _mm_mul_ps(_mm_mul_ps(diff2B, oneeighth),
-                                                  _mm_add_ps(one,
-                                                             _mm_mul_ps(sk2_rinvB, rinvB))));
-            t1B           = _mm_mul_ps(rinvB,
-                                       _mm_add_ps(_mm_mul_ps(dlijB, t1B),
-                                                  _mm_add_ps(t2B, t3B)));
-
-            dadx1         = _mm_and_ps(t1, obc_mask1);
-            dadx1B        = _mm_and_ps(t1B, obc_mask1B);
-
-
-            /* Evaluate influence of atom ai -> aj */
-            t1            = _mm_add_ps(r, sk_ai);
-            t2            = _mm_sub_ps(r, sk_ai);
-            t3            = _mm_sub_ps(sk_ai, r);
-            t1B           = _mm_add_ps(rB, sk_ai);
-            t2B           = _mm_sub_ps(rB, sk_ai);
-            t3B           = _mm_sub_ps(sk_ai, rB);
-            obc_mask1     = _mm_cmplt_ps(raj, t1);
-            obc_mask2     = _mm_cmplt_ps(raj, t2);
-            obc_mask3     = _mm_cmplt_ps(raj, t3);
-            obc_mask1B    = _mm_cmplt_ps(rajB, t1B);
-            obc_mask2B    = _mm_cmplt_ps(rajB, t2B);
-            obc_mask3B    = _mm_cmplt_ps(rajB, t3B);
-
-            uij           = gmx_mm_inv_ps(t1);
-            lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
-                                         _mm_andnot_ps(obc_mask2, raj_inv));
-            dlij          = _mm_and_ps(one, obc_mask2);
-            uij2          = _mm_mul_ps(uij, uij);
-            uij3          = _mm_mul_ps(uij2, uij);
-            lij2          = _mm_mul_ps(lij, lij);
-            lij3          = _mm_mul_ps(lij2, lij);
-
-            uijB          = gmx_mm_inv_ps(t1B);
-            lijB          = _mm_or_ps(   _mm_and_ps(obc_mask2B, gmx_mm_inv_ps(t2B)),
-                                         _mm_andnot_ps(obc_mask2B, raj_invB));
-            dlijB         = _mm_and_ps(one, obc_mask2B);
-            uij2B         = _mm_mul_ps(uijB, uijB);
-            uij3B         = _mm_mul_ps(uij2B, uijB);
-            lij2B         = _mm_mul_ps(lijB, lijB);
-            lij3B         = _mm_mul_ps(lij2B, lijB);
-
-            diff2         = _mm_sub_ps(uij2, lij2);
-            lij_inv       = gmx_mm_invsqrt_ps(lij2);
-            sk2_rinv      = _mm_mul_ps(sk2_ai, rinv);
-            prod          = _mm_mul_ps(onefourth, sk2_rinv);
-
-            diff2B        = _mm_sub_ps(uij2B, lij2B);
-            lij_invB      = gmx_mm_invsqrt_ps(lij2B);
-            sk2_rinvB     = _mm_mul_ps(sk2_ai, rinvB);
-            prodB         = _mm_mul_ps(onefourth, sk2_rinvB);
-
-            logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
-            logtermB      = gmx_mm_log_ps(_mm_mul_ps(uijB, lij_invB));
-
-            t1            = _mm_sub_ps(lij, uij);
-            t2            = _mm_mul_ps(diff2,
-                                       _mm_sub_ps(_mm_mul_ps(onefourth, r),
-                                                  prod));
-            t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
-            t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
-            t4            = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
-            t4            = _mm_and_ps(t4, obc_mask3);
-            t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
-
-            t1B           = _mm_sub_ps(lijB, uijB);
-            t2B           = _mm_mul_ps(diff2B,
-                                       _mm_sub_ps(_mm_mul_ps(onefourth, rB),
-                                                  prodB));
-            t3B           = _mm_mul_ps(half, _mm_mul_ps(rinvB, logtermB));
-            t1B           = _mm_add_ps(t1B, _mm_add_ps(t2B, t3B));
-            t4B           = _mm_mul_ps(two, _mm_sub_ps(raj_invB, lijB));
-            t4B           = _mm_and_ps(t4B, obc_mask3B);
-            t1B           = _mm_mul_ps(half, _mm_add_ps(t1B, t4B));
-
-            GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_and_ps(t1, obc_mask1));
-            GMX_MM_INCREMENT_4VALUES_PS(work+jnrE, work+jnrF, work+jnrG, work+jnrH, _mm_and_ps(t1B, obc_mask1B));
-
-            t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
-                                       _mm_mul_ps(prod, lij3));
-            t1            = _mm_sub_ps(t1,
-                                       _mm_mul_ps(onefourth,
-                                                  _mm_add_ps(_mm_mul_ps(lij, rinv),
-                                                             _mm_mul_ps(lij3, r))));
-            t2            = _mm_mul_ps(onefourth,
-                                       _mm_add_ps(_mm_mul_ps(uij, rinv),
-                                                  _mm_mul_ps(uij3, r)));
-            t2            = _mm_sub_ps(t2,
-                                       _mm_add_ps(_mm_mul_ps(half, uij2),
-                                                  _mm_mul_ps(prod, uij3)));
-            t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
-                                       _mm_mul_ps(rinv, rinv));
-            t3            = _mm_sub_ps(t3,
-                                       _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
-                                                  _mm_add_ps(one,
-                                                             _mm_mul_ps(sk2_rinv, rinv))));
-            t1            = _mm_mul_ps(rinv,
-                                       _mm_add_ps(_mm_mul_ps(dlij, t1),
-                                                  _mm_add_ps(t2, t3)));
-
-
-            t1B           = _mm_add_ps(_mm_mul_ps(half, lij2B),
-                                       _mm_mul_ps(prodB, lij3B));
-            t1B           = _mm_sub_ps(t1B,
-                                       _mm_mul_ps(onefourth,
-                                                  _mm_add_ps(_mm_mul_ps(lijB, rinvB),
-                                                             _mm_mul_ps(lij3B, rB))));
-            t2B           = _mm_mul_ps(onefourth,
-                                       _mm_add_ps(_mm_mul_ps(uijB, rinvB),
-                                                  _mm_mul_ps(uij3B, rB)));
-            t2B           = _mm_sub_ps(t2B,
-                                       _mm_add_ps(_mm_mul_ps(half, uij2B),
-                                                  _mm_mul_ps(prodB, uij3B)));
-            t3B           = _mm_mul_ps(_mm_mul_ps(onefourth, logtermB),
-                                       _mm_mul_ps(rinvB, rinvB));
-            t3B           = _mm_sub_ps(t3B,
-                                       _mm_mul_ps(_mm_mul_ps(diff2B, oneeighth),
-                                                  _mm_add_ps(one,
-                                                             _mm_mul_ps(sk2_rinvB, rinvB))));
-            t1B           = _mm_mul_ps(rinvB,
-                                       _mm_add_ps(_mm_mul_ps(dlijB, t1B),
-                                                  _mm_add_ps(t2B, t3B)));
-
-
-            dadx2         = _mm_and_ps(t1, obc_mask1);
-            dadx2B        = _mm_and_ps(t1B, obc_mask1B);
-
-            _mm_store_ps(dadx, dadx1);
-            dadx += 4;
-            _mm_store_ps(dadx, dadx2);
-            dadx += 4;
-            _mm_store_ps(dadx, dadx1B);
-            dadx += 4;
-            _mm_store_ps(dadx, dadx2B);
-            dadx += 4;
-
-        } /* end unrolled (8-wide) inner loop */
-
-        for (; k < nj1-offset; k += 4)
-        {
-            jnrA        = jjnr[k];
-            jnrB        = jjnr[k+1];
-            jnrC        = jjnr[k+2];
-            jnrD        = jjnr[k+3];
-
-            j3A         = 3*jnrA;
-            j3B         = 3*jnrB;
-            j3C         = 3*jnrC;
-            j3D         = 3*jnrD;
-
-            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
-            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
-            GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, obc_param+jnrD, sk_aj);
-
-            dx    = _mm_sub_ps(ix, jx);
-            dy    = _mm_sub_ps(iy, jy);
-            dz    = _mm_sub_ps(iz, jz);
-
-            rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
-
-            rinv        = gmx_mm_invsqrt_ps(rsq);
-            r           = _mm_mul_ps(rsq, rinv);
-
-            /* Compute raj_inv for aj1-4 */
-            raj_inv     = gmx_mm_inv_ps(raj);
-
-            /* Evaluate influence of atom aj -> ai */
-            t1            = _mm_add_ps(r, sk_aj);
-            obc_mask1     = _mm_cmplt_ps(rai, t1);
-
-            if (_mm_movemask_ps(obc_mask1))
-            {
-                /* If any element has rai < r+sk_aj, this branch is executed */
-                t2            = _mm_sub_ps(r, sk_aj);
-                t3            = _mm_sub_ps(sk_aj, r);
-
-                obc_mask2     = _mm_cmplt_ps(rai, t2);
-                obc_mask3     = _mm_cmplt_ps(rai, t3);
-
-                uij           = gmx_mm_inv_ps(t1);
-                lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
-                                             _mm_andnot_ps(obc_mask2, rai_inv));
-                dlij          = _mm_and_ps(one, obc_mask2);
-                uij2          = _mm_mul_ps(uij, uij);
-                uij3          = _mm_mul_ps(uij2, uij);
-                lij2          = _mm_mul_ps(lij, lij);
-                lij3          = _mm_mul_ps(lij2, lij);
-                diff2         = _mm_sub_ps(uij2, lij2);
-                lij_inv       = gmx_mm_invsqrt_ps(lij2);
-                sk2_aj        = _mm_mul_ps(sk_aj, sk_aj);
-                sk2_rinv      = _mm_mul_ps(sk2_aj, rinv);
-                prod          = _mm_mul_ps(onefourth, sk2_rinv);
-                logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
-                t1            = _mm_sub_ps(lij, uij);
-                t2            = _mm_mul_ps(diff2,
-                                           _mm_sub_ps(_mm_mul_ps(onefourth, r),
-                                                      prod));
-                t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
-                t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
-                t4            = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
-                t4            = _mm_and_ps(t4, obc_mask3);
-                t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
-                sum_ai        = _mm_add_ps(sum_ai, _mm_and_ps(t1, obc_mask1));
-                t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
-                                           _mm_mul_ps(prod, lij3));
-                t1            = _mm_sub_ps(t1,
-                                           _mm_mul_ps(onefourth,
-                                                      _mm_add_ps(_mm_mul_ps(lij, rinv),
-                                                                 _mm_mul_ps(lij3, r))));
-                t2            = _mm_mul_ps(onefourth,
-                                           _mm_add_ps(_mm_mul_ps(uij, rinv),
-                                                      _mm_mul_ps(uij3, r)));
-                t2            = _mm_sub_ps(t2,
-                                           _mm_add_ps(_mm_mul_ps(half, uij2),
-                                                      _mm_mul_ps(prod, uij3)));
-                t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
-                                           _mm_mul_ps(rinv, rinv));
-                t3            = _mm_sub_ps(t3,
-                                           _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
-                                                      _mm_add_ps(one,
-                                                                 _mm_mul_ps(sk2_rinv, rinv))));
-                t1            = _mm_mul_ps(rinv,
-                                           _mm_add_ps(_mm_mul_ps(dlij, t1),
-                                                      _mm_add_ps(t2, t3)));
-
-                dadx1         = _mm_and_ps(t1, obc_mask1);
-            }
-            else
-            {
-                dadx1         = _mm_setzero_ps();
-            }
-
-            /* Evaluate influence of atom ai -> aj */
-            t1            = _mm_add_ps(r, sk_ai);
-            obc_mask1     = _mm_cmplt_ps(raj, t1);
-
-            if (_mm_movemask_ps(obc_mask1))
-            {
-                t2            = _mm_sub_ps(r, sk_ai);
-                t3            = _mm_sub_ps(sk_ai, r);
-                obc_mask2     = _mm_cmplt_ps(raj, t2);
-                obc_mask3     = _mm_cmplt_ps(raj, t3);
-
-                uij           = gmx_mm_inv_ps(t1);
-                lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
-                                             _mm_andnot_ps(obc_mask2, raj_inv));
-                dlij          = _mm_and_ps(one, obc_mask2);
-                uij2          = _mm_mul_ps(uij, uij);
-                uij3          = _mm_mul_ps(uij2, uij);
-                lij2          = _mm_mul_ps(lij, lij);
-                lij3          = _mm_mul_ps(lij2, lij);
-                diff2         = _mm_sub_ps(uij2, lij2);
-                lij_inv       = gmx_mm_invsqrt_ps(lij2);
-                sk2_rinv      = _mm_mul_ps(sk2_ai, rinv);
-                prod          = _mm_mul_ps(onefourth, sk2_rinv);
-                logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
-                t1            = _mm_sub_ps(lij, uij);
-                t2            = _mm_mul_ps(diff2,
-                                           _mm_sub_ps(_mm_mul_ps(onefourth, r),
-                                                      prod));
-                t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
-                t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
-                t4            = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
-                t4            = _mm_and_ps(t4, obc_mask3);
-                t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
-
-                GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_and_ps(t1, obc_mask1));
-
-                t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
-                                           _mm_mul_ps(prod, lij3));
-                t1            = _mm_sub_ps(t1,
-                                           _mm_mul_ps(onefourth,
-                                                      _mm_add_ps(_mm_mul_ps(lij, rinv),
-                                                                 _mm_mul_ps(lij3, r))));
-                t2            = _mm_mul_ps(onefourth,
-                                           _mm_add_ps(_mm_mul_ps(uij, rinv),
-                                                      _mm_mul_ps(uij3, r)));
-                t2            = _mm_sub_ps(t2,
-                                           _mm_add_ps(_mm_mul_ps(half, uij2),
-                                                      _mm_mul_ps(prod, uij3)));
-                t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
-                                           _mm_mul_ps(rinv, rinv));
-                t3            = _mm_sub_ps(t3,
-                                           _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
-                                                      _mm_add_ps(one,
-                                                                 _mm_mul_ps(sk2_rinv, rinv))));
-                t1            = _mm_mul_ps(rinv,
-                                           _mm_add_ps(_mm_mul_ps(dlij, t1),
-                                                      _mm_add_ps(t2, t3)));
-                dadx2         = _mm_and_ps(t1, obc_mask1);
-            }
-            else
-            {
-                dadx2         = _mm_setzero_ps();
-            }
-
-            _mm_store_ps(dadx, dadx1);
-            dadx += 4;
-            _mm_store_ps(dadx, dadx2);
-            dadx += 4;
-        } /* end 4-wide inner loop */
-
-        if (offset != 0)
-        {
-            if (offset == 1)
-            {
-                jnrA        = jjnr[k];
-                j3A         = 3*jnrA;
-                GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
-                GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA, raj);
-                GMX_MM_LOAD_1VALUE_PS(obc_param+jnrA, sk_aj);
-                mask        = mask1;
-            }
-            else if (offset == 2)
-            {
-                jnrA        = jjnr[k];
-                jnrB        = jjnr[k+1];
-                j3A         = 3*jnrA;
-                j3B         = 3*jnrB;
-                GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
-                GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, raj);
-                GMX_MM_LOAD_2VALUES_PS(obc_param+jnrA, obc_param+jnrB, sk_aj);
-                mask        = mask2;
-            }
-            else
-            {
-                /* offset must be 3 */
-                jnrA        = jjnr[k];
-                jnrB        = jjnr[k+1];
-                jnrC        = jjnr[k+2];
-                j3A         = 3*jnrA;
-                j3B         = 3*jnrB;
-                j3C         = 3*jnrC;
-                GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
-                GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, raj);
-                GMX_MM_LOAD_3VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, sk_aj);
-                mask        = mask3;
-            }
-
-            dx    = _mm_sub_ps(ix, jx);
-            dy    = _mm_sub_ps(iy, jy);
-            dz    = _mm_sub_ps(iz, jz);
-
-            rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
-
-            rinv        = gmx_mm_invsqrt_ps(rsq);
-            r           = _mm_mul_ps(rsq, rinv);
-
-            /* Compute raj_inv for the remaining aj (1-3 values) */
-            raj_inv     = gmx_mm_inv_ps(raj);
-
-            /* Evaluate influence of atom aj -> ai */
-            t1            = _mm_add_ps(r, sk_aj);
-            obc_mask1     = _mm_cmplt_ps(rai, t1);
-            obc_mask1     = _mm_and_ps(obc_mask1, mask);
-
-            if (_mm_movemask_ps(obc_mask1))
-            {
-                t2            = _mm_sub_ps(r, sk_aj);
-                t3            = _mm_sub_ps(sk_aj, r);
-                obc_mask2     = _mm_cmplt_ps(rai, t2);
-                obc_mask3     = _mm_cmplt_ps(rai, t3);
-
-                uij           = gmx_mm_inv_ps(t1);
-                lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
-                                             _mm_andnot_ps(obc_mask2, rai_inv));
-                dlij          = _mm_and_ps(one, obc_mask2);
-                uij2          = _mm_mul_ps(uij, uij);
-                uij3          = _mm_mul_ps(uij2, uij);
-                lij2          = _mm_mul_ps(lij, lij);
-                lij3          = _mm_mul_ps(lij2, lij);
-                diff2         = _mm_sub_ps(uij2, lij2);
-                lij_inv       = gmx_mm_invsqrt_ps(lij2);
-                sk2_aj        = _mm_mul_ps(sk_aj, sk_aj);
-                sk2_rinv      = _mm_mul_ps(sk2_aj, rinv);
-                prod          = _mm_mul_ps(onefourth, sk2_rinv);
-                logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
-                t1            = _mm_sub_ps(lij, uij);
-                t2            = _mm_mul_ps(diff2,
-                                           _mm_sub_ps(_mm_mul_ps(onefourth, r),
-                                                      prod));
-                t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
-                t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
-                t4            = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
-                t4            = _mm_and_ps(t4, obc_mask3);
-                t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
-                sum_ai        = _mm_add_ps(sum_ai, _mm_and_ps(t1, obc_mask1));
-                t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
-                                           _mm_mul_ps(prod, lij3));
-                t1            = _mm_sub_ps(t1,
-                                           _mm_mul_ps(onefourth,
-                                                      _mm_add_ps(_mm_mul_ps(lij, rinv),
-                                                                 _mm_mul_ps(lij3, r))));
-                t2            = _mm_mul_ps(onefourth,
-                                           _mm_add_ps(_mm_mul_ps(uij, rinv),
-                                                      _mm_mul_ps(uij3, r)));
-                t2            = _mm_sub_ps(t2,
-                                           _mm_add_ps(_mm_mul_ps(half, uij2),
-                                                      _mm_mul_ps(prod, uij3)));
-                t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
-                                           _mm_mul_ps(rinv, rinv));
-                t3            = _mm_sub_ps(t3,
-                                           _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
-                                                      _mm_add_ps(one,
-                                                                 _mm_mul_ps(sk2_rinv, rinv))));
-                t1            = _mm_mul_ps(rinv,
-                                           _mm_add_ps(_mm_mul_ps(dlij, t1),
-                                                      _mm_add_ps(t2, t3)));
-                dadx1         = _mm_and_ps(t1, obc_mask1);
-            }
-            else
-            {
-                dadx1         = _mm_setzero_ps();
-            }
-
-            /* Evaluate influence of atom ai -> aj */
-            t1            = _mm_add_ps(r, sk_ai);
-            obc_mask1     = _mm_cmplt_ps(raj, t1);
-            obc_mask1     = _mm_and_ps(obc_mask1, mask);
-
-            if (_mm_movemask_ps(obc_mask1))
-            {
-                t2            = _mm_sub_ps(r, sk_ai);
-                t3            = _mm_sub_ps(sk_ai, r);
-                obc_mask2     = _mm_cmplt_ps(raj, t2);
-                obc_mask3     = _mm_cmplt_ps(raj, t3);
-
-                uij           = gmx_mm_inv_ps(t1);
-                lij           = _mm_or_ps(_mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
-                                          _mm_andnot_ps(obc_mask2, raj_inv));
-                dlij          = _mm_and_ps(one, obc_mask2);
-                uij2          = _mm_mul_ps(uij, uij);
-                uij3          = _mm_mul_ps(uij2, uij);
-                lij2          = _mm_mul_ps(lij, lij);
-                lij3          = _mm_mul_ps(lij2, lij);
-                diff2         = _mm_sub_ps(uij2, lij2);
-                lij_inv       = gmx_mm_invsqrt_ps(lij2);
-                sk2_rinv      = _mm_mul_ps(sk2_ai, rinv);
-                prod          = _mm_mul_ps(onefourth, sk2_rinv);
-                logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
-                t1            = _mm_sub_ps(lij, uij);
-                t2            = _mm_mul_ps(diff2,
-                                           _mm_sub_ps(_mm_mul_ps(onefourth, r),
-                                                      prod));
-                t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
-                t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
-                t4            = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
-                t4            = _mm_and_ps(t4, obc_mask3);
-                t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
-
-                tmp           = _mm_and_ps(t1, obc_mask1);
-
-                t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
-                                           _mm_mul_ps(prod, lij3));
-                t1            = _mm_sub_ps(t1,
-                                           _mm_mul_ps(onefourth,
-                                                      _mm_add_ps(_mm_mul_ps(lij, rinv),
-                                                                 _mm_mul_ps(lij3, r))));
-                t2            = _mm_mul_ps(onefourth,
-                                           _mm_add_ps(_mm_mul_ps(uij, rinv),
-                                                      _mm_mul_ps(uij3, r)));
-                t2            = _mm_sub_ps(t2,
-                                           _mm_add_ps(_mm_mul_ps(half, uij2),
-                                                      _mm_mul_ps(prod, uij3)));
-                t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
-                                           _mm_mul_ps(rinv, rinv));
-                t3            = _mm_sub_ps(t3,
-                                           _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
-                                                      _mm_add_ps(one,
-                                                                 _mm_mul_ps(sk2_rinv, rinv))));
-                t1            = _mm_mul_ps(rinv,
-                                           _mm_add_ps(_mm_mul_ps(dlij, t1),
-                                                      _mm_add_ps(t2, t3)));
-                dadx2         = _mm_and_ps(t1, obc_mask1);
-            }
-            else
-            {
-                dadx2         = _mm_setzero_ps();
-                tmp           = _mm_setzero_ps();
-            }
-
-            _mm_store_ps(dadx, dadx1);
-            dadx += 4;
-            _mm_store_ps(dadx, dadx2);
-            dadx += 4;
-
-            if (offset == 1)
-            {
-                GMX_MM_INCREMENT_1VALUE_PS(work+jnrA, tmp);
-            }
-            else if (offset == 2)
-            {
-                GMX_MM_INCREMENT_2VALUES_PS(work+jnrA, work+jnrB, tmp);
-            }
-            else
-            {
-                /* offset must be 3 */
-                GMX_MM_INCREMENT_3VALUES_PS(work+jnrA, work+jnrB, work+jnrC, tmp);
-            }
-
-        }
-        GMX_MM_UPDATE_1POT_PS(sum_ai, work+ii);
-
-    }
-
-    /* Parallel summations */
-    if (DOMAINDECOMP(cr))
-    {
-        dd_atom_sum_real(cr->dd, work);
-    }
-
-    if (gb_algorithm == egbHCT)
-    {
-        /* HCT */
-        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
-        {
-            if (born->use[i] != 0)
-            {
-                rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
-                sum     = 1.0/rr - work[i];
-                min_rad = rr + doffset;
-                rad     = 1.0/sum;
-
-                born->bRad[i]   = rad > min_rad ? rad : min_rad;
-                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
-            }
-        }
-
-        /* Extra communication required for DD */
-        if (DOMAINDECOMP(cr))
-        {
-            dd_atom_spread_real(cr->dd, born->bRad);
-            dd_atom_spread_real(cr->dd, fr->invsqrta);
-        }
-    }
-    else
-    {
-        /* OBC */
-        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
-        {
-            if (born->use[i] != 0)
-            {
-                rr      = top->atomtypes.gb_radius[md->typeA[i]];
-                rr_inv2 = 1.0/rr;
-                rr      = rr-doffset;
-                rr_inv  = 1.0/rr;
-                sum     = rr * work[i];
-                sum2    = sum  * sum;
-                sum3    = sum2 * sum;
-
-                tsum          = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
-                born->bRad[i] = rr_inv - tsum*rr_inv2;
-                born->bRad[i] = 1.0 / born->bRad[i];
-
-                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
-
-                tchain         = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
-                born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
-            }
-        }
-        /* Extra (local) communication required for DD */
-        if (DOMAINDECOMP(cr))
-        {
-            dd_atom_spread_real(cr->dd, born->bRad);
-            dd_atom_spread_real(cr->dd, fr->invsqrta);
-            dd_atom_spread_real(cr->dd, born->drobc);
-        }
-    }
-
-    return 0;
-}
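
In scalar terms, the radius kernel deleted above evaluates, for each pair within range, the HCT descreening integral (a sketch of the math the intrinsics encode; L and U are the lower and upper integration limits, r the pair distance, a_i the atomic radius, and s_j the scaled radius from obc_param):

    I_{ij} = \frac{1}{2}\left[\frac{1}{L} - \frac{1}{U}
             + \frac{r}{4}\left(\frac{1}{U^{2}} - \frac{1}{L^{2}}\right)
             + \frac{1}{2r}\ln\frac{L}{U}
             + \frac{s_j^{2}}{4r}\left(\frac{1}{L^{2}} - \frac{1}{U^{2}}\right)\right]

with U = r + s_j, 1/L selected between 1/(r - s_j) and 1/a_i by obc_mask2, and the 2(1/a_i - 1/L) correction gated by obc_mask3 when s_j - r > a_i. The scalar epilogue then turns the accumulated sum I_i (in work[i]) into Born radii: HCT uses R_i = \max\bigl((\tilde{\rho}_i^{-1} - I_i)^{-1}, \rho_i\bigr), while OBC applies

    R_i^{-1} = \tilde{\rho}_i^{-1} - \rho_i^{-1}\,\tanh\!\left(\alpha\Psi_i - \beta\Psi_i^{2} + \gamma\Psi_i^{3}\right),
    \qquad \Psi_i = \tilde{\rho}_i I_i

where \rho_i is the atomic gb_radius, \tilde{\rho}_i = \rho_i - doffset, and \alpha, \beta, \gamma are born->obc_alpha/beta/gamma; drobc[i] caches the tanh chain-rule factor used by the force pass below.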
-
-
-
-float calc_gb_chainrule_sse2_single(int natoms, t_nblist *nl, float *dadx, float *dvda,
-                                    float *x, float *f, float *fshift, float *shiftvec,
-                                    int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
-{
-    int          i, k, n, ii, jnr, ii3, is3, nj0, nj1, offset, n0, n1;
-    int          jnrA, jnrB, jnrC, jnrD;
-    int          j3A, j3B, j3C, j3D;
-    int          jnrE, jnrF, jnrG, jnrH;
-    int          j3E, j3F, j3G, j3H;
-    int       *  jjnr;
-
-    float        rbi, shX, shY, shZ;
-    float       *rb;
-
-    __m128       ix, iy, iz;
-    __m128       jx, jy, jz;
-    __m128       jxB, jyB, jzB;
-    __m128       fix, fiy, fiz;
-    __m128       dx, dy, dz;
-    __m128       tx, ty, tz;
-    __m128       dxB, dyB, dzB;
-    __m128       txB, tyB, tzB;
-
-    __m128       rbai, rbaj, rbajB, f_gb, f_gb_ai, f_gbB, f_gb_aiB;
-    __m128       xmm1, xmm2, xmm3;
-
-    const __m128 two = _mm_set1_ps(2.0f);
-
-    rb     = born->work;
-
-    jjnr   = nl->jjnr;
-
-    /* Loop to get the proper form for the Born radius term, SSE style */
-    offset = natoms%4;
-
-    n0 = 0;
-    n1 = natoms;
-
-    if (gb_algorithm == egbSTILL)
-    {
-        for (i = n0; i < n1; i++)
-        {
-            rbi   = born->bRad[i];
-            rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
-        }
-    }
-    else if (gb_algorithm == egbHCT)
-    {
-        for (i = n0; i < n1; i++)
-        {
-            rbi   = born->bRad[i];
-            rb[i] = rbi * rbi * dvda[i];
-        }
-    }
-    else if (gb_algorithm == egbOBC)
-    {
-        for (i = n0; i < n1; i++)
-        {
-            rbi   = born->bRad[i];
-            rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
-        }
-    }
-
-    jz = _mm_setzero_ps();
-
-    n = j3A = j3B = j3C = j3D = 0;
-
-    for (i = 0; i < nl->nri; i++)
-    {
-        ii     = nl->iinr[i];
-        ii3    = ii*3;
-        is3    = 3*nl->shift[i];
-        shX    = shiftvec[is3];
-        shY    = shiftvec[is3+1];
-        shZ    = shiftvec[is3+2];
-        nj0    = nl->jindex[i];
-        nj1    = nl->jindex[i+1];
-
-        ix     = _mm_set1_ps(shX+x[ii3+0]);
-        iy     = _mm_set1_ps(shY+x[ii3+1]);
-        iz     = _mm_set1_ps(shZ+x[ii3+2]);
-
-        offset = (nj1-nj0)%4;
-
-        rbai   = _mm_load1_ps(rb+ii);
-        fix    = _mm_setzero_ps();
-        fiy    = _mm_setzero_ps();
-        fiz    = _mm_setzero_ps();
-
-
-        for (k = nj0; k < nj1-offset; k += 4)
-        {
-            jnrA        = jjnr[k];
-            jnrB        = jjnr[k+1];
-            jnrC        = jjnr[k+2];
-            jnrD        = jjnr[k+3];
-
-            j3A         = 3*jnrA;
-            j3B         = 3*jnrB;
-            j3C         = 3*jnrC;
-            j3D         = 3*jnrD;
-
-            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
-
-            dx          = _mm_sub_ps(ix, jx);
-            dy          = _mm_sub_ps(iy, jy);
-            dz          = _mm_sub_ps(iz, jz);
-
-            GMX_MM_LOAD_4VALUES_PS(rb+jnrA, rb+jnrB, rb+jnrC, rb+jnrD, rbaj);
-
-            /* load chain rule terms for j1-4 */
-            f_gb        = _mm_load_ps(dadx);
-            dadx       += 4;
-            f_gb_ai     = _mm_load_ps(dadx);
-            dadx       += 4;
-
-            /* calculate scalar force */
-            f_gb    = _mm_mul_ps(f_gb, rbai);
-            f_gb_ai = _mm_mul_ps(f_gb_ai, rbaj);
-            f_gb    = _mm_add_ps(f_gb, f_gb_ai);
-
-            tx     = _mm_mul_ps(f_gb, dx);
-            ty     = _mm_mul_ps(f_gb, dy);
-            tz     = _mm_mul_ps(f_gb, dz);
-
-            fix    = _mm_add_ps(fix, tx);
-            fiy    = _mm_add_ps(fiy, ty);
-            fiz    = _mm_add_ps(fiz, tz);
-
-            GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(f+j3A, f+j3B, f+j3C, f+j3D, tx, ty, tz);
-        }
-
-        /* Deal with the remaining 1-3 j elements */
-        if (offset != 0)
-        {
-            if (offset == 1)
-            {
-                jnrA        = jjnr[k];
-                j3A         = 3*jnrA;
-                GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
-                GMX_MM_LOAD_1VALUE_PS(rb+jnrA, rbaj);
-            }
-            else if (offset == 2)
-            {
-                jnrA        = jjnr[k];
-                jnrB        = jjnr[k+1];
-                j3A         = 3*jnrA;
-                j3B         = 3*jnrB;
-                GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
-                GMX_MM_LOAD_2VALUES_PS(rb+jnrA, rb+jnrB, rbaj);
-            }
-            else
-            {
-                /* offset must be 3 */
-                jnrA        = jjnr[k];
-                jnrB        = jjnr[k+1];
-                jnrC        = jjnr[k+2];
-                j3A         = 3*jnrA;
-                j3B         = 3*jnrB;
-                j3C         = 3*jnrC;
-                GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
-                GMX_MM_LOAD_3VALUES_PS(rb+jnrA, rb+jnrB, rb+jnrC, rbaj);
-            }
-
-            dx          = _mm_sub_ps(ix, jx);
-            dy          = _mm_sub_ps(iy, jy);
-            dz          = _mm_sub_ps(iz, jz);
-
-            /* Load chain rule terms for the remaining j elements */
-            f_gb        = _mm_load_ps(dadx);
-            dadx       += 4;
-            f_gb_ai     = _mm_load_ps(dadx);
-            dadx       += 4;
-
-            /* calculate scalar force */
-            f_gb    = _mm_mul_ps(f_gb, rbai);
-            f_gb_ai = _mm_mul_ps(f_gb_ai, rbaj);
-            f_gb    = _mm_add_ps(f_gb, f_gb_ai);
-
-            tx     = _mm_mul_ps(f_gb, dx);
-            ty     = _mm_mul_ps(f_gb, dy);
-            tz     = _mm_mul_ps(f_gb, dz);
-
-            fix    = _mm_add_ps(fix, tx);
-            fiy    = _mm_add_ps(fiy, ty);
-            fiz    = _mm_add_ps(fiz, tz);
-
-            if (offset == 1)
-            {
-                GMX_MM_DECREMENT_1RVEC_1POINTER_PS(f+j3A, tx, ty, tz);
-            }
-            else if (offset == 2)
-            {
-                GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(f+j3A, f+j3B, tx, ty, tz);
-            }
-            else
-            {
-                /* offset must be 3 */
-                GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(f+j3A, f+j3B, f+j3C, tx, ty, tz);
-            }
-        }
-
-        /* fix/fiy/fiz now contain four partial force terms that should all be
-         * added to the i particle forces and shift forces.
-         */
-        gmx_mm_update_iforce_1atom_ps(&fix, &fiy, &fiz, f+ii3, fshift+is3);
-    }
-
-    return 0;
-}
-
-
-#else
-/* keep compiler happy */
-int genborn_sse_dummy;
-
-#endif /* SSE intrinsics available */
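
For orientation, here is what the deleted chain-rule kernel computes per neighbor-list pair, reduced to scalar form (a minimal sketch, not the SIMD implementation: the flattened pair arrays and the helper name are hypothetical, and the periodic-shift and fshift bookkeeping of the real kernel is omitted):

    /* Scalar sketch of the force pass: dadx holds the pairwise
     * Born-radius derivative terms stored by the radius kernel, rb[]
     * the algorithm-dependent prefactor prepared above
     * (2*R^2*dvda/ONE_4PI_EPS0 for Still, R^2*dvda for HCT,
     * R^2*drobc*dvda for OBC). */
    static void gb_chainrule_scalar_sketch(int npairs, const int *ai, const int *aj,
                                           const float *dadx_ij, const float *dadx_ji,
                                           const float *rb, const float *x, float *f)
    {
        int   n, i, j;
        float dx, dy, dz, fgb;

        for (n = 0; n < npairs; n++)
        {
            i   = ai[n];
            j   = aj[n];
            dx  = x[3*i+0] - x[3*j+0];
            dy  = x[3*i+1] - x[3*j+1];
            dz  = x[3*i+2] - x[3*j+2];
            /* Both directions of the radius derivative contribute to
             * the scalar force along the ij vector. */
            fgb = dadx_ij[n]*rb[i] + dadx_ji[n]*rb[j];
            f[3*i+0] += fgb*dx;
            f[3*i+1] += fgb*dy;
            f[3*i+2] += fgb*dz;
            f[3*j+0] -= fgb*dx;
            f[3*j+1] -= fgb*dy;
            f[3*j+2] -= fgb*dz;
        }
    }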
diff --git a/src/gromacs/mdlib/genborn_sse2_single.h b/src/gromacs/mdlib/genborn_sse2_single.h
deleted file mode 100644 (file)
index 6753e0e..0000000
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2008, The GROMACS development team.
- * Copyright (c) 2013,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#ifndef _genborn_sse_h
-#define _genborn_sse_h
-
-#include "gromacs/legacyheaders/typedefs.h"
-
-float
-calc_gb_chainrule_sse2_single(int natoms, t_nblist *nl, float *dadx, float *dvda,
-                              float *xd, float *f, float *fshift, float *shift_vec,
-                              int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md);
-
-int
-calc_gb_rad_still_sse2_single(t_commrec *cr, t_forcerec *fr, int natoms, gmx_localtop_t *top,
-                              float *x, t_nblist *nl, gmx_genborn_t *born);
-
-int
-calc_gb_rad_hct_obc_sse2_single(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
-                                float *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm);
-
-#endif /* _genborn_sse_h */