Sort all includes in src/gromacs
diff --git a/src/gromacs/mdlib/genborn_allvsall_sse2_single.c b/src/gromacs/mdlib/genborn_allvsall_sse2_single.c
index d8bced605a93919507630a068ad05681faf4cf6c..8c3ce47c99fbb18cf09bbc58678599efa3c1b575 100644
--- a/src/gromacs/mdlib/genborn_allvsall_sse2_single.c
+++ b/src/gromacs/mdlib/genborn_allvsall_sse2_single.c
@@ -1,49 +1,52 @@
 /*
- *                This source code is part of
- *
- *                 G   R   O   M   A   C   S
+ * This file is part of the GROMACS molecular simulation package.
  *
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2009, The GROMACS Development Team
+ * Copyright (c) 2001-2009, The GROMACS Development Team.
+ * Copyright (c) 2012,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
  *
- * Gromacs is a library for molecular simulation and trajectory analysis,
- * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
- * a full list of developers and information, check out http://www.gromacs.org
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
  *
- * This program is free software; you can redistribute it and/or modify it under 
- * the terms of the GNU Lesser General Public License as published by the Free 
- * Software Foundation; either version 2 of the License, or (at your option) any 
- * later version.
- * As a special exception, you may use this file as part of a free software
- * library without restriction.  Specifically, if other files instantiate
- * templates or use macros or inline functions from this file, or you compile
- * this file and link it with other files to produce an executable, this
- * file does not by itself cause the resulting executable to be covered by
- * the GNU Lesser General Public License.  
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  *
- * In plain-speak: do not worry about classes/macros/templates either - only
- * changes to the library have to be LGPL, not an application linking with it.
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
  *
- * To help fund GROMACS development, we humbly ask that you cite
- * the papers people have written on it - you can find them on the website!
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
  */
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
+#include "gmxpre.h"
 
 #include <math.h>
-#include "types/simple.h"
 
-#include "vec.h"
-#include "smalloc.h"
+#include "gromacs/legacyheaders/genborn.h"
+#include "gromacs/legacyheaders/network.h"
+#include "gromacs/legacyheaders/types/simple.h"
+#include "gromacs/math/units.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdlib/genborn_allvsall.h"
+#include "gromacs/utility/smalloc.h"
 
-#include "partdec.h"
-#include "network.h"
-#include "physics.h"
-#include "genborn.h"
-#include "genborn_allvsall.h"
-
-#if 0 && defined (GMX_X86_SSE2)
+#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
 
 #include <gmx_sse2_single.h>
 
@@ -60,7 +63,7 @@
 
 
 
-typedef struct 
+typedef struct
 {
     int *      jindex_gb;
     int **     prologue_mask_gb;
@@ -73,17 +76,17 @@ typedef struct
     real *     y_align;
     real *     z_align;
     real *     fx_align;
-       real *     fy_align;
-       real *     fz_align;        
-} 
+    real *     fy_align;
+    real *     fz_align;
+}
 gmx_allvsallgb2_data_t;
 
 
-static int 
-calc_maxoffset(int i,int natoms)
+static int
+calc_maxoffset(int i, int natoms)
 {
     int maxoffset;
-    
+
     if ((natoms % 2) == 1)
     {
         /* Odd number of atoms, easy */
@@ -96,22 +99,22 @@ calc_maxoffset(int i,int natoms)
         {
             if ((i % 2) == 0)
             {
-                maxoffset=natoms/2;
+                maxoffset = natoms/2;
             }
             else
             {
-                maxoffset=natoms/2-1;
+                maxoffset = natoms/2-1;
             }
         }
         else
         {
             if ((i % 2) == 1)
             {
-                maxoffset=natoms/2;
+                maxoffset = natoms/2;
             }
             else
             {
-                maxoffset=natoms/2-1;
+                maxoffset = natoms/2-1;
             }
         }
     }
@@ -120,40 +123,40 @@ calc_maxoffset(int i,int natoms)
         /* natoms/2 = odd */
         if ((i % 2) == 0)
         {
-            maxoffset=natoms/2;
+            maxoffset = natoms/2;
         }
         else
         {
-            maxoffset=natoms/2-1;
+            maxoffset = natoms/2-1;
         }
     }
-    
+
     return maxoffset;
 }
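
calc_maxoffset() above implements the cyclic "half-shell" used by the all-vs-all kernels: with Newton's third law each pair is computed once, so atom i only handles the next maxoffset atoms in cyclic order. For even natoms the row at distance natoms/2 would otherwise be counted twice, so it is split by parity. A minimal stand-alone check of the pair-count invariant; this is a hypothetical test harness, not part of the patch, and the natoms % 4 and i < natoms/2 branches are taken from the full function in the file (elided between the hunks above):

    #include <assert.h>
    #include <stdio.h>

    /* Mirror of calc_maxoffset(): cyclic neighbors atom i is responsible for */
    static int maxoffset(int i, int natoms)
    {
        if ((natoms % 2) == 1)
        {
            return natoms/2;                        /* odd natoms: easy */
        }
        if ((natoms % 4) == 0)
        {
            /* multiple of four: split the half-way row by parity, flipping
             * the parity in the upper half so each pair is kept exactly once */
            if (i < natoms/2)
            {
                return ((i % 2) == 0) ? natoms/2 : natoms/2-1;
            }
            return ((i % 2) == 1) ? natoms/2 : natoms/2-1;
        }
        /* natoms/2 odd */
        return ((i % 2) == 0) ? natoms/2 : natoms/2-1;
    }

    int main(void)
    {
        int natoms, i, npairs;
        for (natoms = 2; natoms <= 64; natoms++)
        {
            npairs = 0;
            for (i = 0; i < natoms; i++)
            {
                npairs += maxoffset(i, natoms);
            }
            assert(npairs == natoms*(natoms-1)/2);  /* each pair exactly once */
        }
        printf("half-shell pair count verified\n");
        return 0;
    }
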
 
 static void
-setup_gb_exclusions_and_indices(gmx_allvsallgb2_data_t *   aadata,
-                                t_ilist *                  ilist,
-                                int                        start,
-                                int                        end,
-                                int                        natoms,
+setup_gb_exclusions_and_indices(gmx_allvsallgb2_data_t     *   aadata,
+                                t_ilist     *                  ilist,
+                                int                            start,
+                                int                            end,
+                                int                            natoms,
                                 gmx_bool                       bInclude12,
                                 gmx_bool                       bInclude13,
                                 gmx_bool                       bInclude14)
 {
-    int i,j,k,tp;
-    int a1,a2;
-    int ni0,ni1,nj0,nj1,nj;
-    int imin,imax,iexcl;
-    int max_offset;
-    int max_excl_offset;
-    int firstinteraction;
-    int ibase;
+    int   i, j, k, tp;
+    int   a1, a2;
+    int   ni0, ni1, nj0, nj1, nj;
+    int   imin, imax, iexcl;
+    int   max_offset;
+    int   max_excl_offset;
+    int   firstinteraction;
+    int   ibase;
     int  *pi;
-    
+
     /* This routine can appear to be a bit complex, but it is mostly book-keeping.
      * To enable the fast all-vs-all kernel we need to be able to stream through all coordinates
-     * whether they should interact or not. 
+     * whether they should interact or not.
      *
      * To avoid looping over the exclusions, we create a simple mask that is 1 if the interaction
      * should be present, otherwise 0. Since exclusions typically only occur when i & j are close,
@@ -161,122 +164,122 @@ setup_gb_exclusions_and_indices(gmx_allvsallgb2_data_t *   aadata,
      * which we need to check exclusions, and the end point.
      * This way we only have to allocate a short exclusion mask per i atom.
      */
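
Concretely, the book-keeping below produces four indices per i atom in jindex_gb, which the kernel later reads as nj0..nj3 and uses to split the j loop into three phases. A sketch of how those indices are consumed (illustrative names, not part of the patch):

    /* jindex_gb stores 4 ints per i atom:
     *   jindex[4*i]   start of prologue (first j tile that may hold exclusions)
     *   jindex[4*i+1] end of prologue = start of the unmasked main part
     *   jindex[4*i+2] start of epilogue (tiles straddling j == i + maxoffset)
     *   jindex[4*i+3] end of epilogue */
    static void loop_phases(const int *jindex, int i, int unrollj)
    {
        int j;
        for (j = jindex[4*i]; j < jindex[4*i+1]; j += unrollj)
        {
            /* prologue: per-pair masks from prologue_mask_gb[i] */
        }
        for (j = jindex[4*i+1]; j < jindex[4*i+2]; j += unrollj)
        {
            /* main part: no exclusions possible, no mask loads needed */
        }
        for (j = jindex[4*i+2]; j < jindex[4*i+3]; j += unrollj)
        {
            /* epilogue: epilogue_mask[i] enforces j <= i + maxoffset */
        }
    }
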
-    
+
     ni0 = (start/UNROLLI)*UNROLLI;
     ni1 = ((end+UNROLLI-1)/UNROLLI)*UNROLLI;
-    
+
     /* Set the interaction mask to only enable the i atoms we want to include */
-    snew(pi,natoms+UNROLLI+2*SIMD_WIDTH);
+    snew(pi, natoms+UNROLLI+2*SIMD_WIDTH);
     aadata->imask = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
-    for(i=0;i<natoms+UNROLLI;i++)
+    for (i = 0; i < natoms+UNROLLI; i++)
     {
-        aadata->imask[i] = (i>=start && i<end) ? 0xFFFFFFFF : 0;
+        aadata->imask[i] = (i >= start && i < end) ? 0xFFFFFFFF : 0;
     }
-    
+
     /* Allocate memory for our modified jindex array */
-    snew(aadata->jindex_gb,4*(natoms+UNROLLI));
-    for(i=0;i<4*(natoms+UNROLLI);i++)
+    snew(aadata->jindex_gb, 4*(natoms+UNROLLI));
+    for (i = 0; i < 4*(natoms+UNROLLI); i++)
     {
         aadata->jindex_gb[i] = 0;
     }
-    
+
     /* Create the exclusion masks for the prologue part */
-       snew(aadata->prologue_mask_gb,natoms+UNROLLI); /* list of pointers */
-       
+    snew(aadata->prologue_mask_gb, natoms+UNROLLI); /* list of pointers */
+
     /* First zero everything to avoid uninitialized data */
-    for(i=0;i<natoms+UNROLLI;i++)
+    for (i = 0; i < natoms+UNROLLI; i++)
     {
         aadata->prologue_mask_gb[i] = NULL;
     }
-    
+
     /* Calculate the largest exclusion range we need for each UNROLLI-tuplet of i atoms. */
-    for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
-       {
+    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
+    {
         max_excl_offset = -1;
-        
+
         /* First find maxoffset for the next 4 atoms (or fewer if we are close to end) */
         imax = ((ibase+UNROLLI) < end) ? (ibase+UNROLLI) : end;
-        
+
         /* Which atom is the first we (might) interact with? */
         imin = natoms; /* Guaranteed to be overwritten by one of 'firstinteraction' */
-        for(i=ibase;i<imax;i++)
+        for (i = ibase; i < imax; i++)
         {
             /* Before exclusions, which atom is the first we (might) interact with? */
             firstinteraction = i+1;
-            max_offset = calc_maxoffset(i,natoms);
+            max_offset       = calc_maxoffset(i, natoms);
 
-            if(!bInclude12)
+            if (!bInclude12)
             {
-                for(j=0;j<ilist[F_GB12].nr;j+=3)
+                for (j = 0; j < ilist[F_GB12].nr; j += 3)
                 {
                     a1 = ilist[F_GB12].iatoms[j+1];
                     a2 = ilist[F_GB12].iatoms[j+2];
 
-                    if(a1==i)
+                    if (a1 == i)
                     {
                         k = a2;
                     }
-                    else if(a2==i)
+                    else if (a2 == i)
                     {
                         k = a1;
                     }
-                    else 
+                    else
                     {
                         continue;
                     }
 
-                    if(k==firstinteraction)
+                    if (k == firstinteraction)
                     {
                         firstinteraction++;
                     }
                 }
             }
-            if(!bInclude13)
+            if (!bInclude13)
             {
-                for(j=0;j<ilist[F_GB13].nr;j+=3)
+                for (j = 0; j < ilist[F_GB13].nr; j += 3)
                 {
                     a1 = ilist[F_GB13].iatoms[j+1];
                     a2 = ilist[F_GB13].iatoms[j+2];
-                    
-                    if(a1==i)
+
+                    if (a1 == i)
                     {
                         k = a2;
                     }
-                    else if(a2==i)
+                    else if (a2 == i)
                     {
                         k = a1;
                     }
-                    else 
+                    else
                     {
                         continue;
                     }
-                    
-                    if(k==firstinteraction)
+
+                    if (k == firstinteraction)
                     {
                         firstinteraction++;
                     }
                 }
             }
-            if(!bInclude14)
+            if (!bInclude14)
             {
-                for(j=0;j<ilist[F_GB14].nr;j+=3)
+                for (j = 0; j < ilist[F_GB14].nr; j += 3)
                 {
                     a1 = ilist[F_GB14].iatoms[j+1];
                     a2 = ilist[F_GB14].iatoms[j+2];
-                    if(a1==i)
+                    if (a1 == i)
                     {
                         k = a2;
                     }
-                    else if(a2==i)
+                    else if (a2 == i)
                     {
                         k = a1;
                     }
-                    else 
+                    else
                     {
                         continue;
                     }
-                    
-                    if(k==firstinteraction)
+
+                    if (k == firstinteraction)
                     {
                         firstinteraction++;
                     }
@@ -287,123 +290,123 @@ setup_gb_exclusions_and_indices(gmx_allvsallgb2_data_t *   aadata,
         /* round down to j unrolling factor */
         imin = (imin/UNROLLJ)*UNROLLJ;
 
-        for(i=ibase;i<imax;i++)
+        for (i = ibase; i < imax; i++)
         {
-            max_offset = calc_maxoffset(i,natoms);
-            
-            if(!bInclude12)
+            max_offset = calc_maxoffset(i, natoms);
+
+            if (!bInclude12)
             {
-                for(j=0;j<ilist[F_GB12].nr;j+=3)
+                for (j = 0; j < ilist[F_GB12].nr; j += 3)
                 {
                     a1 = ilist[F_GB12].iatoms[j+1];
                     a2 = ilist[F_GB12].iatoms[j+2];
-                    
-                    if(a1==i)
+
+                    if (a1 == i)
                     {
                         k = a2;
                     }
-                    else if(a2==i)
+                    else if (a2 == i)
                     {
                         k = a1;
                     }
-                    else 
+                    else
                     {
                         continue;
                     }
-                    
-                    if(k<imin)
+
+                    if (k < imin)
                     {
                         k += natoms;
                     }
-                    
-                    if(k>i+max_offset)
+
+                    if (k > i+max_offset)
                     {
                         continue;
                     }
-                    
+
                     k = k - imin;
-                    
-                    if( k+natoms <= max_offset )
+
+                    if (k+natoms <= max_offset)
                     {
-                        k+=natoms;
+                        k += natoms;
                     }
                     max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
                 }
             }
-            if(!bInclude13)
+            if (!bInclude13)
             {
-                for(j=0;j<ilist[F_GB13].nr;j+=3)
+                for (j = 0; j < ilist[F_GB13].nr; j += 3)
                 {
                     a1 = ilist[F_GB13].iatoms[j+1];
                     a2 = ilist[F_GB13].iatoms[j+2];
-                    
-                    if(a1==i)
+
+                    if (a1 == i)
                     {
                         k = a2;
                     }
-                    else if(a2==i)
+                    else if (a2 == i)
                     {
                         k = a1;
                     }
-                    else 
+                    else
                     {
                         continue;
                     }
-                    
-                    if(k<imin)
+
+                    if (k < imin)
                     {
                         k += natoms;
                     }
-                    
-                    if(k>i+max_offset)
+
+                    if (k > i+max_offset)
                     {
                         continue;
                     }
-                    
+
                     k = k - imin;
-                    
-                    if( k+natoms <= max_offset )
+
+                    if (k+natoms <= max_offset)
                     {
-                        k+=natoms;
+                        k += natoms;
                     }
                     max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
                 }
             }
-            if(!bInclude14)
+            if (!bInclude14)
             {
-                for(j=0;j<ilist[F_GB14].nr;j+=3)
+                for (j = 0; j < ilist[F_GB14].nr; j += 3)
                 {
                     a1 = ilist[F_GB14].iatoms[j+1];
                     a2 = ilist[F_GB14].iatoms[j+2];
 
-                    if(a1==i)
+                    if (a1 == i)
                     {
                         k = a2;
                     }
-                    else if(a2==i)
+                    else if (a2 == i)
                     {
                         k = a1;
                     }
-                    else 
+                    else
                     {
                         continue;
                     }
-                    
-                    if(k<imin)
+
+                    if (k < imin)
                     {
                         k += natoms;
                     }
-                    
-                    if(k>i+max_offset)
+
+                    if (k > i+max_offset)
                     {
                         continue;
                     }
-                    
+
                     k = k - imin;
-                    
-                    if( k+natoms <= max_offset )
+
+                    if (k+natoms <= max_offset)
                     {
-                        k+=natoms;
+                        k += natoms;
                     }
                     max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
                 }
@@ -414,35 +417,35 @@ setup_gb_exclusions_and_indices(gmx_allvsallgb2_data_t *   aadata,
         max_excl_offset++;
         /* round up to j unrolling factor */
         max_excl_offset = (max_excl_offset/UNROLLJ+1)*UNROLLJ;
-        
+
         /* Set all the prologue masks length to this value (even for i>end) */
-        for(i=ibase;i<ibase+UNROLLI;i++)
+        for (i = ibase; i < ibase+UNROLLI; i++)
         {
             aadata->jindex_gb[4*i]   = imin;
             aadata->jindex_gb[4*i+1] = imin+max_excl_offset;
-        }        
+        }
     }
-     
+
     /* Now the hard part, loop over it all again to calculate the actual contents of the prologue masks */
-    for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
-    {      
-        for(i=ibase;i<ibase+UNROLLI;i++)
+    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
+    {
+        for (i = ibase; i < ibase+UNROLLI; i++)
         {
-            nj = aadata->jindex_gb[4*i+1] - aadata->jindex_gb[4*i];
+            nj   = aadata->jindex_gb[4*i+1] - aadata->jindex_gb[4*i];
             imin = aadata->jindex_gb[4*i];
-            
+
             /* Allocate aligned memory */
-            snew(pi,nj+2*SIMD_WIDTH);
+            snew(pi, nj+2*SIMD_WIDTH);
             aadata->prologue_mask_gb[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
-            
-            max_offset = calc_maxoffset(i,natoms);
-            
+
+            max_offset = calc_maxoffset(i, natoms);
+
             /* Include interactions i+1 <= j < i+maxoffset */
-            for(k=0;k<nj;k++)
+            for (k = 0; k < nj; k++)
             {
                 j = imin + k;
-                
-                if( (j>i) && (j<=i+max_offset) )
+
+                if ( (j > i) && (j <= i+max_offset) )
                 {
                     aadata->prologue_mask_gb[i][k] = 0xFFFFFFFF;
                 }
@@ -451,120 +454,120 @@ setup_gb_exclusions_and_indices(gmx_allvsallgb2_data_t *   aadata,
                     aadata->prologue_mask_gb[i][k] = 0;
                 }
             }
-            
+
             /* Clear out the explicit exclusions */
-            if(i<end)
+            if (i < end)
             {
-                if(!bInclude12)
+                if (!bInclude12)
                 {
-                    for(j=0;j<ilist[F_GB12].nr;j+=3)
+                    for (j = 0; j < ilist[F_GB12].nr; j += 3)
                     {
                         a1 = ilist[F_GB12].iatoms[j+1];
                         a2 = ilist[F_GB12].iatoms[j+2];
-                        
-                        if(a1==i)
+
+                        if (a1 == i)
                         {
                             k = a2;
                         }
-                        else if(a2==i)
+                        else if (a2 == i)
                         {
                             k = a1;
                         }
-                        else 
+                        else
                         {
                             continue;
                         }
-                        
-                        if(k>i+max_offset)
+
+                        if (k > i+max_offset)
                         {
                             continue;
                         }
                         k = k-i;
-                        
-                        if( k+natoms <= max_offset )
+
+                        if (k+natoms <= max_offset)
                         {
-                            k+=natoms;
+                            k += natoms;
                         }
-                        
+
                         k = k+i-imin;
-                        if(k>=0)
-                        {                        
+                        if (k >= 0)
+                        {
                             aadata->prologue_mask_gb[i][k] = 0;
                         }
                     }
                 }
-                if(!bInclude13)
+                if (!bInclude13)
                 {
-                    for(j=0;j<ilist[F_GB13].nr;j+=3)
+                    for (j = 0; j < ilist[F_GB13].nr; j += 3)
                     {
                         a1 = ilist[F_GB13].iatoms[j+1];
                         a2 = ilist[F_GB13].iatoms[j+2];
-                        if(a1==i)
+
+                        if (a1 == i)
                         {
                             k = a2;
                         }
-                        else if(a2==i)
+                        else if (a2 == i)
                         {
                             k = a1;
                         }
-                        else 
+                        else
                         {
                             continue;
                         }
-                        
-                        if(k>i+max_offset)
+
+                        if (k > i+max_offset)
                         {
                             continue;
                         }
                         k = k-i;
-                        
-                        if( k+natoms <= max_offset )
+
+                        if (k+natoms <= max_offset)
                         {
-                            k+=natoms;
+                            k += natoms;
                         }
-                        
+
                         k = k+i-imin;
-                        if(k>=0)
-                        {                        
+                        if (k >= 0)
+                        {
                             aadata->prologue_mask_gb[i][k] = 0;
                         }
                     }
                 }
-                if(!bInclude14)
+                if (!bInclude14)
                 {
-                    for(j=0;j<ilist[F_GB14].nr;j+=3)
+                    for (j = 0; j < ilist[F_GB14].nr; j += 3)
                     {
                         a1 = ilist[F_GB14].iatoms[j+1];
                         a2 = ilist[F_GB14].iatoms[j+2];
 
-                        if(a1==i)
+                        if (a1 == i)
                         {
                             k = a2;
                         }
-                        else if(a2==i)
+                        else if (a2 == i)
                         {
                             k = a1;
                         }
-                        else 
+                        else
                         {
                             continue;
                         }
-                        
-                        if(k>i+max_offset)
+
+                        if (k > i+max_offset)
                         {
                             continue;
                         }
                         k = k-i;
-                        
-                        if( k+natoms <= max_offset )
+
+                        if (k+natoms <= max_offset)
                         {
-                            k+=natoms;
+                            k += natoms;
                         }
-                        
+
                         k = k+i-imin;
-                        if(k>=0)
-                        {                        
+                        if (k >= 0)
+                        {
                             aadata->prologue_mask_gb[i][k] = 0;
                         }
                     }
@@ -572,34 +575,34 @@ setup_gb_exclusions_and_indices(gmx_allvsallgb2_data_t *   aadata,
             }
         }
     }
-    
+
     /* Construct the epilogue mask - this just contains the check for maxoffset */
-    snew(aadata->epilogue_mask,natoms+UNROLLI);
-    
+    snew(aadata->epilogue_mask, natoms+UNROLLI);
+
     /* First zero everything to avoid uninitialized data */
-    for(i=0;i<natoms+UNROLLI;i++)
+    for (i = 0; i < natoms+UNROLLI; i++)
     {
         aadata->jindex_gb[4*i+2]    = aadata->jindex_gb[4*i+1];
         aadata->jindex_gb[4*i+3]    = aadata->jindex_gb[4*i+1];
-        aadata->epilogue_mask[i] = NULL;
+        aadata->epilogue_mask[i]    = NULL;
     }
-    
-    for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
-    {      
+
+    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
+    {
         /* Find the lowest index for which we need to use the epilogue */
-        imin = ibase;
-        max_offset = calc_maxoffset(imin,natoms);
+        imin       = ibase;
+        max_offset = calc_maxoffset(imin, natoms);
 
         imin = imin + 1 + max_offset;
-        
+
         /* Find largest index for which we need to use the epilogue */
         imax = ibase + UNROLLI-1;
-        imax = (imax < end) ? imax : end; 
-        
-        max_offset = calc_maxoffset(imax,natoms);
-        imax = imax + 1 + max_offset + UNROLLJ - 1;
-        
-        for(i=ibase;i<ibase+UNROLLI;i++)
+        imax = (imax < end) ? imax : end;
+
+        max_offset = calc_maxoffset(imax, natoms);
+        imax       = imax + 1 + max_offset + UNROLLJ - 1;
+
+        for (i = ibase; i < ibase+UNROLLI; i++)
         {
             /* Start of epilogue - round down to j tile limit */
             aadata->jindex_gb[4*i+2] = (imin/UNROLLJ)*UNROLLJ;
@@ -611,23 +614,23 @@ setup_gb_exclusions_and_indices(gmx_allvsallgb2_data_t *   aadata,
             aadata->jindex_gb[4*i+3] = (aadata->jindex_gb[4*i+2] > aadata->jindex_gb[4*i+3]) ? aadata->jindex_gb[4*i+2] : aadata->jindex_gb[4*i+3];
         }
     }
-    
+
     /* And fill it with data... */
-    
-    for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
+
+    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
     {
-        for(i=ibase;i<ibase+UNROLLI;i++)
+        for (i = ibase; i < ibase+UNROLLI; i++)
         {
-            
+
             nj = aadata->jindex_gb[4*i+3] - aadata->jindex_gb[4*i+2];
-            
+
             /* Allocate aligned memory */
-            snew(pi,nj+2*SIMD_WIDTH);
+            snew(pi, nj+2*SIMD_WIDTH);
             aadata->epilogue_mask[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
-            
-            max_offset = calc_maxoffset(i,natoms);
-            
-            for(k=0;k<nj;k++)
+
+            max_offset = calc_maxoffset(i, natoms);
+
+            for (k = 0; k < nj; k++)
             {
                 j = aadata->jindex_gb[4*i+2] + k;
                 aadata->epilogue_mask[i][k] = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
@@ -638,80 +641,80 @@ setup_gb_exclusions_and_indices(gmx_allvsallgb2_data_t *   aadata,
 
 
 static void
-genborn_allvsall_setup(gmx_allvsallgb2_data_t **  p_aadata,
-                       gmx_localtop_t *           top,
-                       gmx_genborn_t *            born,
-                       t_mdatoms *                mdatoms,
-                       real                       radius_offset,
-                       int                        gb_algorithm,
+genborn_allvsall_setup(gmx_allvsallgb2_data_t     **  p_aadata,
+                       gmx_localtop_t     *           top,
+                       gmx_genborn_t     *            born,
+                       t_mdatoms     *                mdatoms,
+                       real                           radius_offset,
+                       int                            gb_algorithm,
                        gmx_bool                       bInclude12,
                        gmx_bool                       bInclude13,
                        gmx_bool                       bInclude14)
 {
-       int i,j,idx;
-    int natoms;
-       gmx_allvsallgb2_data_t *aadata;
-    real *p;
-    
+    int                     i, j, idx;
+    int                     natoms;
+    gmx_allvsallgb2_data_t *aadata;
+    real                   *p;
+
     natoms = mdatoms->nr;
-    
-       snew(aadata,1);
-       *p_aadata = aadata;
-    
-       snew(p,2*natoms+2*SIMD_WIDTH);
-       aadata->x_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
-       snew(p,2*natoms+2*SIMD_WIDTH);
-       aadata->y_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
-       snew(p,2*natoms+2*SIMD_WIDTH);
-       aadata->z_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
-       snew(p,2*natoms+2*SIMD_WIDTH);
-       aadata->fx_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
-       snew(p,2*natoms+2*SIMD_WIDTH);
-       aadata->fy_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
-       snew(p,2*natoms+2*SIMD_WIDTH);
-       aadata->fz_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
-
-    snew(p,2*natoms+UNROLLJ+SIMD_WIDTH);
+
+    snew(aadata, 1);
+    *p_aadata = aadata;
+
+    snew(p, 2*natoms+2*SIMD_WIDTH);
+    aadata->x_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
+    snew(p, 2*natoms+2*SIMD_WIDTH);
+    aadata->y_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
+    snew(p, 2*natoms+2*SIMD_WIDTH);
+    aadata->z_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
+    snew(p, 2*natoms+2*SIMD_WIDTH);
+    aadata->fx_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
+    snew(p, 2*natoms+2*SIMD_WIDTH);
+    aadata->fy_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
+    snew(p, 2*natoms+2*SIMD_WIDTH);
+    aadata->fz_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
+
+    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
     aadata->gb_radius = (real *) (((size_t) p + 16) & (~((size_t) 15)));
-        
-    snew(p,2*natoms+UNROLLJ+SIMD_WIDTH);
+
+    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
     aadata->workparam = (real *) (((size_t) p + 16) & (~((size_t) 15)));
-    
-    snew(p,2*natoms+UNROLLJ+SIMD_WIDTH);
+
+    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
     aadata->work = (real *) (((size_t) p + 16) & (~((size_t) 15)));
-    
-    for(i=0;i<mdatoms->nr;i++)
+
+    for (i = 0; i < mdatoms->nr; i++)
     {
         aadata->gb_radius[i] = top->atomtypes.gb_radius[mdatoms->typeA[i]] - radius_offset;
-        if(gb_algorithm==egbSTILL)
+        if (gb_algorithm == egbSTILL)
         {
             aadata->workparam[i] = born->vsolv[i];
         }
-        else if(gb_algorithm==egbOBC)
+        else if (gb_algorithm == egbOBC)
         {
             aadata->workparam[i] = born->param[i];
         }
         aadata->work[i]      = 0.0;
     }
-    for(i=0;i<mdatoms->nr;i++)
+    for (i = 0; i < mdatoms->nr; i++)
     {
-        aadata->gb_radius[natoms+i] = aadata->gb_radius[i]; 
+        aadata->gb_radius[natoms+i] = aadata->gb_radius[i];
         aadata->workparam[natoms+i] = aadata->workparam[i];
         aadata->work[natoms+i]      = aadata->work[i];
     }
-    
-    for(i=0;i<2*natoms+SIMD_WIDTH;i++)
-       {
-               aadata->x_align[i] = 0.0;
-               aadata->y_align[i] = 0.0;
-               aadata->z_align[i] = 0.0;
-               aadata->fx_align[i] = 0.0;
-               aadata->fy_align[i] = 0.0;
-               aadata->fz_align[i] = 0.0;
-       }
-    
-    setup_gb_exclusions_and_indices(aadata,top->idef.il,mdatoms->start,mdatoms->start+mdatoms->homenr,mdatoms->nr,
-                                    bInclude12,bInclude13,bInclude14);
+
+    for (i = 0; i < 2*natoms+SIMD_WIDTH; i++)
+    {
+        aadata->x_align[i]  = 0.0;
+        aadata->y_align[i]  = 0.0;
+        aadata->z_align[i]  = 0.0;
+        aadata->fx_align[i] = 0.0;
+        aadata->fy_align[i] = 0.0;
+        aadata->fz_align[i] = 0.0;
+    }
+
+    setup_gb_exclusions_and_indices(aadata, top->idef.il, 0, mdatoms->homenr, mdatoms->nr,
+                                    bInclude12, bInclude13, bInclude14);
 }
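
The repeated snew()-then-round pattern in genborn_allvsall_setup() is the legacy 16-byte alignment idiom: each array is over-allocated by a SIMD-width slack, and the pointer is rounded up to the next 16-byte boundary so the kernels can use aligned SSE loads and stores. A minimal sketch of the idiom with a hypothetical helper; note that, as in the code above, only the aligned alias is kept, so the raw allocation is never freed:

    #include <stdint.h>
    #include <stdlib.h>

    /* Over-allocate by 16 bytes of slack, then round the address up to the
     * next 16-byte boundary, exactly as the expressions
     * (((size_t) p + 16) & (~((size_t) 15))) above do. */
    static float *alloc_aligned16(size_t nelem)
    {
        float *p = calloc(nelem + 4, sizeof(float)); /* +4 floats = 16 B slack */
        return p ? (float *) (((uintptr_t) p + 16) & ~(uintptr_t) 15) : NULL;
    }
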
 
 
@@ -724,107 +727,107 @@ genborn_allvsall_calc_still_radii_sse2_single(t_forcerec *           fr,
                                               t_commrec *            cr,
                                               void *                 paadata)
 {
-       gmx_allvsallgb2_data_t *aadata;
-       int        natoms;
-       int        ni0,ni1;
-       int        nj0,nj1,nj2,nj3;
-       int        i,j,k,n;
-    int *      mask;
-    int *      pmask0;
-    int *      pmask1;
-    int *      pmask2;
-    int *      pmask3;
-    int *      emask0;
-    int *      emask1;
-    int *      emask2;
-    int *      emask3;
-    real       ix,iy,iz;
-    real       jx,jy,jz;
-    real       dx,dy,dz;
-    real       rsq,rinv;
-    real       gpi,rai,vai;
-    real       prod_ai;
-    real       irsq,idr4,idr6;
-    real       raj,rvdw,ratio;
-    real       vaj,ccf,dccf,theta,cosq;
-    real       term,prod,icf4,icf6,gpi2,factor,sinq;
-    real *     gb_radius;
-    real *     vsolv;
-    real *     work;
-    real       tmpsum[4];
-    real *     x_align;
-    real *     y_align;
-    real *     z_align;
-    int *      jindex;
-    real *     dadx;
-    
-    __m128     ix_SSE0,iy_SSE0,iz_SSE0;
-    __m128     ix_SSE1,iy_SSE1,iz_SSE1;
-    __m128     ix_SSE2,iy_SSE2,iz_SSE2;
-    __m128     ix_SSE3,iy_SSE3,iz_SSE3;
-    __m128     gpi_SSE0,rai_SSE0,prod_ai_SSE0;
-    __m128     gpi_SSE1,rai_SSE1,prod_ai_SSE1;
-    __m128     gpi_SSE2,rai_SSE2,prod_ai_SSE2;
-    __m128     gpi_SSE3,rai_SSE3,prod_ai_SSE3;
-    __m128     imask_SSE0,jmask_SSE0;
-    __m128     imask_SSE1,jmask_SSE1;
-    __m128     imask_SSE2,jmask_SSE2;
-    __m128     imask_SSE3,jmask_SSE3;
-    __m128     jx_SSE,jy_SSE,jz_SSE;
-    __m128     dx_SSE0,dy_SSE0,dz_SSE0;
-    __m128     dx_SSE1,dy_SSE1,dz_SSE1;
-    __m128     dx_SSE2,dy_SSE2,dz_SSE2;
-    __m128     dx_SSE3,dy_SSE3,dz_SSE3;
-    __m128     rsq_SSE0,rinv_SSE0,irsq_SSE0,idr4_SSE0,idr6_SSE0;
-    __m128     rsq_SSE1,rinv_SSE1,irsq_SSE1,idr4_SSE1,idr6_SSE1;
-    __m128     rsq_SSE2,rinv_SSE2,irsq_SSE2,idr4_SSE2,idr6_SSE2;
-    __m128     rsq_SSE3,rinv_SSE3,irsq_SSE3,idr4_SSE3,idr6_SSE3;
-    __m128     raj_SSE,vaj_SSE,prod_SSE;
-    __m128     rvdw_SSE0,ratio_SSE0;
-    __m128     rvdw_SSE1,ratio_SSE1;
-    __m128     rvdw_SSE2,ratio_SSE2;
-    __m128     rvdw_SSE3,ratio_SSE3;
-    __m128     theta_SSE0,sinq_SSE0,cosq_SSE0,term_SSE0;
-    __m128     theta_SSE1,sinq_SSE1,cosq_SSE1,term_SSE1;
-    __m128     theta_SSE2,sinq_SSE2,cosq_SSE2,term_SSE2;
-    __m128     theta_SSE3,sinq_SSE3,cosq_SSE3,term_SSE3;
-    __m128     ccf_SSE0,dccf_SSE0;
-    __m128     ccf_SSE1,dccf_SSE1;
-    __m128     ccf_SSE2,dccf_SSE2;
-    __m128     ccf_SSE3,dccf_SSE3;
-    __m128     icf4_SSE0,icf6_SSE0;
-    __m128     icf4_SSE1,icf6_SSE1;
-    __m128     icf4_SSE2,icf6_SSE2;
-    __m128     icf4_SSE3,icf6_SSE3;
-    __m128     half_SSE,one_SSE,two_SSE,four_SSE;
-    __m128     still_p4_SSE,still_p5inv_SSE,still_pip5_SSE;
-    
+    gmx_allvsallgb2_data_t *aadata;
+    int                     natoms;
+    int                     ni0, ni1;
+    int                     nj0, nj1, nj2, nj3;
+    int                     i, j, k, n;
+    int              *      mask;
+    int              *      pmask0;
+    int              *      pmask1;
+    int              *      pmask2;
+    int              *      pmask3;
+    int              *      emask0;
+    int              *      emask1;
+    int              *      emask2;
+    int              *      emask3;
+    real                    ix, iy, iz;
+    real                    jx, jy, jz;
+    real                    dx, dy, dz;
+    real                    rsq, rinv;
+    real                    gpi, rai, vai;
+    real                    prod_ai;
+    real                    irsq, idr4, idr6;
+    real                    raj, rvdw, ratio;
+    real                    vaj, ccf, dccf, theta, cosq;
+    real                    term, prod, icf4, icf6, gpi2, factor, sinq;
+    real              *     gb_radius;
+    real              *     vsolv;
+    real              *     work;
+    real                    tmpsum[4];
+    real              *     x_align;
+    real              *     y_align;
+    real              *     z_align;
+    int              *      jindex;
+    real              *     dadx;
+
+    __m128                  ix_SSE0, iy_SSE0, iz_SSE0;
+    __m128                  ix_SSE1, iy_SSE1, iz_SSE1;
+    __m128                  ix_SSE2, iy_SSE2, iz_SSE2;
+    __m128                  ix_SSE3, iy_SSE3, iz_SSE3;
+    __m128                  gpi_SSE0, rai_SSE0, prod_ai_SSE0;
+    __m128                  gpi_SSE1, rai_SSE1, prod_ai_SSE1;
+    __m128                  gpi_SSE2, rai_SSE2, prod_ai_SSE2;
+    __m128                  gpi_SSE3, rai_SSE3, prod_ai_SSE3;
+    __m128                  imask_SSE0, jmask_SSE0;
+    __m128                  imask_SSE1, jmask_SSE1;
+    __m128                  imask_SSE2, jmask_SSE2;
+    __m128                  imask_SSE3, jmask_SSE3;
+    __m128                  jx_SSE, jy_SSE, jz_SSE;
+    __m128                  dx_SSE0, dy_SSE0, dz_SSE0;
+    __m128                  dx_SSE1, dy_SSE1, dz_SSE1;
+    __m128                  dx_SSE2, dy_SSE2, dz_SSE2;
+    __m128                  dx_SSE3, dy_SSE3, dz_SSE3;
+    __m128                  rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0;
+    __m128                  rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1;
+    __m128                  rsq_SSE2, rinv_SSE2, irsq_SSE2, idr4_SSE2, idr6_SSE2;
+    __m128                  rsq_SSE3, rinv_SSE3, irsq_SSE3, idr4_SSE3, idr6_SSE3;
+    __m128                  raj_SSE, vaj_SSE, prod_SSE;
+    __m128                  rvdw_SSE0, ratio_SSE0;
+    __m128                  rvdw_SSE1, ratio_SSE1;
+    __m128                  rvdw_SSE2, ratio_SSE2;
+    __m128                  rvdw_SSE3, ratio_SSE3;
+    __m128                  theta_SSE0, sinq_SSE0, cosq_SSE0, term_SSE0;
+    __m128                  theta_SSE1, sinq_SSE1, cosq_SSE1, term_SSE1;
+    __m128                  theta_SSE2, sinq_SSE2, cosq_SSE2, term_SSE2;
+    __m128                  theta_SSE3, sinq_SSE3, cosq_SSE3, term_SSE3;
+    __m128                  ccf_SSE0, dccf_SSE0;
+    __m128                  ccf_SSE1, dccf_SSE1;
+    __m128                  ccf_SSE2, dccf_SSE2;
+    __m128                  ccf_SSE3, dccf_SSE3;
+    __m128                  icf4_SSE0, icf6_SSE0;
+    __m128                  icf4_SSE1, icf6_SSE1;
+    __m128                  icf4_SSE2, icf6_SSE2;
+    __m128                  icf4_SSE3, icf6_SSE3;
+    __m128                  half_SSE, one_SSE, two_SSE, four_SSE;
+    __m128                  still_p4_SSE, still_p5inv_SSE, still_pip5_SSE;
+
     natoms              = mdatoms->nr;
-       ni0                 = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
-       ni1                 = mdatoms->start+mdatoms->homenr;
+    ni0                 = 0;
+    ni1                 = mdatoms->homenr;
 
     n = 0;
-    
+
     aadata = *((gmx_allvsallgb2_data_t **)paadata);
 
-    
-       if(aadata==NULL)
-       {
-               genborn_allvsall_setup(&aadata,top,born,mdatoms,0.0,
-                               egbSTILL,FALSE,FALSE,TRUE);
+
+    if (aadata == NULL)
+    {
+        genborn_allvsall_setup(&aadata, top, born, mdatoms, 0.0,
+                               egbSTILL, FALSE, FALSE, TRUE);
         *((gmx_allvsallgb2_data_t **)paadata) = aadata;
-       }
-        
+    }
+
     x_align = aadata->x_align;
-       y_align = aadata->y_align;
-       z_align = aadata->z_align;
+    y_align = aadata->y_align;
+    z_align = aadata->z_align;
 
     gb_radius = aadata->gb_radius;
     vsolv     = aadata->workparam;
     work      = aadata->work;
     jindex    = aadata->jindex_gb;
     dadx      = fr->dadx;
-    
+
     still_p4_SSE    = _mm_set1_ps(STILL_P4);
     still_p5inv_SSE = _mm_set1_ps(STILL_P5INV);
     still_pip5_SSE  = _mm_set1_ps(STILL_PIP5);
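
In the loops that follow, exclusions never branch: the masks prepared during setup are 0xFFFFFFFF/0 integer patterns, loaded as float lanes and ANDed onto 1/r, so an excluded pair propagates an exact 0.0f through every subsequent product. A sketch of the idiom, assuming only the SSE intrinsics already used in this file:

    #include <xmmintrin.h>

    /* mask points at 4 ints that are either all-ones or all-zero; loading
     * them as floats and ANDing zeroes out the excluded lanes of 1/r. */
    static __m128 apply_exclusion_mask(__m128 rinv, const int *mask)
    {
        __m128 m = _mm_load_ps((const float *) mask);
        return _mm_and_ps(rinv, m);
    }
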
@@ -832,41 +835,41 @@ genborn_allvsall_calc_still_radii_sse2_single(t_forcerec *           fr,
     one_SSE         = _mm_set1_ps(1.0);
     two_SSE         = _mm_set1_ps(2.0);
     four_SSE        = _mm_set1_ps(4.0);
-    
+
     /* This will be summed, so it has to extend to natoms + buffer */
-    for(i=0;i<natoms+1+natoms/2;i++)
+    for (i = 0; i < natoms+1+natoms/2; i++)
     {
         work[i] = 0;
     }
-    
-       for(i=ni0;i<ni1+1+natoms/2;i++)
-       {
-        k = i%natoms;
-               x_align[i]  = x[3*k];
-               y_align[i]  = x[3*k+1];
-               z_align[i]  = x[3*k+2];
+
+    for (i = ni0; i < ni1+1+natoms/2; i++)
+    {
+        k           = i%natoms;
+        x_align[i]  = x[3*k];
+        y_align[i]  = x[3*k+1];
+        z_align[i]  = x[3*k+2];
         work[i]     = 0;
-    }        
-    
-    
-       for(i=ni0; i<ni1; i+=UNROLLI)
-       {
-               /* We assume shifts are NOT used for all-vs-all interactions */
-
-               /* Load i atom data */
-               ix_SSE0          = _mm_load1_ps(x_align+i);
-               iy_SSE0          = _mm_load1_ps(y_align+i);
-               iz_SSE0          = _mm_load1_ps(z_align+i);
-               ix_SSE1          = _mm_load1_ps(x_align+i+1);
-               iy_SSE1          = _mm_load1_ps(y_align+i+1);
-               iz_SSE1          = _mm_load1_ps(z_align+i+1);
-               ix_SSE2          = _mm_load1_ps(x_align+i+2);
-               iy_SSE2          = _mm_load1_ps(y_align+i+2);
-               iz_SSE2          = _mm_load1_ps(z_align+i+2);
-               ix_SSE3          = _mm_load1_ps(x_align+i+3);
-               iy_SSE3          = _mm_load1_ps(y_align+i+3);
-               iz_SSE3          = _mm_load1_ps(z_align+i+3);
-        
+    }
+
+
+    for (i = ni0; i < ni1; i += UNROLLI)
+    {
+        /* We assume shifts are NOT used for all-vs-all interactions */
+
+        /* Load i atom data */
+        ix_SSE0          = _mm_load1_ps(x_align+i);
+        iy_SSE0          = _mm_load1_ps(y_align+i);
+        iz_SSE0          = _mm_load1_ps(z_align+i);
+        ix_SSE1          = _mm_load1_ps(x_align+i+1);
+        iy_SSE1          = _mm_load1_ps(y_align+i+1);
+        iz_SSE1          = _mm_load1_ps(z_align+i+1);
+        ix_SSE2          = _mm_load1_ps(x_align+i+2);
+        iy_SSE2          = _mm_load1_ps(y_align+i+2);
+        iz_SSE2          = _mm_load1_ps(z_align+i+2);
+        ix_SSE3          = _mm_load1_ps(x_align+i+3);
+        iy_SSE3          = _mm_load1_ps(y_align+i+3);
+        iz_SSE3          = _mm_load1_ps(z_align+i+3);
+
         gpi_SSE0         = _mm_setzero_ps();
         gpi_SSE1         = _mm_setzero_ps();
         gpi_SSE2         = _mm_setzero_ps();
@@ -876,506 +879,502 @@ genborn_allvsall_calc_still_radii_sse2_single(t_forcerec *           fr,
         rai_SSE1         = _mm_load1_ps(gb_radius+i+1);
         rai_SSE2         = _mm_load1_ps(gb_radius+i+2);
         rai_SSE3         = _mm_load1_ps(gb_radius+i+3);
-        
+
         prod_ai_SSE0     = _mm_set1_ps(STILL_P4*vsolv[i]);
         prod_ai_SSE1     = _mm_set1_ps(STILL_P4*vsolv[i+1]);
         prod_ai_SSE2     = _mm_set1_ps(STILL_P4*vsolv[i+2]);
         prod_ai_SSE3     = _mm_set1_ps(STILL_P4*vsolv[i+3]);
-        
-               /* Load limits for loop over neighbors */
-               nj0              = jindex[4*i];
-               nj1              = jindex[4*i+1];
-               nj2              = jindex[4*i+2];
-               nj3              = jindex[4*i+3];
-                        
+
+        /* Load limits for loop over neighbors */
+        nj0              = jindex[4*i];
+        nj1              = jindex[4*i+1];
+        nj2              = jindex[4*i+2];
+        nj3              = jindex[4*i+3];
+
         pmask0           = aadata->prologue_mask_gb[i];
         pmask1           = aadata->prologue_mask_gb[i+1];
         pmask2           = aadata->prologue_mask_gb[i+2];
         pmask3           = aadata->prologue_mask_gb[i+3];
-        emask0           = aadata->epilogue_mask[i]; 
-        emask1           = aadata->epilogue_mask[i+1]; 
-        emask2           = aadata->epilogue_mask[i+2]; 
-        emask3           = aadata->epilogue_mask[i+3]; 
+        emask0           = aadata->epilogue_mask[i];
+        emask1           = aadata->epilogue_mask[i+1];
+        emask2           = aadata->epilogue_mask[i+2];
+        emask3           = aadata->epilogue_mask[i+3];
 
         imask_SSE0        = _mm_load1_ps((real *)(aadata->imask+i));
         imask_SSE1        = _mm_load1_ps((real *)(aadata->imask+i+1));
         imask_SSE2        = _mm_load1_ps((real *)(aadata->imask+i+2));
         imask_SSE3        = _mm_load1_ps((real *)(aadata->imask+i+3));
-        
+
         /* Prologue part, including exclusion mask */
-        for(j=nj0; j<nj1; j+=UNROLLJ)
-        {          
+        for (j = nj0; j < nj1; j += UNROLLJ)
+        {
             jmask_SSE0 = _mm_load_ps((real *)pmask0);
             jmask_SSE1 = _mm_load_ps((real *)pmask1);
             jmask_SSE2 = _mm_load_ps((real *)pmask2);
             jmask_SSE3 = _mm_load_ps((real *)pmask3);
-            pmask0 += UNROLLJ;
-            pmask1 += UNROLLJ;
-            pmask2 += UNROLLJ;
-            pmask3 += UNROLLJ;
-            
+            pmask0    += UNROLLJ;
+            pmask1    += UNROLLJ;
+            pmask2    += UNROLLJ;
+            pmask3    += UNROLLJ;
+
             /* load j atom coordinates */
             jx_SSE            = _mm_load_ps(x_align+j);
             jy_SSE            = _mm_load_ps(y_align+j);
             jz_SSE            = _mm_load_ps(z_align+j);
-            
+
             /* Calculate distance */
-            dx_SSE0            = _mm_sub_ps(ix_SSE0,jx_SSE);
-            dy_SSE0            = _mm_sub_ps(iy_SSE0,jy_SSE);
-            dz_SSE0            = _mm_sub_ps(iz_SSE0,jz_SSE);
-            dx_SSE1            = _mm_sub_ps(ix_SSE1,jx_SSE);
-            dy_SSE1            = _mm_sub_ps(iy_SSE1,jy_SSE);
-            dz_SSE1            = _mm_sub_ps(iz_SSE1,jz_SSE);
-            dx_SSE2            = _mm_sub_ps(ix_SSE2,jx_SSE);
-            dy_SSE2            = _mm_sub_ps(iy_SSE2,jy_SSE);
-            dz_SSE2            = _mm_sub_ps(iz_SSE2,jz_SSE);
-            dx_SSE3            = _mm_sub_ps(ix_SSE3,jx_SSE);
-            dy_SSE3            = _mm_sub_ps(iy_SSE3,jy_SSE);
-            dz_SSE3            = _mm_sub_ps(iz_SSE3,jz_SSE);
-            
+            dx_SSE0            = _mm_sub_ps(ix_SSE0, jx_SSE);
+            dy_SSE0            = _mm_sub_ps(iy_SSE0, jy_SSE);
+            dz_SSE0            = _mm_sub_ps(iz_SSE0, jz_SSE);
+            dx_SSE1            = _mm_sub_ps(ix_SSE1, jx_SSE);
+            dy_SSE1            = _mm_sub_ps(iy_SSE1, jy_SSE);
+            dz_SSE1            = _mm_sub_ps(iz_SSE1, jz_SSE);
+            dx_SSE2            = _mm_sub_ps(ix_SSE2, jx_SSE);
+            dy_SSE2            = _mm_sub_ps(iy_SSE2, jy_SSE);
+            dz_SSE2            = _mm_sub_ps(iz_SSE2, jz_SSE);
+            dx_SSE3            = _mm_sub_ps(ix_SSE3, jx_SSE);
+            dy_SSE3            = _mm_sub_ps(iy_SSE3, jy_SSE);
+            dz_SSE3            = _mm_sub_ps(iz_SSE3, jz_SSE);
+
             /* rsq = dx*dx+dy*dy+dz*dz */
-            rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
-            rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
-            rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
-            rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);
-            
+            rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
+            rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
+            rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
+            rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
+
             /* Combine masks */
-            jmask_SSE0         = _mm_and_ps(jmask_SSE0,imask_SSE0);
-            jmask_SSE1         = _mm_and_ps(jmask_SSE1,imask_SSE1);
-            jmask_SSE2         = _mm_and_ps(jmask_SSE2,imask_SSE2);
-            jmask_SSE3         = _mm_and_ps(jmask_SSE3,imask_SSE3);
-            
+            jmask_SSE0         = _mm_and_ps(jmask_SSE0, imask_SSE0);
+            jmask_SSE1         = _mm_and_ps(jmask_SSE1, imask_SSE1);
+            jmask_SSE2         = _mm_and_ps(jmask_SSE2, imask_SSE2);
+            jmask_SSE3         = _mm_and_ps(jmask_SSE3, imask_SSE3);
+
             /* Calculate 1/r and 1/r2 */
             rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
             rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
             rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
             rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
-            
+
             /* Apply mask */
-            rinv_SSE0          = _mm_and_ps(rinv_SSE0,jmask_SSE0);
-            rinv_SSE1          = _mm_and_ps(rinv_SSE1,jmask_SSE1);
-            rinv_SSE2          = _mm_and_ps(rinv_SSE2,jmask_SSE2);
-            rinv_SSE3          = _mm_and_ps(rinv_SSE3,jmask_SSE3);
-            
-            irsq_SSE0          = _mm_mul_ps(rinv_SSE0,rinv_SSE0);
-            irsq_SSE1          = _mm_mul_ps(rinv_SSE1,rinv_SSE1);
-            irsq_SSE2          = _mm_mul_ps(rinv_SSE2,rinv_SSE2);
-            irsq_SSE3          = _mm_mul_ps(rinv_SSE3,rinv_SSE3);
-            idr4_SSE0          = _mm_mul_ps(irsq_SSE0,irsq_SSE0);
-            idr4_SSE1          = _mm_mul_ps(irsq_SSE1,irsq_SSE1);
-            idr4_SSE2          = _mm_mul_ps(irsq_SSE2,irsq_SSE2);
-            idr4_SSE3          = _mm_mul_ps(irsq_SSE3,irsq_SSE3);
-            idr6_SSE0          = _mm_mul_ps(idr4_SSE0,irsq_SSE0);
-            idr6_SSE1          = _mm_mul_ps(idr4_SSE1,irsq_SSE1);
-            idr6_SSE2          = _mm_mul_ps(idr4_SSE2,irsq_SSE2);
-            idr6_SSE3          = _mm_mul_ps(idr4_SSE3,irsq_SSE3);
-            
+            rinv_SSE0          = _mm_and_ps(rinv_SSE0, jmask_SSE0);
+            rinv_SSE1          = _mm_and_ps(rinv_SSE1, jmask_SSE1);
+            rinv_SSE2          = _mm_and_ps(rinv_SSE2, jmask_SSE2);
+            rinv_SSE3          = _mm_and_ps(rinv_SSE3, jmask_SSE3);
+
+            irsq_SSE0          = _mm_mul_ps(rinv_SSE0, rinv_SSE0);
+            irsq_SSE1          = _mm_mul_ps(rinv_SSE1, rinv_SSE1);
+            irsq_SSE2          = _mm_mul_ps(rinv_SSE2, rinv_SSE2);
+            irsq_SSE3          = _mm_mul_ps(rinv_SSE3, rinv_SSE3);
+            idr4_SSE0          = _mm_mul_ps(irsq_SSE0, irsq_SSE0);
+            idr4_SSE1          = _mm_mul_ps(irsq_SSE1, irsq_SSE1);
+            idr4_SSE2          = _mm_mul_ps(irsq_SSE2, irsq_SSE2);
+            idr4_SSE3          = _mm_mul_ps(irsq_SSE3, irsq_SSE3);
+            idr6_SSE0          = _mm_mul_ps(idr4_SSE0, irsq_SSE0);
+            idr6_SSE1          = _mm_mul_ps(idr4_SSE1, irsq_SSE1);
+            idr6_SSE2          = _mm_mul_ps(idr4_SSE2, irsq_SSE2);
+            idr6_SSE3          = _mm_mul_ps(idr4_SSE3, irsq_SSE3);
+
             raj_SSE            = _mm_load_ps(gb_radius+j);
             vaj_SSE            = _mm_load_ps(vsolv+j);
-            
-            rvdw_SSE0          = _mm_add_ps(rai_SSE0,raj_SSE);
-            rvdw_SSE1          = _mm_add_ps(rai_SSE1,raj_SSE);
-            rvdw_SSE2          = _mm_add_ps(rai_SSE2,raj_SSE);
-            rvdw_SSE3          = _mm_add_ps(rai_SSE3,raj_SSE);
-            
-            ratio_SSE0         = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0,rvdw_SSE0)));
-            ratio_SSE1         = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1,rvdw_SSE1)));
-            ratio_SSE2         = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2,rvdw_SSE2)));
-            ratio_SSE3         = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3,rvdw_SSE3)));
-
-            ratio_SSE0         = _mm_min_ps(ratio_SSE0,still_p5inv_SSE);
-            ratio_SSE1         = _mm_min_ps(ratio_SSE1,still_p5inv_SSE);
-            ratio_SSE2         = _mm_min_ps(ratio_SSE2,still_p5inv_SSE);
-            ratio_SSE3         = _mm_min_ps(ratio_SSE3,still_p5inv_SSE);
-            theta_SSE0         = _mm_mul_ps(ratio_SSE0,still_pip5_SSE);
-            theta_SSE1         = _mm_mul_ps(ratio_SSE1,still_pip5_SSE);
-            theta_SSE2         = _mm_mul_ps(ratio_SSE2,still_pip5_SSE);
-            theta_SSE3         = _mm_mul_ps(ratio_SSE3,still_pip5_SSE);
-            gmx_mm_sincos_ps(theta_SSE0,&sinq_SSE0,&cosq_SSE0);            
-            gmx_mm_sincos_ps(theta_SSE1,&sinq_SSE1,&cosq_SSE1);            
-            gmx_mm_sincos_ps(theta_SSE2,&sinq_SSE2,&cosq_SSE2);            
-            gmx_mm_sincos_ps(theta_SSE3,&sinq_SSE3,&cosq_SSE3);            
-            term_SSE0          = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE0));
-            term_SSE1          = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE1));
-            term_SSE2          = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE2));
-            term_SSE3          = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE3));
-            ccf_SSE0           = _mm_mul_ps(term_SSE0,term_SSE0);
-            ccf_SSE1           = _mm_mul_ps(term_SSE1,term_SSE1);
-            ccf_SSE2           = _mm_mul_ps(term_SSE2,term_SSE2);
-            ccf_SSE3           = _mm_mul_ps(term_SSE3,term_SSE3);
-            dccf_SSE0          = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE0),
-                                            _mm_mul_ps(sinq_SSE0,theta_SSE0));
-            dccf_SSE1          = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE1),
-                                            _mm_mul_ps(sinq_SSE1,theta_SSE1));
-            dccf_SSE2          = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE2),
-                                            _mm_mul_ps(sinq_SSE2,theta_SSE2));
-            dccf_SSE3          = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE3),
-                                            _mm_mul_ps(sinq_SSE3,theta_SSE3));
-
-            prod_SSE           = _mm_mul_ps(still_p4_SSE,vaj_SSE);
-            icf4_SSE0          = _mm_mul_ps(ccf_SSE0,idr4_SSE0);
-            icf4_SSE1          = _mm_mul_ps(ccf_SSE1,idr4_SSE1);
-            icf4_SSE2          = _mm_mul_ps(ccf_SSE2,idr4_SSE2);
-            icf4_SSE3          = _mm_mul_ps(ccf_SSE3,idr4_SSE3);
-            icf6_SSE0          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE0),dccf_SSE0), idr6_SSE0);
-            icf6_SSE1          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE1),dccf_SSE1), idr6_SSE1);
-            icf6_SSE2          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE2),dccf_SSE2), idr6_SSE2);
-            icf6_SSE3          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE3),dccf_SSE3), idr6_SSE3);
-
-            _mm_store_ps(work+j , _mm_add_ps(_mm_load_ps(work+j),
-                                             gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0,icf4_SSE0),
-                                                         _mm_mul_ps(prod_ai_SSE1,icf4_SSE1),
-                                                         _mm_mul_ps(prod_ai_SSE2,icf4_SSE2),
-                                                         _mm_mul_ps(prod_ai_SSE3,icf4_SSE3))));
-            
-            gpi_SSE0           = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE,icf4_SSE0));
-            gpi_SSE1           = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE,icf4_SSE1));
-            gpi_SSE2           = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE,icf4_SSE2));
-            gpi_SSE3           = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE,icf4_SSE3));
+
+            rvdw_SSE0          = _mm_add_ps(rai_SSE0, raj_SSE);
+            rvdw_SSE1          = _mm_add_ps(rai_SSE1, raj_SSE);
+            rvdw_SSE2          = _mm_add_ps(rai_SSE2, raj_SSE);
+            rvdw_SSE3          = _mm_add_ps(rai_SSE3, raj_SSE);
+
+            ratio_SSE0         = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0, rvdw_SSE0)));
+            ratio_SSE1         = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1, rvdw_SSE1)));
+            ratio_SSE2         = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2, rvdw_SSE2)));
+            ratio_SSE3         = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3, rvdw_SSE3)));
+
+            ratio_SSE0         = _mm_min_ps(ratio_SSE0, still_p5inv_SSE);
+            ratio_SSE1         = _mm_min_ps(ratio_SSE1, still_p5inv_SSE);
+            ratio_SSE2         = _mm_min_ps(ratio_SSE2, still_p5inv_SSE);
+            ratio_SSE3         = _mm_min_ps(ratio_SSE3, still_p5inv_SSE);
+            theta_SSE0         = _mm_mul_ps(ratio_SSE0, still_pip5_SSE);
+            theta_SSE1         = _mm_mul_ps(ratio_SSE1, still_pip5_SSE);
+            theta_SSE2         = _mm_mul_ps(ratio_SSE2, still_pip5_SSE);
+            theta_SSE3         = _mm_mul_ps(ratio_SSE3, still_pip5_SSE);
+            gmx_mm_sincos_ps(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
+            gmx_mm_sincos_ps(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
+            gmx_mm_sincos_ps(theta_SSE2, &sinq_SSE2, &cosq_SSE2);
+            gmx_mm_sincos_ps(theta_SSE3, &sinq_SSE3, &cosq_SSE3);
+            term_SSE0          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE0));
+            term_SSE1          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE1));
+            term_SSE2          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE2));
+            term_SSE3          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE3));
+            ccf_SSE0           = _mm_mul_ps(term_SSE0, term_SSE0);
+            ccf_SSE1           = _mm_mul_ps(term_SSE1, term_SSE1);
+            ccf_SSE2           = _mm_mul_ps(term_SSE2, term_SSE2);
+            ccf_SSE3           = _mm_mul_ps(term_SSE3, term_SSE3);
+            dccf_SSE0          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE0),
+                                            _mm_mul_ps(sinq_SSE0, theta_SSE0));
+            dccf_SSE1          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE1),
+                                            _mm_mul_ps(sinq_SSE1, theta_SSE1));
+            dccf_SSE2          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE2),
+                                            _mm_mul_ps(sinq_SSE2, theta_SSE2));
+            dccf_SSE3          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE3),
+                                            _mm_mul_ps(sinq_SSE3, theta_SSE3));
+
+            prod_SSE           = _mm_mul_ps(still_p4_SSE, vaj_SSE);
+            icf4_SSE0          = _mm_mul_ps(ccf_SSE0, idr4_SSE0);
+            icf4_SSE1          = _mm_mul_ps(ccf_SSE1, idr4_SSE1);
+            icf4_SSE2          = _mm_mul_ps(ccf_SSE2, idr4_SSE2);
+            icf4_SSE3          = _mm_mul_ps(ccf_SSE3, idr4_SSE3);
+            icf6_SSE0          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
+            icf6_SSE1          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);
+            icf6_SSE2          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE2), dccf_SSE2), idr6_SSE2);
+            icf6_SSE3          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE3), dccf_SSE3), idr6_SSE3);
+
+            _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
+                                            gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0, icf4_SSE0),
+                                                           _mm_mul_ps(prod_ai_SSE1, icf4_SSE1),
+                                                           _mm_mul_ps(prod_ai_SSE2, icf4_SSE2),
+                                                           _mm_mul_ps(prod_ai_SSE3, icf4_SSE3))));
+
+            gpi_SSE0           = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE, icf4_SSE0));
+            gpi_SSE1           = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE, icf4_SSE1));
+            gpi_SSE2           = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE, icf4_SSE2));
+            gpi_SSE3           = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE, icf4_SSE3));
 
             /* Save ai->aj and aj->ai chain rule terms */
-            _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE0));
-            dadx+=4;
-            _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE1));
-            dadx+=4;
-            _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE2));
-            dadx+=4;
-            _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE3));
-            dadx+=4;            
-            
-            _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE0,icf6_SSE0));
-            dadx+=4;
-            _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE1,icf6_SSE1));
-            dadx+=4;
-            _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE2,icf6_SSE2));
-            dadx+=4;
-            _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE3,icf6_SSE3));
-            dadx+=4;
+            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE0));
+            dadx += 4;
+            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE1));
+            dadx += 4;
+            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE2));
+            dadx += 4;
+            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE3));
+            dadx += 4;
+
+            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE0, icf6_SSE0));
+            dadx += 4;
+            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE1, icf6_SSE1));
+            dadx += 4;
+            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE2, icf6_SSE2));
+            dadx += 4;
+            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE3, icf6_SSE3));
+            dadx += 4;
         }
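
The prologue loop above evaluates, four j atoms at a time for four unrolled i atoms, the pairwise descreening term of Still's first-shell GB approximation. Below is a scalar sketch of one pair for clarity; it is a reconstruction, not GROMACS code. Here p4, p5inv and pip5 stand for the broadcast constants still_p4_SSE, still_p5inv_SSE and still_pip5_SSE, vai/vaj for the solvation volumes loaded from vsolv, and prod_ai_SSEk (set before this excerpt) is assumed to hold p4*vai for i atom k.

    #include <math.h>

    /* Scalar sketch of one pair of Still's descreening sum. */
    static void still_pair(float rsq, float rai, float raj,
                           float vai, float vaj,
                           float p4, float p5inv, float pip5,
                           float *gpi, float *gpj,
                           float *dadx_ij, float *dadx_ji)
    {
        float rvdw  = rai + raj;
        float ratio = rsq/(rvdw*rvdw);
        float irsq  = 1.0f/rsq;
        float idr4  = irsq*irsq;
        float idr6  = idr4*irsq;
        float theta, term, ccf, dccf;

        if (ratio > p5inv)
        {
            ratio = p5inv;                    /* clamp, as _mm_min_ps above */
        }
        theta = ratio*pip5;
        term  = 0.5f*(1.0f - cosf(theta));
        ccf   = term*term;                    /* close-contact factor       */
        dccf  = 2.0f*term*sinf(theta)*theta;  /* its derivative piece       */

        *gpi     += p4*vaj*ccf*idr4;          /* accumulated in gpi_SSEk    */
        *gpj     += p4*vai*ccf*idr4;          /* accumulated into work[j]   */
        *dadx_ij  = p4*vaj*(4.0f*ccf - dccf)*idr6;  /* chain-rule terms,    */
        *dadx_ji  = p4*vai*(4.0f*ccf - dccf)*idr6;  /* stored in dadx       */
    }
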
-                                 
+
         /* Main part, no exclusions */
-        for(j=nj1; j<nj2; j+=UNROLLJ)
-        {               
+        for (j = nj1; j < nj2; j += UNROLLJ)
+        {
             /* load j atom coordinates */
             jx_SSE            = _mm_load_ps(x_align+j);
             jy_SSE            = _mm_load_ps(y_align+j);
             jz_SSE            = _mm_load_ps(z_align+j);
-            
+
             /* Calculate distance */
-            dx_SSE0            = _mm_sub_ps(ix_SSE0,jx_SSE);
-            dy_SSE0            = _mm_sub_ps(iy_SSE0,jy_SSE);
-            dz_SSE0            = _mm_sub_ps(iz_SSE0,jz_SSE);
-            dx_SSE1            = _mm_sub_ps(ix_SSE1,jx_SSE);
-            dy_SSE1            = _mm_sub_ps(iy_SSE1,jy_SSE);
-            dz_SSE1            = _mm_sub_ps(iz_SSE1,jz_SSE);
-            dx_SSE2            = _mm_sub_ps(ix_SSE2,jx_SSE);
-            dy_SSE2            = _mm_sub_ps(iy_SSE2,jy_SSE);
-            dz_SSE2            = _mm_sub_ps(iz_SSE2,jz_SSE);
-            dx_SSE3            = _mm_sub_ps(ix_SSE3,jx_SSE);
-            dy_SSE3            = _mm_sub_ps(iy_SSE3,jy_SSE);
-            dz_SSE3            = _mm_sub_ps(iz_SSE3,jz_SSE);
-            
+            dx_SSE0            = _mm_sub_ps(ix_SSE0, jx_SSE);
+            dy_SSE0            = _mm_sub_ps(iy_SSE0, jy_SSE);
+            dz_SSE0            = _mm_sub_ps(iz_SSE0, jz_SSE);
+            dx_SSE1            = _mm_sub_ps(ix_SSE1, jx_SSE);
+            dy_SSE1            = _mm_sub_ps(iy_SSE1, jy_SSE);
+            dz_SSE1            = _mm_sub_ps(iz_SSE1, jz_SSE);
+            dx_SSE2            = _mm_sub_ps(ix_SSE2, jx_SSE);
+            dy_SSE2            = _mm_sub_ps(iy_SSE2, jy_SSE);
+            dz_SSE2            = _mm_sub_ps(iz_SSE2, jz_SSE);
+            dx_SSE3            = _mm_sub_ps(ix_SSE3, jx_SSE);
+            dy_SSE3            = _mm_sub_ps(iy_SSE3, jy_SSE);
+            dz_SSE3            = _mm_sub_ps(iz_SSE3, jz_SSE);
+
             /* rsq = dx*dx+dy*dy+dz*dz */
-            rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
-            rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
-            rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
-            rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);
-            
+            rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
+            rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
+            rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
+            rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
+
             /* Calculate 1/r and 1/r2 */
             rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
             rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
             rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
             rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
-            
+
             /* Apply mask */
-            rinv_SSE0          = _mm_and_ps(rinv_SSE0,imask_SSE0);
-            rinv_SSE1          = _mm_and_ps(rinv_SSE1,imask_SSE1);
-            rinv_SSE2          = _mm_and_ps(rinv_SSE2,imask_SSE2);
-            rinv_SSE3          = _mm_and_ps(rinv_SSE3,imask_SSE3);
-            
-            irsq_SSE0          = _mm_mul_ps(rinv_SSE0,rinv_SSE0);
-            irsq_SSE1          = _mm_mul_ps(rinv_SSE1,rinv_SSE1);
-            irsq_SSE2          = _mm_mul_ps(rinv_SSE2,rinv_SSE2);
-            irsq_SSE3          = _mm_mul_ps(rinv_SSE3,rinv_SSE3);
-            idr4_SSE0          = _mm_mul_ps(irsq_SSE0,irsq_SSE0);
-            idr4_SSE1          = _mm_mul_ps(irsq_SSE1,irsq_SSE1);
-            idr4_SSE2          = _mm_mul_ps(irsq_SSE2,irsq_SSE2);
-            idr4_SSE3          = _mm_mul_ps(irsq_SSE3,irsq_SSE3);
-            idr6_SSE0          = _mm_mul_ps(idr4_SSE0,irsq_SSE0);
-            idr6_SSE1          = _mm_mul_ps(idr4_SSE1,irsq_SSE1);
-            idr6_SSE2          = _mm_mul_ps(idr4_SSE2,irsq_SSE2);
-            idr6_SSE3          = _mm_mul_ps(idr4_SSE3,irsq_SSE3);
-            
+            rinv_SSE0          = _mm_and_ps(rinv_SSE0, imask_SSE0);
+            rinv_SSE1          = _mm_and_ps(rinv_SSE1, imask_SSE1);
+            rinv_SSE2          = _mm_and_ps(rinv_SSE2, imask_SSE2);
+            rinv_SSE3          = _mm_and_ps(rinv_SSE3, imask_SSE3);
+
+            irsq_SSE0          = _mm_mul_ps(rinv_SSE0, rinv_SSE0);
+            irsq_SSE1          = _mm_mul_ps(rinv_SSE1, rinv_SSE1);
+            irsq_SSE2          = _mm_mul_ps(rinv_SSE2, rinv_SSE2);
+            irsq_SSE3          = _mm_mul_ps(rinv_SSE3, rinv_SSE3);
+            idr4_SSE0          = _mm_mul_ps(irsq_SSE0, irsq_SSE0);
+            idr4_SSE1          = _mm_mul_ps(irsq_SSE1, irsq_SSE1);
+            idr4_SSE2          = _mm_mul_ps(irsq_SSE2, irsq_SSE2);
+            idr4_SSE3          = _mm_mul_ps(irsq_SSE3, irsq_SSE3);
+            idr6_SSE0          = _mm_mul_ps(idr4_SSE0, irsq_SSE0);
+            idr6_SSE1          = _mm_mul_ps(idr4_SSE1, irsq_SSE1);
+            idr6_SSE2          = _mm_mul_ps(idr4_SSE2, irsq_SSE2);
+            idr6_SSE3          = _mm_mul_ps(idr4_SSE3, irsq_SSE3);
+
             raj_SSE            = _mm_load_ps(gb_radius+j);
-            
-            rvdw_SSE0          = _mm_add_ps(rai_SSE0,raj_SSE);
-            rvdw_SSE1          = _mm_add_ps(rai_SSE1,raj_SSE);
-            rvdw_SSE2          = _mm_add_ps(rai_SSE2,raj_SSE);
-            rvdw_SSE3          = _mm_add_ps(rai_SSE3,raj_SSE);
+
+            rvdw_SSE0          = _mm_add_ps(rai_SSE0, raj_SSE);
+            rvdw_SSE1          = _mm_add_ps(rai_SSE1, raj_SSE);
+            rvdw_SSE2          = _mm_add_ps(rai_SSE2, raj_SSE);
+            rvdw_SSE3          = _mm_add_ps(rai_SSE3, raj_SSE);
             vaj_SSE            = _mm_load_ps(vsolv+j);
-            
-            ratio_SSE0         = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0,rvdw_SSE0)));
-            ratio_SSE1         = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1,rvdw_SSE1)));
-            ratio_SSE2         = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2,rvdw_SSE2)));
-            ratio_SSE3         = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3,rvdw_SSE3)));
-            
-            ratio_SSE0         = _mm_min_ps(ratio_SSE0,still_p5inv_SSE);
-            ratio_SSE1         = _mm_min_ps(ratio_SSE1,still_p5inv_SSE);
-            ratio_SSE2         = _mm_min_ps(ratio_SSE2,still_p5inv_SSE);
-            ratio_SSE3         = _mm_min_ps(ratio_SSE3,still_p5inv_SSE);
-            theta_SSE0         = _mm_mul_ps(ratio_SSE0,still_pip5_SSE);
-            theta_SSE1         = _mm_mul_ps(ratio_SSE1,still_pip5_SSE);
-            theta_SSE2         = _mm_mul_ps(ratio_SSE2,still_pip5_SSE);
-            theta_SSE3         = _mm_mul_ps(ratio_SSE3,still_pip5_SSE);
-            gmx_mm_sincos_ps(theta_SSE0,&sinq_SSE0,&cosq_SSE0);            
-            gmx_mm_sincos_ps(theta_SSE1,&sinq_SSE1,&cosq_SSE1);            
-            gmx_mm_sincos_ps(theta_SSE2,&sinq_SSE2,&cosq_SSE2);            
-            gmx_mm_sincos_ps(theta_SSE3,&sinq_SSE3,&cosq_SSE3);            
-            term_SSE0          = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE0));
-            term_SSE1          = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE1));
-            term_SSE2          = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE2));
-            term_SSE3          = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE3));
-            ccf_SSE0           = _mm_mul_ps(term_SSE0,term_SSE0);
-            ccf_SSE1           = _mm_mul_ps(term_SSE1,term_SSE1);
-            ccf_SSE2           = _mm_mul_ps(term_SSE2,term_SSE2);
-            ccf_SSE3           = _mm_mul_ps(term_SSE3,term_SSE3);
-            dccf_SSE0          = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE0),
-                                            _mm_mul_ps(sinq_SSE0,theta_SSE0));
-            dccf_SSE1          = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE1),
-                                            _mm_mul_ps(sinq_SSE1,theta_SSE1));
-            dccf_SSE2          = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE2),
-                                            _mm_mul_ps(sinq_SSE2,theta_SSE2));
-            dccf_SSE3          = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE3),
-                                            _mm_mul_ps(sinq_SSE3,theta_SSE3));
-            
-            prod_SSE           = _mm_mul_ps(still_p4_SSE,vaj_SSE );
-            icf4_SSE0          = _mm_mul_ps(ccf_SSE0,idr4_SSE0);
-            icf4_SSE1          = _mm_mul_ps(ccf_SSE1,idr4_SSE1);
-            icf4_SSE2          = _mm_mul_ps(ccf_SSE2,idr4_SSE2);
-            icf4_SSE3          = _mm_mul_ps(ccf_SSE3,idr4_SSE3);
-            icf6_SSE0          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE0),dccf_SSE0), idr6_SSE0);
-            icf6_SSE1          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE1),dccf_SSE1), idr6_SSE1);
-            icf6_SSE2          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE2),dccf_SSE2), idr6_SSE2);
-            icf6_SSE3          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE3),dccf_SSE3), idr6_SSE3);
-            
-            _mm_store_ps(work+j , _mm_add_ps(_mm_load_ps(work+j),
-                                             gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0,icf4_SSE0),
-                                                         _mm_mul_ps(prod_ai_SSE1,icf4_SSE1),
-                                                         _mm_mul_ps(prod_ai_SSE2,icf4_SSE2),
-                                                         _mm_mul_ps(prod_ai_SSE3,icf4_SSE3))));
-            
-            gpi_SSE0           = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE,icf4_SSE0));
-            gpi_SSE1           = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE,icf4_SSE1));
-            gpi_SSE2           = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE,icf4_SSE2));
-            gpi_SSE3           = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE,icf4_SSE3));
-            
+
+            ratio_SSE0         = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0, rvdw_SSE0)));
+            ratio_SSE1         = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1, rvdw_SSE1)));
+            ratio_SSE2         = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2, rvdw_SSE2)));
+            ratio_SSE3         = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3, rvdw_SSE3)));
+
+            ratio_SSE0         = _mm_min_ps(ratio_SSE0, still_p5inv_SSE);
+            ratio_SSE1         = _mm_min_ps(ratio_SSE1, still_p5inv_SSE);
+            ratio_SSE2         = _mm_min_ps(ratio_SSE2, still_p5inv_SSE);
+            ratio_SSE3         = _mm_min_ps(ratio_SSE3, still_p5inv_SSE);
+            theta_SSE0         = _mm_mul_ps(ratio_SSE0, still_pip5_SSE);
+            theta_SSE1         = _mm_mul_ps(ratio_SSE1, still_pip5_SSE);
+            theta_SSE2         = _mm_mul_ps(ratio_SSE2, still_pip5_SSE);
+            theta_SSE3         = _mm_mul_ps(ratio_SSE3, still_pip5_SSE);
+            gmx_mm_sincos_ps(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
+            gmx_mm_sincos_ps(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
+            gmx_mm_sincos_ps(theta_SSE2, &sinq_SSE2, &cosq_SSE2);
+            gmx_mm_sincos_ps(theta_SSE3, &sinq_SSE3, &cosq_SSE3);
+            term_SSE0          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE0));
+            term_SSE1          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE1));
+            term_SSE2          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE2));
+            term_SSE3          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE3));
+            ccf_SSE0           = _mm_mul_ps(term_SSE0, term_SSE0);
+            ccf_SSE1           = _mm_mul_ps(term_SSE1, term_SSE1);
+            ccf_SSE2           = _mm_mul_ps(term_SSE2, term_SSE2);
+            ccf_SSE3           = _mm_mul_ps(term_SSE3, term_SSE3);
+            dccf_SSE0          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE0),
+                                            _mm_mul_ps(sinq_SSE0, theta_SSE0));
+            dccf_SSE1          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE1),
+                                            _mm_mul_ps(sinq_SSE1, theta_SSE1));
+            dccf_SSE2          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE2),
+                                            _mm_mul_ps(sinq_SSE2, theta_SSE2));
+            dccf_SSE3          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE3),
+                                            _mm_mul_ps(sinq_SSE3, theta_SSE3));
+
+            prod_SSE           = _mm_mul_ps(still_p4_SSE, vaj_SSE);
+            icf4_SSE0          = _mm_mul_ps(ccf_SSE0, idr4_SSE0);
+            icf4_SSE1          = _mm_mul_ps(ccf_SSE1, idr4_SSE1);
+            icf4_SSE2          = _mm_mul_ps(ccf_SSE2, idr4_SSE2);
+            icf4_SSE3          = _mm_mul_ps(ccf_SSE3, idr4_SSE3);
+            icf6_SSE0          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
+            icf6_SSE1          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);
+            icf6_SSE2          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE2), dccf_SSE2), idr6_SSE2);
+            icf6_SSE3          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE3), dccf_SSE3), idr6_SSE3);
+
+            _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
+                                            gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0, icf4_SSE0),
+                                                           _mm_mul_ps(prod_ai_SSE1, icf4_SSE1),
+                                                           _mm_mul_ps(prod_ai_SSE2, icf4_SSE2),
+                                                           _mm_mul_ps(prod_ai_SSE3, icf4_SSE3))));
+
+            gpi_SSE0           = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE, icf4_SSE0));
+            gpi_SSE1           = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE, icf4_SSE1));
+            gpi_SSE2           = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE, icf4_SSE2));
+            gpi_SSE3           = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE, icf4_SSE3));
+
             /* Save ai->aj and aj->ai chain rule terms */
-            _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE0));
-            dadx+=4;
-            _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE1));
-            dadx+=4;
-            _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE2));
-            dadx+=4;
-            _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE3));
-            dadx+=4;            
-            
-            _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE0,icf6_SSE0));
-            dadx+=4;
-            _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE1,icf6_SSE1));
-            dadx+=4;
-            _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE2,icf6_SSE2));
-            dadx+=4;
-            _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE3,icf6_SSE3));
-            dadx+=4;
+            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE0));
+            dadx += 4;
+            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE1));
+            dadx += 4;
+            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE2));
+            dadx += 4;
+            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE3));
+            dadx += 4;
+
+            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE0, icf6_SSE0));
+            dadx += 4;
+            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE1, icf6_SSE1));
+            dadx += 4;
+            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE2, icf6_SSE2));
+            dadx += 4;
+            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE3, icf6_SSE3));
+            dadx += 4;
         }
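
The work[j] update in these loops sums the four i-lane contributions with gmx_mm_sum4_ps before a single aligned store. Assuming that helper is a plain element-wise four-way add (a natural reading of its name and usage; it is not defined in this file), it amounts to:

    #include <xmmintrin.h>

    /* Presumed shape of gmx_mm_sum4_ps: element-wise sum of four
     * __m128 registers, one per unrolled i atom. */
    static inline __m128 sum4_ps(__m128 a, __m128 b, __m128 c, __m128 d)
    {
        return _mm_add_ps(_mm_add_ps(a, b), _mm_add_ps(c, d));
    }
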
         /* Epilogue part, including exclusion mask */
-        for(j=nj2; j<nj3; j+=UNROLLJ)
-        {                     
+        for (j = nj2; j < nj3; j += UNROLLJ)
+        {
             jmask_SSE0 = _mm_load_ps((real *)emask0);
             jmask_SSE1 = _mm_load_ps((real *)emask1);
             jmask_SSE2 = _mm_load_ps((real *)emask2);
             jmask_SSE3 = _mm_load_ps((real *)emask3);
-            emask0 += UNROLLJ;
-            emask1 += UNROLLJ;
-            emask2 += UNROLLJ;
-            emask3 += UNROLLJ;
-            
+            emask0    += UNROLLJ;
+            emask1    += UNROLLJ;
+            emask2    += UNROLLJ;
+            emask3    += UNROLLJ;
+
             /* load j atom coordinates */
             jx_SSE            = _mm_load_ps(x_align+j);
             jy_SSE            = _mm_load_ps(y_align+j);
             jz_SSE            = _mm_load_ps(z_align+j);
-            
+
             /* Calculate distance */
-            dx_SSE0            = _mm_sub_ps(ix_SSE0,jx_SSE);
-            dy_SSE0            = _mm_sub_ps(iy_SSE0,jy_SSE);
-            dz_SSE0            = _mm_sub_ps(iz_SSE0,jz_SSE);
-            dx_SSE1            = _mm_sub_ps(ix_SSE1,jx_SSE);
-            dy_SSE1            = _mm_sub_ps(iy_SSE1,jy_SSE);
-            dz_SSE1            = _mm_sub_ps(iz_SSE1,jz_SSE);
-            dx_SSE2            = _mm_sub_ps(ix_SSE2,jx_SSE);
-            dy_SSE2            = _mm_sub_ps(iy_SSE2,jy_SSE);
-            dz_SSE2            = _mm_sub_ps(iz_SSE2,jz_SSE);
-            dx_SSE3            = _mm_sub_ps(ix_SSE3,jx_SSE);
-            dy_SSE3            = _mm_sub_ps(iy_SSE3,jy_SSE);
-            dz_SSE3            = _mm_sub_ps(iz_SSE3,jz_SSE);
-            
+            dx_SSE0            = _mm_sub_ps(ix_SSE0, jx_SSE);
+            dy_SSE0            = _mm_sub_ps(iy_SSE0, jy_SSE);
+            dz_SSE0            = _mm_sub_ps(iz_SSE0, jz_SSE);
+            dx_SSE1            = _mm_sub_ps(ix_SSE1, jx_SSE);
+            dy_SSE1            = _mm_sub_ps(iy_SSE1, jy_SSE);
+            dz_SSE1            = _mm_sub_ps(iz_SSE1, jz_SSE);
+            dx_SSE2            = _mm_sub_ps(ix_SSE2, jx_SSE);
+            dy_SSE2            = _mm_sub_ps(iy_SSE2, jy_SSE);
+            dz_SSE2            = _mm_sub_ps(iz_SSE2, jz_SSE);
+            dx_SSE3            = _mm_sub_ps(ix_SSE3, jx_SSE);
+            dy_SSE3            = _mm_sub_ps(iy_SSE3, jy_SSE);
+            dz_SSE3            = _mm_sub_ps(iz_SSE3, jz_SSE);
+
             /* rsq = dx*dx+dy*dy+dz*dz */
-            rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
-            rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
-            rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
-            rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);
-            
+            rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
+            rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
+            rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
+            rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
+
             /* Combine masks */
-            jmask_SSE0         = _mm_and_ps(jmask_SSE0,imask_SSE0);
-            jmask_SSE1         = _mm_and_ps(jmask_SSE1,imask_SSE1);
-            jmask_SSE2         = _mm_and_ps(jmask_SSE2,imask_SSE2);
-            jmask_SSE3         = _mm_and_ps(jmask_SSE3,imask_SSE3);
-            
+            jmask_SSE0         = _mm_and_ps(jmask_SSE0, imask_SSE0);
+            jmask_SSE1         = _mm_and_ps(jmask_SSE1, imask_SSE1);
+            jmask_SSE2         = _mm_and_ps(jmask_SSE2, imask_SSE2);
+            jmask_SSE3         = _mm_and_ps(jmask_SSE3, imask_SSE3);
+
             /* Calculate 1/r and 1/r2 */
             rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
             rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
             rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
             rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
-            
+
             /* Apply mask */
-            rinv_SSE0          = _mm_and_ps(rinv_SSE0,jmask_SSE0);
-            rinv_SSE1          = _mm_and_ps(rinv_SSE1,jmask_SSE1);
-            rinv_SSE2          = _mm_and_ps(rinv_SSE2,jmask_SSE2);
-            rinv_SSE3          = _mm_and_ps(rinv_SSE3,jmask_SSE3);
-            
-            irsq_SSE0          = _mm_mul_ps(rinv_SSE0,rinv_SSE0);
-            irsq_SSE1          = _mm_mul_ps(rinv_SSE1,rinv_SSE1);
-            irsq_SSE2          = _mm_mul_ps(rinv_SSE2,rinv_SSE2);
-            irsq_SSE3          = _mm_mul_ps(rinv_SSE3,rinv_SSE3);
-            idr4_SSE0          = _mm_mul_ps(irsq_SSE0,irsq_SSE0);
-            idr4_SSE1          = _mm_mul_ps(irsq_SSE1,irsq_SSE1);
-            idr4_SSE2          = _mm_mul_ps(irsq_SSE2,irsq_SSE2);
-            idr4_SSE3          = _mm_mul_ps(irsq_SSE3,irsq_SSE3);
-            idr6_SSE0          = _mm_mul_ps(idr4_SSE0,irsq_SSE0);
-            idr6_SSE1          = _mm_mul_ps(idr4_SSE1,irsq_SSE1);
-            idr6_SSE2          = _mm_mul_ps(idr4_SSE2,irsq_SSE2);
-            idr6_SSE3          = _mm_mul_ps(idr4_SSE3,irsq_SSE3);
-            
+            rinv_SSE0          = _mm_and_ps(rinv_SSE0, jmask_SSE0);
+            rinv_SSE1          = _mm_and_ps(rinv_SSE1, jmask_SSE1);
+            rinv_SSE2          = _mm_and_ps(rinv_SSE2, jmask_SSE2);
+            rinv_SSE3          = _mm_and_ps(rinv_SSE3, jmask_SSE3);
+
+            irsq_SSE0          = _mm_mul_ps(rinv_SSE0, rinv_SSE0);
+            irsq_SSE1          = _mm_mul_ps(rinv_SSE1, rinv_SSE1);
+            irsq_SSE2          = _mm_mul_ps(rinv_SSE2, rinv_SSE2);
+            irsq_SSE3          = _mm_mul_ps(rinv_SSE3, rinv_SSE3);
+            idr4_SSE0          = _mm_mul_ps(irsq_SSE0, irsq_SSE0);
+            idr4_SSE1          = _mm_mul_ps(irsq_SSE1, irsq_SSE1);
+            idr4_SSE2          = _mm_mul_ps(irsq_SSE2, irsq_SSE2);
+            idr4_SSE3          = _mm_mul_ps(irsq_SSE3, irsq_SSE3);
+            idr6_SSE0          = _mm_mul_ps(idr4_SSE0, irsq_SSE0);
+            idr6_SSE1          = _mm_mul_ps(idr4_SSE1, irsq_SSE1);
+            idr6_SSE2          = _mm_mul_ps(idr4_SSE2, irsq_SSE2);
+            idr6_SSE3          = _mm_mul_ps(idr4_SSE3, irsq_SSE3);
+
             raj_SSE            = _mm_load_ps(gb_radius+j);
             vaj_SSE            = _mm_load_ps(vsolv+j);
-            
-            rvdw_SSE0          = _mm_add_ps(rai_SSE0,raj_SSE);
-            rvdw_SSE1          = _mm_add_ps(rai_SSE1,raj_SSE);
-            rvdw_SSE2          = _mm_add_ps(rai_SSE2,raj_SSE);
-            rvdw_SSE3          = _mm_add_ps(rai_SSE3,raj_SSE);
-            
-            ratio_SSE0         = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0,rvdw_SSE0)));
-            ratio_SSE1         = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1,rvdw_SSE1)));
-            ratio_SSE2         = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2,rvdw_SSE2)));
-            ratio_SSE3         = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3,rvdw_SSE3)));
-            
-            ratio_SSE0         = _mm_min_ps(ratio_SSE0,still_p5inv_SSE);
-            ratio_SSE1         = _mm_min_ps(ratio_SSE1,still_p5inv_SSE);
-            ratio_SSE2         = _mm_min_ps(ratio_SSE2,still_p5inv_SSE);
-            ratio_SSE3         = _mm_min_ps(ratio_SSE3,still_p5inv_SSE);
-            theta_SSE0         = _mm_mul_ps(ratio_SSE0,still_pip5_SSE);
-            theta_SSE1         = _mm_mul_ps(ratio_SSE1,still_pip5_SSE);
-            theta_SSE2         = _mm_mul_ps(ratio_SSE2,still_pip5_SSE);
-            theta_SSE3         = _mm_mul_ps(ratio_SSE3,still_pip5_SSE);
-            gmx_mm_sincos_ps(theta_SSE0,&sinq_SSE0,&cosq_SSE0);            
-            gmx_mm_sincos_ps(theta_SSE1,&sinq_SSE1,&cosq_SSE1);            
-            gmx_mm_sincos_ps(theta_SSE2,&sinq_SSE2,&cosq_SSE2);            
-            gmx_mm_sincos_ps(theta_SSE3,&sinq_SSE3,&cosq_SSE3);            
-            term_SSE0          = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE0));
-            term_SSE1          = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE1));
-            term_SSE2          = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE2));
-            term_SSE3          = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE3));
-            ccf_SSE0           = _mm_mul_ps(term_SSE0,term_SSE0);
-            ccf_SSE1           = _mm_mul_ps(term_SSE1,term_SSE1);
-            ccf_SSE2           = _mm_mul_ps(term_SSE2,term_SSE2);
-            ccf_SSE3           = _mm_mul_ps(term_SSE3,term_SSE3);
-            dccf_SSE0          = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE0),
-                                            _mm_mul_ps(sinq_SSE0,theta_SSE0));
-            dccf_SSE1          = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE1),
-                                            _mm_mul_ps(sinq_SSE1,theta_SSE1));
-            dccf_SSE2          = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE2),
-                                            _mm_mul_ps(sinq_SSE2,theta_SSE2));
-            dccf_SSE3          = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE3),
-                                            _mm_mul_ps(sinq_SSE3,theta_SSE3));
-            
-            prod_SSE           = _mm_mul_ps(still_p4_SSE,vaj_SSE);
-            icf4_SSE0          = _mm_mul_ps(ccf_SSE0,idr4_SSE0);
-            icf4_SSE1          = _mm_mul_ps(ccf_SSE1,idr4_SSE1);
-            icf4_SSE2          = _mm_mul_ps(ccf_SSE2,idr4_SSE2);
-            icf4_SSE3          = _mm_mul_ps(ccf_SSE3,idr4_SSE3);
-            icf6_SSE0          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE0),dccf_SSE0), idr6_SSE0);
-            icf6_SSE1          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE1),dccf_SSE1), idr6_SSE1);
-            icf6_SSE2          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE2),dccf_SSE2), idr6_SSE2);
-            icf6_SSE3          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE3),dccf_SSE3), idr6_SSE3);
-            
-            _mm_store_ps(work+j , _mm_add_ps(_mm_load_ps(work+j),
-                                             gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0,icf4_SSE0),
-                                                         _mm_mul_ps(prod_ai_SSE1,icf4_SSE1),
-                                                         _mm_mul_ps(prod_ai_SSE2,icf4_SSE2),
-                                                         _mm_mul_ps(prod_ai_SSE3,icf4_SSE3))));
-            
-            gpi_SSE0           = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE,icf4_SSE0));
-            gpi_SSE1           = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE,icf4_SSE1));
-            gpi_SSE2           = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE,icf4_SSE2));
-            gpi_SSE3           = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE,icf4_SSE3));
-              
+
+            rvdw_SSE0          = _mm_add_ps(rai_SSE0, raj_SSE);
+            rvdw_SSE1          = _mm_add_ps(rai_SSE1, raj_SSE);
+            rvdw_SSE2          = _mm_add_ps(rai_SSE2, raj_SSE);
+            rvdw_SSE3          = _mm_add_ps(rai_SSE3, raj_SSE);
+
+            ratio_SSE0         = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0, rvdw_SSE0)));
+            ratio_SSE1         = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1, rvdw_SSE1)));
+            ratio_SSE2         = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2, rvdw_SSE2)));
+            ratio_SSE3         = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3, rvdw_SSE3)));
+
+            ratio_SSE0         = _mm_min_ps(ratio_SSE0, still_p5inv_SSE);
+            ratio_SSE1         = _mm_min_ps(ratio_SSE1, still_p5inv_SSE);
+            ratio_SSE2         = _mm_min_ps(ratio_SSE2, still_p5inv_SSE);
+            ratio_SSE3         = _mm_min_ps(ratio_SSE3, still_p5inv_SSE);
+            theta_SSE0         = _mm_mul_ps(ratio_SSE0, still_pip5_SSE);
+            theta_SSE1         = _mm_mul_ps(ratio_SSE1, still_pip5_SSE);
+            theta_SSE2         = _mm_mul_ps(ratio_SSE2, still_pip5_SSE);
+            theta_SSE3         = _mm_mul_ps(ratio_SSE3, still_pip5_SSE);
+            gmx_mm_sincos_ps(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
+            gmx_mm_sincos_ps(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
+            gmx_mm_sincos_ps(theta_SSE2, &sinq_SSE2, &cosq_SSE2);
+            gmx_mm_sincos_ps(theta_SSE3, &sinq_SSE3, &cosq_SSE3);
+            term_SSE0          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE0));
+            term_SSE1          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE1));
+            term_SSE2          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE2));
+            term_SSE3          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE3));
+            ccf_SSE0           = _mm_mul_ps(term_SSE0, term_SSE0);
+            ccf_SSE1           = _mm_mul_ps(term_SSE1, term_SSE1);
+            ccf_SSE2           = _mm_mul_ps(term_SSE2, term_SSE2);
+            ccf_SSE3           = _mm_mul_ps(term_SSE3, term_SSE3);
+            dccf_SSE0          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE0),
+                                            _mm_mul_ps(sinq_SSE0, theta_SSE0));
+            dccf_SSE1          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE1),
+                                            _mm_mul_ps(sinq_SSE1, theta_SSE1));
+            dccf_SSE2          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE2),
+                                            _mm_mul_ps(sinq_SSE2, theta_SSE2));
+            dccf_SSE3          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE3),
+                                            _mm_mul_ps(sinq_SSE3, theta_SSE3));
+
+            prod_SSE           = _mm_mul_ps(still_p4_SSE, vaj_SSE);
+            icf4_SSE0          = _mm_mul_ps(ccf_SSE0, idr4_SSE0);
+            icf4_SSE1          = _mm_mul_ps(ccf_SSE1, idr4_SSE1);
+            icf4_SSE2          = _mm_mul_ps(ccf_SSE2, idr4_SSE2);
+            icf4_SSE3          = _mm_mul_ps(ccf_SSE3, idr4_SSE3);
+            icf6_SSE0          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
+            icf6_SSE1          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);
+            icf6_SSE2          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE2), dccf_SSE2), idr6_SSE2);
+            icf6_SSE3          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE3), dccf_SSE3), idr6_SSE3);
+
+            _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
+                                            gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0, icf4_SSE0),
+                                                           _mm_mul_ps(prod_ai_SSE1, icf4_SSE1),
+                                                           _mm_mul_ps(prod_ai_SSE2, icf4_SSE2),
+                                                           _mm_mul_ps(prod_ai_SSE3, icf4_SSE3))));
+
+            gpi_SSE0           = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE, icf4_SSE0));
+            gpi_SSE1           = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE, icf4_SSE1));
+            gpi_SSE2           = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE, icf4_SSE2));
+            gpi_SSE3           = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE, icf4_SSE3));
+
             /* Save ai->aj and aj->ai chain rule terms */
-            _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE0));
-            dadx+=4;
-            _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE1));
-            dadx+=4;
-            _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE2));
-            dadx+=4;
-            _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE3));
-            dadx+=4;            
-            
-            _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE0,icf6_SSE0));
-            dadx+=4;
-            _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE1,icf6_SSE1));
-            dadx+=4;
-            _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE2,icf6_SSE2));
-            dadx+=4;
-            _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE3,icf6_SSE3));
-            dadx+=4;
+            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE0));
+            dadx += 4;
+            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE1));
+            dadx += 4;
+            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE2));
+            dadx += 4;
+            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE3));
+            dadx += 4;
+
+            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE0, icf6_SSE0));
+            dadx += 4;
+            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE1, icf6_SSE1));
+            dadx += 4;
+            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE2, icf6_SSE2));
+            dadx += 4;
+            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE3, icf6_SSE3));
+            dadx += 4;
         }
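
All three loops kill excluded or out-of-range pairs the same way: each mask lane holds the bit pattern 0x00000000 or 0xffffffff, and ANDing it into 1/r zeroes every downstream term, because icf4 and icf6 both carry powers of 1/r. A minimal sketch of the trick:

    #include <xmmintrin.h>

    /* Excluded lanes become +0.0f; everything multiplied by a power
     * of rinv afterwards vanishes with them. */
    static inline __m128 apply_pair_mask(__m128 rinv, __m128 mask)
    {
        return _mm_and_ps(rinv, mask);
    }
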
-        _MM_TRANSPOSE4_PS(gpi_SSE0,gpi_SSE1,gpi_SSE2,gpi_SSE3);
-        gpi_SSE0 = _mm_add_ps(gpi_SSE0,gpi_SSE1);
-        gpi_SSE2 = _mm_add_ps(gpi_SSE2,gpi_SSE3);
-        gpi_SSE0 = _mm_add_ps(gpi_SSE0,gpi_SSE2);
+        _MM_TRANSPOSE4_PS(gpi_SSE0, gpi_SSE1, gpi_SSE2, gpi_SSE3);
+        gpi_SSE0 = _mm_add_ps(gpi_SSE0, gpi_SSE1);
+        gpi_SSE2 = _mm_add_ps(gpi_SSE2, gpi_SSE3);
+        gpi_SSE0 = _mm_add_ps(gpi_SSE0, gpi_SSE2);
         _mm_store_ps(work+i, _mm_add_ps(gpi_SSE0, _mm_load_ps(work+i)));
-       }    
-    
+    }
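
Each gpi_SSEk register leaves the j loops holding four partial sums for i atom i+k. The transpose-and-add sequence above is a standard 4x4 horizontal reduction; schematically:

    #include <xmmintrin.h>

    /* After the transpose, element k of every register belongs to
     * i atom i+k, so three vector adds yield the per-atom totals. */
    static inline __m128 reduce4(__m128 s0, __m128 s1, __m128 s2, __m128 s3)
    {
        _MM_TRANSPOSE4_PS(s0, s1, s2, s3);
        return _mm_add_ps(_mm_add_ps(s0, s1), _mm_add_ps(s2, s3));
    }
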
+
     /* In case we have written anything beyond natoms, move it back.
      * Never mind that we leave stuff above natoms; that will not
      * be accessed later in the routine.
      * In principle this should be a move rather than sum, but this
      * way we don't have to worry about even/odd offsets...
      */
-    for(i=natoms;i<ni1+1+natoms/2;i++)
+    for (i = natoms; i < ni1+1+natoms/2; i++)
     {
         work[i-natoms] += work[i];
     }
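
The fold-back works because the j arrays are padded with a copy of the first half of the system (see the identical "Copy again" loop in the OBC routine below): a j loop that runs past natoms writes into the padded tail, and those slots alias atoms 0..natoms/2. A toy illustration, not GROMACS code:

    #include <stdio.h>

    int main(void)
    {
        float work[12] = {0};
        int   natoms   = 8, i;

        work[9] = 1.0f;    /* a j loop wrote past natoms; logically atom 1 */
        for (i = natoms; i < 12; i++)
        {
            work[i-natoms] += work[i];    /* fold the padded tail back */
        }
        printf("%g\n", work[1]);          /* prints 1 */
        return 0;
    }
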
-    
-    /* Parallel summations */
-       if(PARTDECOMP(cr))
-       {
-               gmx_sum(natoms,work,cr);
-       }
-       
+
+    /* Parallel summations would go here if ever implemented with DD */
+
     factor  = 0.5 * ONE_4PI_EPS0;
-       /* Calculate the radii - should we do all atoms, or just our local ones? */
-       for(i=0;i<natoms;i++)
-       {
-               if(born->use[i] != 0)
-               {
-                       gpi  = born->gpol[i]+work[i];
-                       gpi2 = gpi * gpi;
-                       born->bRad[i]   = factor*gmx_invsqrt(gpi2);
-                       fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
-               }
-       }
-       
-       return 0;
+    /* Calculate the radii - should we do all atoms, or just our local ones? */
+    for (i = 0; i < natoms; i++)
+    {
+        if (born->use[i] != 0)
+        {
+            gpi             = born->gpol[i]+work[i];
+            gpi2            = gpi * gpi;
+            born->bRad[i]   = factor*gmx_invsqrt(gpi2);
+            fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
+        }
+    }
+
+    return 0;
 }
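
The final loop of this routine converts the accumulated polarization sum into Born radii: with factor = 0.5*ONE_4PI_EPS0, bRad_i = factor/|gpol_i + work_i|, and fr->invsqrta caches 1/sqrt(bRad_i), presumably for later force kernels. A scalar sketch, approximating gmx_invsqrt by 1/sqrtf:

    #include <math.h>

    static void still_radius(float gpol_i, float work_i, float factor,
                             float *bRad, float *invsqrta)
    {
        float gpi  = gpol_i + work_i;
        float gpi2 = gpi*gpi;

        *bRad     = factor/sqrtf(gpi2);   /* == factor/fabsf(gpi) */
        *invsqrta = 1.0f/sqrtf(*bRad);
    }
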
 
 
@@ -1390,129 +1389,129 @@ genborn_allvsall_calc_hct_obc_radii_sse2_single(t_forcerec *           fr,
                                                 t_commrec *            cr,
                                                 void *                 paadata)
 {
-       gmx_allvsallgb2_data_t *aadata;
-       int        natoms;
-       int        ni0,ni1;
-       int        nj0,nj1,nj2,nj3;
-       int        i,j,k,n;
-    int *      mask;
-    int *      pmask0;
-    int *      pmask1;
-    int *      pmask2;
-    int *      pmask3;
-    int *      emask0;
-    int *      emask1;
-    int *      emask2;
-    int *      emask3;
-    real *     gb_radius;
-    real *     vsolv;
-    real *     work;
-    real       tmpsum[4];
-    real *     x_align;
-    real *     y_align;
-    real *     z_align;
-    int *      jindex;
-    real *     dadx;
-    real *     obc_param;
-    real       rad,min_rad;
-    real rai,rai_inv,rai_inv2,sum_ai,sum_ai2,sum_ai3,tsum,tchain;
-
-    __m128     ix_SSE0,iy_SSE0,iz_SSE0;
-    __m128     ix_SSE1,iy_SSE1,iz_SSE1;
-    __m128     ix_SSE2,iy_SSE2,iz_SSE2;
-    __m128     ix_SSE3,iy_SSE3,iz_SSE3;
-    __m128     gpi_SSE0,rai_SSE0,prod_ai_SSE0;
-    __m128     gpi_SSE1,rai_SSE1,prod_ai_SSE1;
-    __m128     gpi_SSE2,rai_SSE2,prod_ai_SSE2;
-    __m128     gpi_SSE3,rai_SSE3,prod_ai_SSE3;
-    __m128     imask_SSE0,jmask_SSE0;
-    __m128     imask_SSE1,jmask_SSE1;
-    __m128     imask_SSE2,jmask_SSE2;
-    __m128     imask_SSE3,jmask_SSE3;
-    __m128     jx_SSE,jy_SSE,jz_SSE;
-    __m128     dx_SSE0,dy_SSE0,dz_SSE0;
-    __m128     dx_SSE1,dy_SSE1,dz_SSE1;
-    __m128     dx_SSE2,dy_SSE2,dz_SSE2;
-    __m128     dx_SSE3,dy_SSE3,dz_SSE3;
-    __m128     rsq_SSE0,rinv_SSE0,irsq_SSE0,idr4_SSE0,idr6_SSE0;
-    __m128     rsq_SSE1,rinv_SSE1,irsq_SSE1,idr4_SSE1,idr6_SSE1;
-    __m128     rsq_SSE2,rinv_SSE2,irsq_SSE2,idr4_SSE2,idr6_SSE2;
-    __m128     rsq_SSE3,rinv_SSE3,irsq_SSE3,idr4_SSE3,idr6_SSE3;
-    __m128     raj_SSE,raj_inv_SSE,sk_aj_SSE,sk2_aj_SSE;
-    __m128     ccf_SSE0,dccf_SSE0,prod_SSE0;
-    __m128     ccf_SSE1,dccf_SSE1,prod_SSE1;
-    __m128     ccf_SSE2,dccf_SSE2,prod_SSE2;
-    __m128     ccf_SSE3,dccf_SSE3,prod_SSE3;
-    __m128     icf4_SSE0,icf6_SSE0;
-    __m128     icf4_SSE1,icf6_SSE1;
-    __m128     icf4_SSE2,icf6_SSE2;
-    __m128     icf4_SSE3,icf6_SSE3;
-    __m128     oneeighth_SSE,onefourth_SSE,half_SSE,one_SSE,two_SSE,four_SSE;
-    __m128     still_p4_SSE,still_p5inv_SSE,still_pip5_SSE;
-    __m128     rai_inv_SSE0;
-    __m128     rai_inv_SSE1;
-    __m128     rai_inv_SSE2;
-    __m128     rai_inv_SSE3;
-    __m128     sk_ai_SSE0,sk2_ai_SSE0,sum_ai_SSE0;
-    __m128     sk_ai_SSE1,sk2_ai_SSE1,sum_ai_SSE1;
-    __m128     sk_ai_SSE2,sk2_ai_SSE2,sum_ai_SSE2;
-    __m128     sk_ai_SSE3,sk2_ai_SSE3,sum_ai_SSE3;
-    __m128     lij_inv_SSE0,sk2_rinv_SSE0;
-    __m128     lij_inv_SSE1,sk2_rinv_SSE1;
-    __m128     lij_inv_SSE2,sk2_rinv_SSE2;
-    __m128     lij_inv_SSE3,sk2_rinv_SSE3;
-    __m128     dr_SSE0;
-    __m128     dr_SSE1;
-    __m128     dr_SSE2;
-    __m128     dr_SSE3;
-    __m128     t1_SSE0,t2_SSE0,t3_SSE0,t4_SSE0;
-    __m128     t1_SSE1,t2_SSE1,t3_SSE1,t4_SSE1;
-    __m128     t1_SSE2,t2_SSE2,t3_SSE2,t4_SSE2;
-    __m128     t1_SSE3,t2_SSE3,t3_SSE3,t4_SSE3;
-    __m128     obc_mask1_SSE0,obc_mask2_SSE0,obc_mask3_SSE0;
-    __m128     obc_mask1_SSE1,obc_mask2_SSE1,obc_mask3_SSE1;
-    __m128     obc_mask1_SSE2,obc_mask2_SSE2,obc_mask3_SSE2;
-    __m128     obc_mask1_SSE3,obc_mask2_SSE3,obc_mask3_SSE3;
-    __m128     uij_SSE0,uij2_SSE0,uij3_SSE0;
-    __m128     uij_SSE1,uij2_SSE1,uij3_SSE1;
-    __m128     uij_SSE2,uij2_SSE2,uij3_SSE2;
-    __m128     uij_SSE3,uij2_SSE3,uij3_SSE3;
-    __m128     lij_SSE0,lij2_SSE0,lij3_SSE0;
-    __m128     lij_SSE1,lij2_SSE1,lij3_SSE1;
-    __m128     lij_SSE2,lij2_SSE2,lij3_SSE2;
-    __m128     lij_SSE3,lij2_SSE3,lij3_SSE3;
-    __m128     dlij_SSE0,diff2_SSE0,logterm_SSE0;
-    __m128     dlij_SSE1,diff2_SSE1,logterm_SSE1;
-    __m128     dlij_SSE2,diff2_SSE2,logterm_SSE2;
-    __m128     dlij_SSE3,diff2_SSE3,logterm_SSE3;
-    __m128     doffset_SSE;
-    
+    gmx_allvsallgb2_data_t *aadata;
+    int                     natoms;
+    int                     ni0, ni1;
+    int                     nj0, nj1, nj2, nj3;
+    int                     i, j, k, n;
+    int              *      mask;
+    int              *      pmask0;
+    int              *      pmask1;
+    int              *      pmask2;
+    int              *      pmask3;
+    int              *      emask0;
+    int              *      emask1;
+    int              *      emask2;
+    int              *      emask3;
+    real              *     gb_radius;
+    real              *     vsolv;
+    real              *     work;
+    real                    tmpsum[4];
+    real              *     x_align;
+    real              *     y_align;
+    real              *     z_align;
+    int              *      jindex;
+    real              *     dadx;
+    real              *     obc_param;
+    real                    rad, min_rad;
+    real                    rai, rai_inv, rai_inv2, sum_ai, sum_ai2, sum_ai3, tsum, tchain;
+
+    __m128                  ix_SSE0, iy_SSE0, iz_SSE0;
+    __m128                  ix_SSE1, iy_SSE1, iz_SSE1;
+    __m128                  ix_SSE2, iy_SSE2, iz_SSE2;
+    __m128                  ix_SSE3, iy_SSE3, iz_SSE3;
+    __m128                  gpi_SSE0, rai_SSE0, prod_ai_SSE0;
+    __m128                  gpi_SSE1, rai_SSE1, prod_ai_SSE1;
+    __m128                  gpi_SSE2, rai_SSE2, prod_ai_SSE2;
+    __m128                  gpi_SSE3, rai_SSE3, prod_ai_SSE3;
+    __m128                  imask_SSE0, jmask_SSE0;
+    __m128                  imask_SSE1, jmask_SSE1;
+    __m128                  imask_SSE2, jmask_SSE2;
+    __m128                  imask_SSE3, jmask_SSE3;
+    __m128                  jx_SSE, jy_SSE, jz_SSE;
+    __m128                  dx_SSE0, dy_SSE0, dz_SSE0;
+    __m128                  dx_SSE1, dy_SSE1, dz_SSE1;
+    __m128                  dx_SSE2, dy_SSE2, dz_SSE2;
+    __m128                  dx_SSE3, dy_SSE3, dz_SSE3;
+    __m128                  rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0;
+    __m128                  rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1;
+    __m128                  rsq_SSE2, rinv_SSE2, irsq_SSE2, idr4_SSE2, idr6_SSE2;
+    __m128                  rsq_SSE3, rinv_SSE3, irsq_SSE3, idr4_SSE3, idr6_SSE3;
+    __m128                  raj_SSE, raj_inv_SSE, sk_aj_SSE, sk2_aj_SSE;
+    __m128                  ccf_SSE0, dccf_SSE0, prod_SSE0;
+    __m128                  ccf_SSE1, dccf_SSE1, prod_SSE1;
+    __m128                  ccf_SSE2, dccf_SSE2, prod_SSE2;
+    __m128                  ccf_SSE3, dccf_SSE3, prod_SSE3;
+    __m128                  icf4_SSE0, icf6_SSE0;
+    __m128                  icf4_SSE1, icf6_SSE1;
+    __m128                  icf4_SSE2, icf6_SSE2;
+    __m128                  icf4_SSE3, icf6_SSE3;
+    __m128                  oneeighth_SSE, onefourth_SSE, half_SSE, one_SSE, two_SSE, four_SSE;
+    __m128                  still_p4_SSE, still_p5inv_SSE, still_pip5_SSE;
+    __m128                  rai_inv_SSE0;
+    __m128                  rai_inv_SSE1;
+    __m128                  rai_inv_SSE2;
+    __m128                  rai_inv_SSE3;
+    __m128                  sk_ai_SSE0, sk2_ai_SSE0, sum_ai_SSE0;
+    __m128                  sk_ai_SSE1, sk2_ai_SSE1, sum_ai_SSE1;
+    __m128                  sk_ai_SSE2, sk2_ai_SSE2, sum_ai_SSE2;
+    __m128                  sk_ai_SSE3, sk2_ai_SSE3, sum_ai_SSE3;
+    __m128                  lij_inv_SSE0, sk2_rinv_SSE0;
+    __m128                  lij_inv_SSE1, sk2_rinv_SSE1;
+    __m128                  lij_inv_SSE2, sk2_rinv_SSE2;
+    __m128                  lij_inv_SSE3, sk2_rinv_SSE3;
+    __m128                  dr_SSE0;
+    __m128                  dr_SSE1;
+    __m128                  dr_SSE2;
+    __m128                  dr_SSE3;
+    __m128                  t1_SSE0, t2_SSE0, t3_SSE0, t4_SSE0;
+    __m128                  t1_SSE1, t2_SSE1, t3_SSE1, t4_SSE1;
+    __m128                  t1_SSE2, t2_SSE2, t3_SSE2, t4_SSE2;
+    __m128                  t1_SSE3, t2_SSE3, t3_SSE3, t4_SSE3;
+    __m128                  obc_mask1_SSE0, obc_mask2_SSE0, obc_mask3_SSE0;
+    __m128                  obc_mask1_SSE1, obc_mask2_SSE1, obc_mask3_SSE1;
+    __m128                  obc_mask1_SSE2, obc_mask2_SSE2, obc_mask3_SSE2;
+    __m128                  obc_mask1_SSE3, obc_mask2_SSE3, obc_mask3_SSE3;
+    __m128                  uij_SSE0, uij2_SSE0, uij3_SSE0;
+    __m128                  uij_SSE1, uij2_SSE1, uij3_SSE1;
+    __m128                  uij_SSE2, uij2_SSE2, uij3_SSE2;
+    __m128                  uij_SSE3, uij2_SSE3, uij3_SSE3;
+    __m128                  lij_SSE0, lij2_SSE0, lij3_SSE0;
+    __m128                  lij_SSE1, lij2_SSE1, lij3_SSE1;
+    __m128                  lij_SSE2, lij2_SSE2, lij3_SSE2;
+    __m128                  lij_SSE3, lij2_SSE3, lij3_SSE3;
+    __m128                  dlij_SSE0, diff2_SSE0, logterm_SSE0;
+    __m128                  dlij_SSE1, diff2_SSE1, logterm_SSE1;
+    __m128                  dlij_SSE2, diff2_SSE2, logterm_SSE2;
+    __m128                  dlij_SSE3, diff2_SSE3, logterm_SSE3;
+    __m128                  doffset_SSE;
+
     natoms              = mdatoms->nr;
-       ni0                 = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
-       ni1                 = mdatoms->start+mdatoms->homenr;
-    
+    ni0                 = 0;
+    ni1                 = mdatoms->homenr;
+
     n = 0;
-    
+
     aadata = *((gmx_allvsallgb2_data_t **)paadata);
-    
-    
-       if(aadata==NULL)
-       {
-               genborn_allvsall_setup(&aadata,top,born,mdatoms,born->gb_doffset,
-                               egbOBC,TRUE,TRUE,TRUE);
+
+
+    if (aadata == NULL)
+    {
+        genborn_allvsall_setup(&aadata, top, born, mdatoms, born->gb_doffset,
+                               egbOBC, TRUE, TRUE, TRUE);
         *((gmx_allvsallgb2_data_t **)paadata) = aadata;
-       }
-    
+    }
+
     x_align = aadata->x_align;
-       y_align = aadata->y_align;
-       z_align = aadata->z_align;
-    
+    y_align = aadata->y_align;
+    z_align = aadata->z_align;
+
     gb_radius = aadata->gb_radius;
     work      = aadata->work;
     jindex    = aadata->jindex_gb;
     dadx      = fr->dadx;
     obc_param = aadata->workparam;
-    
+
     oneeighth_SSE   = _mm_set1_ps(0.125);
     onefourth_SSE   = _mm_set1_ps(0.25);
     half_SSE        = _mm_set1_ps(0.5);
@@ -1520,45 +1519,45 @@ genborn_allvsall_calc_hct_obc_radii_sse2_single(t_forcerec *           fr,
     two_SSE         = _mm_set1_ps(2.0);
     four_SSE        = _mm_set1_ps(4.0);
     doffset_SSE     = _mm_set1_ps(born->gb_doffset);
-    
-    for(i=0;i<natoms;i++)
-       {
-               x_align[i]  = x[3*i];
-               y_align[i]  = x[3*i+1];
-               z_align[i]  = x[3*i+2];
-       }
-    
+
+    for (i = 0; i < natoms; i++)
+    {
+        x_align[i]  = x[3*i];
+        y_align[i]  = x[3*i+1];
+        z_align[i]  = x[3*i+2];
+    }
+
     /* Copy again */
-       for(i=0;i<natoms/2+1;i++)
-       {
-               x_align[natoms+i]  = x_align[i];
-               y_align[natoms+i]  = y_align[i];
-               z_align[natoms+i]  = z_align[i];
-    }        
-    
-    for(i=0;i<natoms+natoms/2+1;i++)
+    for (i = 0; i < natoms/2+1; i++)
+    {
+        x_align[natoms+i]  = x_align[i];
+        y_align[natoms+i]  = y_align[i];
+        z_align[natoms+i]  = z_align[i];
+    }
+
+    for (i = 0; i < natoms+natoms/2+1; i++)
     {
         work[i] = 0;
     }
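
The natoms/2+1 replication above is what makes the linear j loops possible: in an all-vs-all kernel with Newton's third law, atom i appears to need only the half-shell of cyclic neighbors j = (i+k) % natoms, k = 1..natoms/2 (the jindex ranges), and copying the first half to the tail lets i+k be used directly as a linear index. A toy consistency check of that aliasing, as an illustration only:

    #include <assert.h>

    static void check_half_shell_padding(const float *x_align, int natoms)
    {
        int i, k;

        for (i = 0; i < natoms; i++)
        {
            for (k = 1; k <= natoms/2; k++)
            {
                if (i+k >= natoms)
                {
                    /* the padded tail aliases the cyclic neighbor;
                     * exact float compare is fine since it was copied */
                    assert(x_align[i+k] == x_align[(i+k) % natoms]);
                }
            }
        }
    }
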
-        
-       for(i=ni0; i<ni1; i+= UNROLLI)
-       {
-               /* We assume shifts are NOT used for all-vs-all interactions */
-        
-               /* Load i atom data */
-               ix_SSE0          = _mm_load1_ps(x_align+i);
-               iy_SSE0          = _mm_load1_ps(y_align+i);
-               iz_SSE0          = _mm_load1_ps(z_align+i);
-               ix_SSE1          = _mm_load1_ps(x_align+i+1);
-               iy_SSE1          = _mm_load1_ps(y_align+i+1);
-               iz_SSE1          = _mm_load1_ps(z_align+i+1);
-               ix_SSE2          = _mm_load1_ps(x_align+i+2);
-               iy_SSE2          = _mm_load1_ps(y_align+i+2);
-               iz_SSE2          = _mm_load1_ps(z_align+i+2);
-               ix_SSE3          = _mm_load1_ps(x_align+i+3);
-               iy_SSE3          = _mm_load1_ps(y_align+i+3);
-               iz_SSE3          = _mm_load1_ps(z_align+i+3);
-        
+
+    for (i = ni0; i < ni1; i += UNROLLI)
+    {
+        /* We assume shifts are NOT used for all-vs-all interactions */
+
+        /* Load i atom data */
+        ix_SSE0          = _mm_load1_ps(x_align+i);
+        iy_SSE0          = _mm_load1_ps(y_align+i);
+        iz_SSE0          = _mm_load1_ps(z_align+i);
+        ix_SSE1          = _mm_load1_ps(x_align+i+1);
+        iy_SSE1          = _mm_load1_ps(y_align+i+1);
+        iz_SSE1          = _mm_load1_ps(z_align+i+1);
+        ix_SSE2          = _mm_load1_ps(x_align+i+2);
+        iy_SSE2          = _mm_load1_ps(y_align+i+2);
+        iz_SSE2          = _mm_load1_ps(z_align+i+2);
+        ix_SSE3          = _mm_load1_ps(x_align+i+3);
+        iy_SSE3          = _mm_load1_ps(y_align+i+3);
+        iz_SSE3          = _mm_load1_ps(z_align+i+3);
+
         rai_SSE0         = _mm_load1_ps(gb_radius+i);
         rai_SSE1         = _mm_load1_ps(gb_radius+i+1);
         rai_SSE2         = _mm_load1_ps(gb_radius+i+2);
@@ -1567,118 +1566,118 @@ genborn_allvsall_calc_hct_obc_radii_sse2_single(t_forcerec *           fr,
         rai_inv_SSE1     = gmx_mm_inv_ps(rai_SSE1);
         rai_inv_SSE2     = gmx_mm_inv_ps(rai_SSE2);
         rai_inv_SSE3     = gmx_mm_inv_ps(rai_SSE3);
-        
+
         sk_ai_SSE0       = _mm_load1_ps(obc_param+i);
         sk_ai_SSE1       = _mm_load1_ps(obc_param+i+1);
         sk_ai_SSE2       = _mm_load1_ps(obc_param+i+2);
         sk_ai_SSE3       = _mm_load1_ps(obc_param+i+3);
-        sk2_ai_SSE0      = _mm_mul_ps(sk_ai_SSE0,sk_ai_SSE0);
-        sk2_ai_SSE1      = _mm_mul_ps(sk_ai_SSE1,sk_ai_SSE1);
-        sk2_ai_SSE2      = _mm_mul_ps(sk_ai_SSE2,sk_ai_SSE2);
-        sk2_ai_SSE3      = _mm_mul_ps(sk_ai_SSE3,sk_ai_SSE3);
-        
+        sk2_ai_SSE0      = _mm_mul_ps(sk_ai_SSE0, sk_ai_SSE0);
+        sk2_ai_SSE1      = _mm_mul_ps(sk_ai_SSE1, sk_ai_SSE1);
+        sk2_ai_SSE2      = _mm_mul_ps(sk_ai_SSE2, sk_ai_SSE2);
+        sk2_ai_SSE3      = _mm_mul_ps(sk_ai_SSE3, sk_ai_SSE3);
+
         sum_ai_SSE0      = _mm_setzero_ps();
         sum_ai_SSE1      = _mm_setzero_ps();
         sum_ai_SSE2      = _mm_setzero_ps();
         sum_ai_SSE3      = _mm_setzero_ps();
-        
-               /* Load limits for loop over neighbors */
-               nj0              = jindex[4*i];
-               nj1              = jindex[4*i+1];
-               nj2              = jindex[4*i+2];
-               nj3              = jindex[4*i+3];
-        
+
+        /* Load limits for loop over neighbors */
+        nj0              = jindex[4*i];
+        nj1              = jindex[4*i+1];
+        nj2              = jindex[4*i+2];
+        nj3              = jindex[4*i+3];
+
         pmask0           = aadata->prologue_mask_gb[i];
         pmask1           = aadata->prologue_mask_gb[i+1];
         pmask2           = aadata->prologue_mask_gb[i+2];
         pmask3           = aadata->prologue_mask_gb[i+3];
-        emask0           = aadata->epilogue_mask[i]; 
-        emask1           = aadata->epilogue_mask[i+1]; 
-        emask2           = aadata->epilogue_mask[i+2]; 
-        emask3           = aadata->epilogue_mask[i+3]; 
-        
+        emask0           = aadata->epilogue_mask[i];
+        emask1           = aadata->epilogue_mask[i+1];
+        emask2           = aadata->epilogue_mask[i+2];
+        emask3           = aadata->epilogue_mask[i+3];
+
         imask_SSE0        = _mm_load1_ps((real *)(aadata->imask+i));
         imask_SSE1        = _mm_load1_ps((real *)(aadata->imask+i+1));
         imask_SSE2        = _mm_load1_ps((real *)(aadata->imask+i+2));
         imask_SSE3        = _mm_load1_ps((real *)(aadata->imask+i+3));
-        
+
         /* Prologue part, including exclusion mask */
-        for(j=nj0; j<nj1; j+=UNROLLJ)
-        {          
+        for (j = nj0; j < nj1; j += UNROLLJ)
+        {
             jmask_SSE0 = _mm_load_ps((real *)pmask0);
             jmask_SSE1 = _mm_load_ps((real *)pmask1);
             jmask_SSE2 = _mm_load_ps((real *)pmask2);
             jmask_SSE3 = _mm_load_ps((real *)pmask3);
-            pmask0 += UNROLLJ;
-            pmask1 += UNROLLJ;
-            pmask2 += UNROLLJ;
-            pmask3 += UNROLLJ;
-            
+            pmask0    += UNROLLJ;
+            pmask1    += UNROLLJ;
+            pmask2    += UNROLLJ;
+            pmask3    += UNROLLJ;
+
             /* load j atom coordinates */
             jx_SSE            = _mm_load_ps(x_align+j);
             jy_SSE            = _mm_load_ps(y_align+j);
             jz_SSE            = _mm_load_ps(z_align+j);
-            
+
             /* Calculate distance */
-            dx_SSE0            = _mm_sub_ps(ix_SSE0,jx_SSE);
-            dy_SSE0            = _mm_sub_ps(iy_SSE0,jy_SSE);
-            dz_SSE0            = _mm_sub_ps(iz_SSE0,jz_SSE);
-            dx_SSE1            = _mm_sub_ps(ix_SSE1,jx_SSE);
-            dy_SSE1            = _mm_sub_ps(iy_SSE1,jy_SSE);
-            dz_SSE1            = _mm_sub_ps(iz_SSE1,jz_SSE);
-            dx_SSE2            = _mm_sub_ps(ix_SSE2,jx_SSE);
-            dy_SSE2            = _mm_sub_ps(iy_SSE2,jy_SSE);
-            dz_SSE2            = _mm_sub_ps(iz_SSE2,jz_SSE);
-            dx_SSE3            = _mm_sub_ps(ix_SSE3,jx_SSE);
-            dy_SSE3            = _mm_sub_ps(iy_SSE3,jy_SSE);
-            dz_SSE3            = _mm_sub_ps(iz_SSE3,jz_SSE);
-            
+            dx_SSE0            = _mm_sub_ps(ix_SSE0, jx_SSE);
+            dy_SSE0            = _mm_sub_ps(iy_SSE0, jy_SSE);
+            dz_SSE0            = _mm_sub_ps(iz_SSE0, jz_SSE);
+            dx_SSE1            = _mm_sub_ps(ix_SSE1, jx_SSE);
+            dy_SSE1            = _mm_sub_ps(iy_SSE1, jy_SSE);
+            dz_SSE1            = _mm_sub_ps(iz_SSE1, jz_SSE);
+            dx_SSE2            = _mm_sub_ps(ix_SSE2, jx_SSE);
+            dy_SSE2            = _mm_sub_ps(iy_SSE2, jy_SSE);
+            dz_SSE2            = _mm_sub_ps(iz_SSE2, jz_SSE);
+            dx_SSE3            = _mm_sub_ps(ix_SSE3, jx_SSE);
+            dy_SSE3            = _mm_sub_ps(iy_SSE3, jy_SSE);
+            dz_SSE3            = _mm_sub_ps(iz_SSE3, jz_SSE);
+
             /* rsq = dx*dx+dy*dy+dz*dz */
-            rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
-            rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
-            rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
-            rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);
-            
+            rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
+            rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
+            rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
+            rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
+
             /* Combine masks */
-            jmask_SSE0         = _mm_and_ps(jmask_SSE0,imask_SSE0);
-            jmask_SSE1         = _mm_and_ps(jmask_SSE1,imask_SSE1);
-            jmask_SSE2         = _mm_and_ps(jmask_SSE2,imask_SSE2);
-            jmask_SSE3         = _mm_and_ps(jmask_SSE3,imask_SSE3);
-            
+            jmask_SSE0         = _mm_and_ps(jmask_SSE0, imask_SSE0);
+            jmask_SSE1         = _mm_and_ps(jmask_SSE1, imask_SSE1);
+            jmask_SSE2         = _mm_and_ps(jmask_SSE2, imask_SSE2);
+            jmask_SSE3         = _mm_and_ps(jmask_SSE3, imask_SSE3);
+
             /* Calculate 1/r */
             rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
             rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
             rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
             rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
-            
+
             /* Apply mask */
-            rinv_SSE0          = _mm_and_ps(rinv_SSE0,jmask_SSE0);
-            rinv_SSE1          = _mm_and_ps(rinv_SSE1,jmask_SSE1);
-            rinv_SSE2          = _mm_and_ps(rinv_SSE2,jmask_SSE2);
-            rinv_SSE3          = _mm_and_ps(rinv_SSE3,jmask_SSE3);
-            
-            dr_SSE0            = _mm_mul_ps(rsq_SSE0,rinv_SSE0);
-            dr_SSE1            = _mm_mul_ps(rsq_SSE1,rinv_SSE1);
-            dr_SSE2            = _mm_mul_ps(rsq_SSE2,rinv_SSE2);
-            dr_SSE3            = _mm_mul_ps(rsq_SSE3,rinv_SSE3);
+            rinv_SSE0          = _mm_and_ps(rinv_SSE0, jmask_SSE0);
+            rinv_SSE1          = _mm_and_ps(rinv_SSE1, jmask_SSE1);
+            rinv_SSE2          = _mm_and_ps(rinv_SSE2, jmask_SSE2);
+            rinv_SSE3          = _mm_and_ps(rinv_SSE3, jmask_SSE3);
+
+            dr_SSE0            = _mm_mul_ps(rsq_SSE0, rinv_SSE0);
+            dr_SSE1            = _mm_mul_ps(rsq_SSE1, rinv_SSE1);
+            dr_SSE2            = _mm_mul_ps(rsq_SSE2, rinv_SSE2);
+            dr_SSE3            = _mm_mul_ps(rsq_SSE3, rinv_SSE3);
 
             sk_aj_SSE          = _mm_load_ps(obc_param+j);
             raj_SSE            = _mm_load_ps(gb_radius+j);
             raj_inv_SSE        = gmx_mm_inv_ps(raj_SSE);
-            
+
             /* Evaluate influence of atom aj -> ai */
-            t1_SSE0            = _mm_add_ps(dr_SSE0,sk_aj_SSE);
-            t1_SSE1            = _mm_add_ps(dr_SSE1,sk_aj_SSE);
-            t1_SSE2            = _mm_add_ps(dr_SSE2,sk_aj_SSE);
-            t1_SSE3            = _mm_add_ps(dr_SSE3,sk_aj_SSE);
-            t2_SSE0            = _mm_sub_ps(dr_SSE0,sk_aj_SSE);
-            t2_SSE1            = _mm_sub_ps(dr_SSE1,sk_aj_SSE);
-            t2_SSE2            = _mm_sub_ps(dr_SSE2,sk_aj_SSE);
-            t2_SSE3            = _mm_sub_ps(dr_SSE3,sk_aj_SSE);
-            t3_SSE0            = _mm_sub_ps(sk_aj_SSE,dr_SSE0);
-            t3_SSE1            = _mm_sub_ps(sk_aj_SSE,dr_SSE1);
-            t3_SSE2            = _mm_sub_ps(sk_aj_SSE,dr_SSE2);
-            t3_SSE3            = _mm_sub_ps(sk_aj_SSE,dr_SSE3);
+            t1_SSE0            = _mm_add_ps(dr_SSE0, sk_aj_SSE);
+            t1_SSE1            = _mm_add_ps(dr_SSE1, sk_aj_SSE);
+            t1_SSE2            = _mm_add_ps(dr_SSE2, sk_aj_SSE);
+            t1_SSE3            = _mm_add_ps(dr_SSE3, sk_aj_SSE);
+            t2_SSE0            = _mm_sub_ps(dr_SSE0, sk_aj_SSE);
+            t2_SSE1            = _mm_sub_ps(dr_SSE1, sk_aj_SSE);
+            t2_SSE2            = _mm_sub_ps(dr_SSE2, sk_aj_SSE);
+            t2_SSE3            = _mm_sub_ps(dr_SSE3, sk_aj_SSE);
+            t3_SSE0            = _mm_sub_ps(sk_aj_SSE, dr_SSE0);
+            t3_SSE1            = _mm_sub_ps(sk_aj_SSE, dr_SSE1);
+            t3_SSE2            = _mm_sub_ps(sk_aj_SSE, dr_SSE2);
+            t3_SSE3            = _mm_sub_ps(sk_aj_SSE, dr_SSE3);
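+            /* With r = dr and S = sk_aj: t1 = r+S, t2 = r-S, t3 = S-r.
+             * The comparisons below classify each pair: rai < r+S means
+             * j's scaled sphere descreens atom i at all; rai < r-S means
+             * atom i lies completely outside it (lower bound 1/(r-S),
+             * otherwise 1/rai); rai < S-r means atom i is engulfed,
+             * which triggers the t4 correction further down. */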
 
             obc_mask1_SSE0     = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
             obc_mask1_SSE1     = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
@@ -1692,221 +1691,221 @@ genborn_allvsall_calc_hct_obc_radii_sse2_single(t_forcerec *           fr,
             obc_mask3_SSE1     = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
             obc_mask3_SSE2     = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
             obc_mask3_SSE3     = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
-            obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0,jmask_SSE0);
-            obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1,jmask_SSE1);
-            obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2,jmask_SSE2);
-            obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3,jmask_SSE3);
+            obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0);
+            obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1);
+            obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2);
+            obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3);
 
             uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
             uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
             uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
             uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
-            lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0,gmx_mm_inv_ps(t2_SSE0)),
-                                           _mm_andnot_ps(obc_mask2_SSE0,rai_inv_SSE0));
-            lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1,gmx_mm_inv_ps(t2_SSE1)),
-                                           _mm_andnot_ps(obc_mask2_SSE1,rai_inv_SSE1));
-            lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2,gmx_mm_inv_ps(t2_SSE2)),
-                                           _mm_andnot_ps(obc_mask2_SSE2,rai_inv_SSE2));
-            lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3,gmx_mm_inv_ps(t2_SSE3)),
-                                           _mm_andnot_ps(obc_mask2_SSE3,rai_inv_SSE3));
-            dlij_SSE0          = _mm_and_ps(one_SSE,obc_mask2_SSE0);
-            dlij_SSE1          = _mm_and_ps(one_SSE,obc_mask2_SSE1);
-            dlij_SSE2          = _mm_and_ps(one_SSE,obc_mask2_SSE2);
-            dlij_SSE3          = _mm_and_ps(one_SSE,obc_mask2_SSE3);
-            
+            lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
+                                              _mm_andnot_ps(obc_mask2_SSE0, rai_inv_SSE0));
+            lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
+                                              _mm_andnot_ps(obc_mask2_SSE1, rai_inv_SSE1));
+            lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
+                                              _mm_andnot_ps(obc_mask2_SSE2, rai_inv_SSE2));
+            lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
+                                              _mm_andnot_ps(obc_mask2_SSE3, rai_inv_SSE3));
+            dlij_SSE0          = _mm_and_ps(one_SSE, obc_mask2_SSE0);
+            dlij_SSE1          = _mm_and_ps(one_SSE, obc_mask2_SSE1);
+            dlij_SSE2          = _mm_and_ps(one_SSE, obc_mask2_SSE2);
+            dlij_SSE3          = _mm_and_ps(one_SSE, obc_mask2_SSE3);
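+            /* dlij is 1 where the lower bound depends on r and 0 where
+             * it is the constant 1/rai, switching the dL/dr term on or
+             * off in the derivative assembled below. */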
+
             uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
             uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
             uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
             uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
-            uij3_SSE0          = _mm_mul_ps(uij2_SSE0,uij_SSE0);
-            uij3_SSE1          = _mm_mul_ps(uij2_SSE1,uij_SSE1);
-            uij3_SSE2          = _mm_mul_ps(uij2_SSE2,uij_SSE2);
-            uij3_SSE3          = _mm_mul_ps(uij2_SSE3,uij_SSE3);
+            uij3_SSE0          = _mm_mul_ps(uij2_SSE0, uij_SSE0);
+            uij3_SSE1          = _mm_mul_ps(uij2_SSE1, uij_SSE1);
+            uij3_SSE2          = _mm_mul_ps(uij2_SSE2, uij_SSE2);
+            uij3_SSE3          = _mm_mul_ps(uij2_SSE3, uij_SSE3);
             lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
             lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
             lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
             lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
-            lij3_SSE0          = _mm_mul_ps(lij2_SSE0,lij_SSE0);
-            lij3_SSE1          = _mm_mul_ps(lij2_SSE1,lij_SSE1);
-            lij3_SSE2          = _mm_mul_ps(lij2_SSE2,lij_SSE2);
-            lij3_SSE3          = _mm_mul_ps(lij2_SSE3,lij_SSE3);
-            
-            diff2_SSE0         = _mm_sub_ps(uij2_SSE0,lij2_SSE0);
-            diff2_SSE1         = _mm_sub_ps(uij2_SSE1,lij2_SSE1);
-            diff2_SSE2         = _mm_sub_ps(uij2_SSE2,lij2_SSE2);
-            diff2_SSE3         = _mm_sub_ps(uij2_SSE3,lij2_SSE3);
+            lij3_SSE0          = _mm_mul_ps(lij2_SSE0, lij_SSE0);
+            lij3_SSE1          = _mm_mul_ps(lij2_SSE1, lij_SSE1);
+            lij3_SSE2          = _mm_mul_ps(lij2_SSE2, lij_SSE2);
+            lij3_SSE3          = _mm_mul_ps(lij2_SSE3, lij_SSE3);
+
+            diff2_SSE0         = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
+            diff2_SSE1         = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
+            diff2_SSE2         = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
+            diff2_SSE3         = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
             lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
             lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
             lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
             lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
-            sk2_aj_SSE         = _mm_mul_ps(sk_aj_SSE,sk_aj_SSE);
-            sk2_rinv_SSE0      = _mm_mul_ps(sk2_aj_SSE,rinv_SSE0);
-            sk2_rinv_SSE1      = _mm_mul_ps(sk2_aj_SSE,rinv_SSE1);
-            sk2_rinv_SSE2      = _mm_mul_ps(sk2_aj_SSE,rinv_SSE2);
-            sk2_rinv_SSE3      = _mm_mul_ps(sk2_aj_SSE,rinv_SSE3);
-            prod_SSE0          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE0);
-            prod_SSE1          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE1);
-            prod_SSE2          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE2);
-            prod_SSE3          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE3);
-            
-            logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0,lij_inv_SSE0));
-            logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1,lij_inv_SSE1));
-            logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2,lij_inv_SSE2));
-            logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3,lij_inv_SSE3));
-
-            t1_SSE0            = _mm_sub_ps(lij_SSE0,uij_SSE0);
-            t1_SSE1            = _mm_sub_ps(lij_SSE1,uij_SSE1);
-            t1_SSE2            = _mm_sub_ps(lij_SSE2,uij_SSE2);
-            t1_SSE3            = _mm_sub_ps(lij_SSE3,uij_SSE3);
+            sk2_aj_SSE         = _mm_mul_ps(sk_aj_SSE, sk_aj_SSE);
+            sk2_rinv_SSE0      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE0);
+            sk2_rinv_SSE1      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE1);
+            sk2_rinv_SSE2      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE2);
+            sk2_rinv_SSE3      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE3);
+            prod_SSE0          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
+            prod_SSE1          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
+            prod_SSE2          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
+            prod_SSE3          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
+
+            logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
+            logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
+            logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
+            logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
+
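+            /* lij and uij are the reciprocal integration bounds 1/L and
+             * 1/U, so t1-t4 assemble the closed-form HCT descreening
+             * integral (r = dr, S = sk_aj):
+             *   I = 1/2 [ 1/L - 1/U + (r/4)(1/U^2 - 1/L^2)
+             *             + ln(L/U)/(2r) + (S^2/(4r))(1/L^2 - 1/U^2) ]
+             * plus the correction 2(1/rai - 1/L) for engulfed atoms. */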
+            t1_SSE0            = _mm_sub_ps(lij_SSE0, uij_SSE0);
+            t1_SSE1            = _mm_sub_ps(lij_SSE1, uij_SSE1);
+            t1_SSE2            = _mm_sub_ps(lij_SSE2, uij_SSE2);
+            t1_SSE3            = _mm_sub_ps(lij_SSE3, uij_SSE3);
             t2_SSE0            = _mm_mul_ps(diff2_SSE0,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE0),
+                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
                                                        prod_SSE0));
             t2_SSE1            = _mm_mul_ps(diff2_SSE1,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE1),
+                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
                                                        prod_SSE1));
             t2_SSE2            = _mm_mul_ps(diff2_SSE2,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE2),
+                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
                                                        prod_SSE2));
             t2_SSE3            = _mm_mul_ps(diff2_SSE3,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE3),
+                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
                                                        prod_SSE3));
-            
-            t3_SSE0            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE0,logterm_SSE0));
-            t3_SSE1            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE1,logterm_SSE1));
-            t3_SSE2            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE2,logterm_SSE2));
-            t3_SSE3            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE3,logterm_SSE3));
-            t1_SSE0            = _mm_add_ps(t1_SSE0,_mm_add_ps(t2_SSE0,t3_SSE0));
-            t1_SSE1            = _mm_add_ps(t1_SSE1,_mm_add_ps(t2_SSE1,t3_SSE1));
-            t1_SSE2            = _mm_add_ps(t1_SSE2,_mm_add_ps(t2_SSE2,t3_SSE2));
-            t1_SSE3            = _mm_add_ps(t1_SSE3,_mm_add_ps(t2_SSE3,t3_SSE3));
-            t4_SSE0            = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE0,lij_SSE0));
-            t4_SSE1            = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE1,lij_SSE1));
-            t4_SSE2            = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE2,lij_SSE2));
-            t4_SSE3            = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE3,lij_SSE3));
-            t4_SSE0            = _mm_and_ps(t4_SSE0,obc_mask3_SSE0);
-            t4_SSE1            = _mm_and_ps(t4_SSE1,obc_mask3_SSE1);
-            t4_SSE2            = _mm_and_ps(t4_SSE2,obc_mask3_SSE2);
-            t4_SSE3            = _mm_and_ps(t4_SSE3,obc_mask3_SSE3);
-            t1_SSE0            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE0,t4_SSE0));
-            t1_SSE1            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE1,t4_SSE1));
-            t1_SSE2            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE2,t4_SSE2));
-            t1_SSE3            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE3,t4_SSE3));
-            
-            sum_ai_SSE0        = _mm_add_ps(sum_ai_SSE0,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
-            sum_ai_SSE1        = _mm_add_ps(sum_ai_SSE1,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
-            sum_ai_SSE2        = _mm_add_ps(sum_ai_SSE2,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
-            sum_ai_SSE3        = _mm_add_ps(sum_ai_SSE3,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
-            
-            t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE0),
-                                            _mm_mul_ps(prod_SSE0,lij3_SSE0));
-            t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE1),
-                                            _mm_mul_ps(prod_SSE1,lij3_SSE1));
-            t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE2),
-                                            _mm_mul_ps(prod_SSE2,lij3_SSE2));
-            t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE3),
-                                            _mm_mul_ps(prod_SSE3,lij3_SSE3));
+
+            t3_SSE0            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
+            t3_SSE1            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
+            t3_SSE2            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
+            t3_SSE3            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
+            t1_SSE0            = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
+            t1_SSE1            = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
+            t1_SSE2            = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
+            t1_SSE3            = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
+            t4_SSE0            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE0, lij_SSE0));
+            t4_SSE1            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE1, lij_SSE1));
+            t4_SSE2            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE2, lij_SSE2));
+            t4_SSE3            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE3, lij_SSE3));
+            t4_SSE0            = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
+            t4_SSE1            = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
+            t4_SSE2            = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
+            t4_SSE3            = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
+            t1_SSE0            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
+            t1_SSE1            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
+            t1_SSE2            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
+            t1_SSE3            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
+
+            sum_ai_SSE0        = _mm_add_ps(sum_ai_SSE0, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
+            sum_ai_SSE1        = _mm_add_ps(sum_ai_SSE1, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
+            sum_ai_SSE2        = _mm_add_ps(sum_ai_SSE2, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
+            sum_ai_SSE3        = _mm_add_ps(sum_ai_SSE3, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
+
+            t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
+                                            _mm_mul_ps(prod_SSE0, lij3_SSE0));
+            t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
+                                            _mm_mul_ps(prod_SSE1, lij3_SSE1));
+            t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
+                                            _mm_mul_ps(prod_SSE2, lij3_SSE2));
+            t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
+                                            _mm_mul_ps(prod_SSE3, lij3_SSE3));
             t1_SSE0            = _mm_sub_ps(t1_SSE0,
                                             _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE0,rinv_SSE0),
-                                                                  _mm_mul_ps(lij3_SSE0,dr_SSE0))));
+                                                       _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
+                                                                  _mm_mul_ps(lij3_SSE0, dr_SSE0))));
             t1_SSE1            = _mm_sub_ps(t1_SSE1,
                                             _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE1,rinv_SSE1),
-                                                                  _mm_mul_ps(lij3_SSE1,dr_SSE1))));
+                                                       _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
+                                                                  _mm_mul_ps(lij3_SSE1, dr_SSE1))));
             t1_SSE2            = _mm_sub_ps(t1_SSE2,
                                             _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE2,rinv_SSE2),
-                                                                  _mm_mul_ps(lij3_SSE2,dr_SSE2))));
+                                                       _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
+                                                                  _mm_mul_ps(lij3_SSE2, dr_SSE2))));
             t1_SSE3            = _mm_sub_ps(t1_SSE3,
                                             _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE3,rinv_SSE3),
-                                                                  _mm_mul_ps(lij3_SSE3,dr_SSE3))));
+                                                       _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
+                                                                  _mm_mul_ps(lij3_SSE3, dr_SSE3))));
 
             t2_SSE0            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE0,rinv_SSE0),
-                                                       _mm_mul_ps(uij3_SSE0,dr_SSE0)));
+                                            _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
+                                                       _mm_mul_ps(uij3_SSE0, dr_SSE0)));
             t2_SSE1            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE1,rinv_SSE1),
-                                                       _mm_mul_ps(uij3_SSE1,dr_SSE1)));
+                                            _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
+                                                       _mm_mul_ps(uij3_SSE1, dr_SSE1)));
             t2_SSE2            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE2,rinv_SSE2),
-                                                       _mm_mul_ps(uij3_SSE2,dr_SSE2)));
+                                            _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
+                                                       _mm_mul_ps(uij3_SSE2, dr_SSE2)));
             t2_SSE3            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE3,rinv_SSE3),
-                                                       _mm_mul_ps(uij3_SSE3,dr_SSE3)));
+                                            _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
+                                                       _mm_mul_ps(uij3_SSE3, dr_SSE3)));
             t2_SSE0            = _mm_sub_ps(t2_SSE0,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE0),
-                                                       _mm_mul_ps(prod_SSE0,uij3_SSE0)));
+                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
+                                                       _mm_mul_ps(prod_SSE0, uij3_SSE0)));
             t2_SSE1            = _mm_sub_ps(t2_SSE1,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE1),
-                                                       _mm_mul_ps(prod_SSE1,uij3_SSE1)));
+                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
+                                                       _mm_mul_ps(prod_SSE1, uij3_SSE1)));
             t2_SSE2            = _mm_sub_ps(t2_SSE2,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE2),
-                                                       _mm_mul_ps(prod_SSE2,uij3_SSE2)));
+                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
+                                                       _mm_mul_ps(prod_SSE2, uij3_SSE2)));
             t2_SSE3            = _mm_sub_ps(t2_SSE3,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE3),
-                                                       _mm_mul_ps(prod_SSE3,uij3_SSE3)));
-            t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE0),
-                                            _mm_mul_ps(rinv_SSE0,rinv_SSE0));
-            t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE1),
-                                            _mm_mul_ps(rinv_SSE1,rinv_SSE1));
-            t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE2),
-                                            _mm_mul_ps(rinv_SSE2,rinv_SSE2));
-            t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE3),
-                                            _mm_mul_ps(rinv_SSE3,rinv_SSE3));
+                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
+                                                       _mm_mul_ps(prod_SSE3, uij3_SSE3)));
+            t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
+                                            _mm_mul_ps(rinv_SSE0, rinv_SSE0));
+            t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
+                                            _mm_mul_ps(rinv_SSE1, rinv_SSE1));
+            t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
+                                            _mm_mul_ps(rinv_SSE2, rinv_SSE2));
+            t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
+                                            _mm_mul_ps(rinv_SSE3, rinv_SSE3));
             t3_SSE0            = _mm_sub_ps(t3_SSE0,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE0,oneeighth_SSE),
+                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
                                                        _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE0,rinv_SSE0))));
+                                                                  _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
             t3_SSE1            = _mm_sub_ps(t3_SSE1,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE1,oneeighth_SSE),
+                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
                                                        _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE1,rinv_SSE1))));
+                                                                  _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
             t3_SSE2            = _mm_sub_ps(t3_SSE2,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE2,oneeighth_SSE),
+                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
                                                        _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE2,rinv_SSE2))));
+                                                                  _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
             t3_SSE3            = _mm_sub_ps(t3_SSE3,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE3,oneeighth_SSE),
+                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
                                                        _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE3,rinv_SSE3))));
-            
+                                                                  _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
+
             t1_SSE0            = _mm_mul_ps(rinv_SSE0,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE0,t1_SSE0),
-                                                       _mm_add_ps(t2_SSE0,t3_SSE0)));
+                                            _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
+                                                       _mm_add_ps(t2_SSE0, t3_SSE0)));
             t1_SSE1            = _mm_mul_ps(rinv_SSE1,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE1,t1_SSE1),
-                                                       _mm_add_ps(t2_SSE1,t3_SSE1)));
+                                            _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
+                                                       _mm_add_ps(t2_SSE1, t3_SSE1)));
             t1_SSE2            = _mm_mul_ps(rinv_SSE2,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE2,t1_SSE2),
-                                                       _mm_add_ps(t2_SSE2,t3_SSE2)));
+                                            _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
+                                                       _mm_add_ps(t2_SSE2, t3_SSE2)));
             t1_SSE3            = _mm_mul_ps(rinv_SSE3,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE3,t1_SSE3),
-                                                       _mm_add_ps(t2_SSE3,t3_SSE3)));
-            
-            _mm_store_ps(dadx,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
+                                            _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
+                                                       _mm_add_ps(t2_SSE3, t3_SSE3)));
+
+            _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
             dadx += 4;
-            _mm_store_ps(dadx,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
+            _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
             dadx += 4;
-            _mm_store_ps(dadx,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
+            _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
             dadx += 4;
-            _mm_store_ps(dadx,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
+            _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
             dadx += 4;
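             /* The four stores above stash the r-derivative of the pair
              * term (pre-scaled by 1/r) in dadx; the chain-rule force
              * pass later turns these into Cartesian force contributions. */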
-            
+
             /* Evaluate influence of atom ai -> aj */
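             /* Symmetric to the aj -> ai block above: sk_ai and raj take
              * the roles of sk_aj and rai, and the resulting Born-sum
              * terms are accumulated into work[j] instead of sum_ai. */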
-            t1_SSE0            = _mm_add_ps(dr_SSE0,sk_ai_SSE0);
-            t1_SSE1            = _mm_add_ps(dr_SSE1,sk_ai_SSE1);
-            t1_SSE2            = _mm_add_ps(dr_SSE2,sk_ai_SSE2);
-            t1_SSE3            = _mm_add_ps(dr_SSE3,sk_ai_SSE3);
-            t2_SSE0            = _mm_sub_ps(dr_SSE0,sk_ai_SSE0);
-            t2_SSE1            = _mm_sub_ps(dr_SSE1,sk_ai_SSE1);
-            t2_SSE2            = _mm_sub_ps(dr_SSE2,sk_ai_SSE2);
-            t2_SSE3            = _mm_sub_ps(dr_SSE3,sk_ai_SSE3);
-            t3_SSE0            = _mm_sub_ps(sk_ai_SSE0,dr_SSE0);
-            t3_SSE1            = _mm_sub_ps(sk_ai_SSE1,dr_SSE1);
-            t3_SSE2            = _mm_sub_ps(sk_ai_SSE2,dr_SSE2);
-            t3_SSE3            = _mm_sub_ps(sk_ai_SSE3,dr_SSE3);
-            
+            t1_SSE0            = _mm_add_ps(dr_SSE0, sk_ai_SSE0);
+            t1_SSE1            = _mm_add_ps(dr_SSE1, sk_ai_SSE1);
+            t1_SSE2            = _mm_add_ps(dr_SSE2, sk_ai_SSE2);
+            t1_SSE3            = _mm_add_ps(dr_SSE3, sk_ai_SSE3);
+            t2_SSE0            = _mm_sub_ps(dr_SSE0, sk_ai_SSE0);
+            t2_SSE1            = _mm_sub_ps(dr_SSE1, sk_ai_SSE1);
+            t2_SSE2            = _mm_sub_ps(dr_SSE2, sk_ai_SSE2);
+            t2_SSE3            = _mm_sub_ps(dr_SSE3, sk_ai_SSE3);
+            t3_SSE0            = _mm_sub_ps(sk_ai_SSE0, dr_SSE0);
+            t3_SSE1            = _mm_sub_ps(sk_ai_SSE1, dr_SSE1);
+            t3_SSE2            = _mm_sub_ps(sk_ai_SSE2, dr_SSE2);
+            t3_SSE3            = _mm_sub_ps(sk_ai_SSE3, dr_SSE3);
+
             obc_mask1_SSE0     = _mm_cmplt_ps(raj_SSE, t1_SSE0);
             obc_mask1_SSE1     = _mm_cmplt_ps(raj_SSE, t1_SSE1);
             obc_mask1_SSE2     = _mm_cmplt_ps(raj_SSE, t1_SSE2);
@@ -1919,273 +1918,273 @@ genborn_allvsall_calc_hct_obc_radii_sse2_single(t_forcerec *           fr,
             obc_mask3_SSE1     = _mm_cmplt_ps(raj_SSE, t3_SSE1);
             obc_mask3_SSE2     = _mm_cmplt_ps(raj_SSE, t3_SSE2);
             obc_mask3_SSE3     = _mm_cmplt_ps(raj_SSE, t3_SSE3);
-            obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0,jmask_SSE0);
-            obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1,jmask_SSE1);
-            obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2,jmask_SSE2);
-            obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3,jmask_SSE3);
-            
+            obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0);
+            obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1);
+            obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2);
+            obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3);
+
             uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
             uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
             uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
             uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
-            lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0,gmx_mm_inv_ps(t2_SSE0)),
-                                           _mm_andnot_ps(obc_mask2_SSE0,raj_inv_SSE));
-            lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1,gmx_mm_inv_ps(t2_SSE1)),
-                                           _mm_andnot_ps(obc_mask2_SSE1,raj_inv_SSE));
-            lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2,gmx_mm_inv_ps(t2_SSE2)),
-                                           _mm_andnot_ps(obc_mask2_SSE2,raj_inv_SSE));
-            lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3,gmx_mm_inv_ps(t2_SSE3)),
-                                           _mm_andnot_ps(obc_mask2_SSE3,raj_inv_SSE));
-            dlij_SSE0          = _mm_and_ps(one_SSE,obc_mask2_SSE0);
-            dlij_SSE1          = _mm_and_ps(one_SSE,obc_mask2_SSE1);
-            dlij_SSE2          = _mm_and_ps(one_SSE,obc_mask2_SSE2);
-            dlij_SSE3          = _mm_and_ps(one_SSE,obc_mask2_SSE3);
-            
+            lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
+                                              _mm_andnot_ps(obc_mask2_SSE0, raj_inv_SSE));
+            lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
+                                              _mm_andnot_ps(obc_mask2_SSE1, raj_inv_SSE));
+            lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
+                                              _mm_andnot_ps(obc_mask2_SSE2, raj_inv_SSE));
+            lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
+                                              _mm_andnot_ps(obc_mask2_SSE3, raj_inv_SSE));
+            dlij_SSE0          = _mm_and_ps(one_SSE, obc_mask2_SSE0);
+            dlij_SSE1          = _mm_and_ps(one_SSE, obc_mask2_SSE1);
+            dlij_SSE2          = _mm_and_ps(one_SSE, obc_mask2_SSE2);
+            dlij_SSE3          = _mm_and_ps(one_SSE, obc_mask2_SSE3);
+
             uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
             uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
             uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
             uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
-            uij3_SSE0          = _mm_mul_ps(uij2_SSE0,uij_SSE0);
-            uij3_SSE1          = _mm_mul_ps(uij2_SSE1,uij_SSE1);
-            uij3_SSE2          = _mm_mul_ps(uij2_SSE2,uij_SSE2);
-            uij3_SSE3          = _mm_mul_ps(uij2_SSE3,uij_SSE3);
+            uij3_SSE0          = _mm_mul_ps(uij2_SSE0, uij_SSE0);
+            uij3_SSE1          = _mm_mul_ps(uij2_SSE1, uij_SSE1);
+            uij3_SSE2          = _mm_mul_ps(uij2_SSE2, uij_SSE2);
+            uij3_SSE3          = _mm_mul_ps(uij2_SSE3, uij_SSE3);
             lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
             lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
             lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
             lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
-            lij3_SSE0          = _mm_mul_ps(lij2_SSE0,lij_SSE0);
-            lij3_SSE1          = _mm_mul_ps(lij2_SSE1,lij_SSE1);
-            lij3_SSE2          = _mm_mul_ps(lij2_SSE2,lij_SSE2);
-            lij3_SSE3          = _mm_mul_ps(lij2_SSE3,lij_SSE3);
-            
-            diff2_SSE0         = _mm_sub_ps(uij2_SSE0,lij2_SSE0);
-            diff2_SSE1         = _mm_sub_ps(uij2_SSE1,lij2_SSE1);
-            diff2_SSE2         = _mm_sub_ps(uij2_SSE2,lij2_SSE2);
-            diff2_SSE3         = _mm_sub_ps(uij2_SSE3,lij2_SSE3);
+            lij3_SSE0          = _mm_mul_ps(lij2_SSE0, lij_SSE0);
+            lij3_SSE1          = _mm_mul_ps(lij2_SSE1, lij_SSE1);
+            lij3_SSE2          = _mm_mul_ps(lij2_SSE2, lij_SSE2);
+            lij3_SSE3          = _mm_mul_ps(lij2_SSE3, lij_SSE3);
+
+            diff2_SSE0         = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
+            diff2_SSE1         = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
+            diff2_SSE2         = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
+            diff2_SSE3         = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
             lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
             lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
             lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
             lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
-            sk2_rinv_SSE0      = _mm_mul_ps(sk2_ai_SSE0,rinv_SSE0);
-            sk2_rinv_SSE1      = _mm_mul_ps(sk2_ai_SSE1,rinv_SSE1);
-            sk2_rinv_SSE2      = _mm_mul_ps(sk2_ai_SSE2,rinv_SSE2);
-            sk2_rinv_SSE3      = _mm_mul_ps(sk2_ai_SSE3,rinv_SSE3);
-            prod_SSE0          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE0);
-            prod_SSE1          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE1);
-            prod_SSE2          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE2);
-            prod_SSE3          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE3);
-            
-            logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0,lij_inv_SSE0));
-            logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1,lij_inv_SSE1));
-            logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2,lij_inv_SSE2));
-            logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3,lij_inv_SSE3));
-            t1_SSE0            = _mm_sub_ps(lij_SSE0,uij_SSE0);
-            t1_SSE1            = _mm_sub_ps(lij_SSE1,uij_SSE1);
-            t1_SSE2            = _mm_sub_ps(lij_SSE2,uij_SSE2);
-            t1_SSE3            = _mm_sub_ps(lij_SSE3,uij_SSE3);
+            sk2_rinv_SSE0      = _mm_mul_ps(sk2_ai_SSE0, rinv_SSE0);
+            sk2_rinv_SSE1      = _mm_mul_ps(sk2_ai_SSE1, rinv_SSE1);
+            sk2_rinv_SSE2      = _mm_mul_ps(sk2_ai_SSE2, rinv_SSE2);
+            sk2_rinv_SSE3      = _mm_mul_ps(sk2_ai_SSE3, rinv_SSE3);
+            prod_SSE0          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
+            prod_SSE1          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
+            prod_SSE2          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
+            prod_SSE3          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
+
+            logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
+            logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
+            logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
+            logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
+            t1_SSE0            = _mm_sub_ps(lij_SSE0, uij_SSE0);
+            t1_SSE1            = _mm_sub_ps(lij_SSE1, uij_SSE1);
+            t1_SSE2            = _mm_sub_ps(lij_SSE2, uij_SSE2);
+            t1_SSE3            = _mm_sub_ps(lij_SSE3, uij_SSE3);
             t2_SSE0            = _mm_mul_ps(diff2_SSE0,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE0),
+                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
                                                        prod_SSE0));
             t2_SSE1            = _mm_mul_ps(diff2_SSE1,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE1),
+                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
                                                        prod_SSE1));
             t2_SSE2            = _mm_mul_ps(diff2_SSE2,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE2),
+                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
                                                        prod_SSE2));
             t2_SSE3            = _mm_mul_ps(diff2_SSE3,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE3),
+                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
                                                        prod_SSE3));
-            t3_SSE0            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE0,logterm_SSE0));
-            t3_SSE1            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE1,logterm_SSE1));
-            t3_SSE2            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE2,logterm_SSE2));
-            t3_SSE3            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE3,logterm_SSE3));
-            t1_SSE0            = _mm_add_ps(t1_SSE0,_mm_add_ps(t2_SSE0,t3_SSE0));
-            t1_SSE1            = _mm_add_ps(t1_SSE1,_mm_add_ps(t2_SSE1,t3_SSE1));
-            t1_SSE2            = _mm_add_ps(t1_SSE2,_mm_add_ps(t2_SSE2,t3_SSE2));
-            t1_SSE3            = _mm_add_ps(t1_SSE3,_mm_add_ps(t2_SSE3,t3_SSE3));
-            t4_SSE0            = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE0));
-            t4_SSE1            = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE1));
-            t4_SSE2            = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE2));
-            t4_SSE3            = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE3));
-            t4_SSE0            = _mm_and_ps(t4_SSE0,obc_mask3_SSE0);
-            t4_SSE1            = _mm_and_ps(t4_SSE1,obc_mask3_SSE1);
-            t4_SSE2            = _mm_and_ps(t4_SSE2,obc_mask3_SSE2);
-            t4_SSE3            = _mm_and_ps(t4_SSE3,obc_mask3_SSE3);
-            t1_SSE0            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE0,t4_SSE0));
-            t1_SSE1            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE1,t4_SSE1));
-            t1_SSE2            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE2,t4_SSE2));
-            t1_SSE3            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE3,t4_SSE3));
-            
+            t3_SSE0            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
+            t3_SSE1            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
+            t3_SSE2            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
+            t3_SSE3            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
+            t1_SSE0            = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
+            t1_SSE1            = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
+            t1_SSE2            = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
+            t1_SSE3            = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
+            t4_SSE0            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE0));
+            t4_SSE1            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE1));
+            t4_SSE2            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE2));
+            t4_SSE3            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE3));
+            t4_SSE0            = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
+            t4_SSE1            = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
+            t4_SSE2            = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
+            t4_SSE3            = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
+            t1_SSE0            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
+            t1_SSE1            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
+            t1_SSE2            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
+            t1_SSE3            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
+
             _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
-                                            gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0,obc_mask1_SSE0),
-                                                        _mm_and_ps(t1_SSE1,obc_mask1_SSE1),
-                                                        _mm_and_ps(t1_SSE2,obc_mask1_SSE2),
-                                                        _mm_and_ps(t1_SSE3,obc_mask1_SSE3))));
-            
-            t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE0),
-                                            _mm_mul_ps(prod_SSE0,lij3_SSE0));
-            t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE1),
-                                            _mm_mul_ps(prod_SSE1,lij3_SSE1));
-            t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE2),
-                                            _mm_mul_ps(prod_SSE2,lij3_SSE2));
-            t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE3),
-                                            _mm_mul_ps(prod_SSE3,lij3_SSE3));
+                                            gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0, obc_mask1_SSE0),
+                                                           _mm_and_ps(t1_SSE1, obc_mask1_SSE1),
+                                                           _mm_and_ps(t1_SSE2, obc_mask1_SSE2),
+                                                           _mm_and_ps(t1_SSE3, obc_mask1_SSE3))));
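+            /* gmx_mm_sum4_ps adds the four masked i-row contributions
+             * element-wise, so each of the four j atoms in work[j..j+3]
+             * receives the total from all four i atoms at once. */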
+
+            t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
+                                            _mm_mul_ps(prod_SSE0, lij3_SSE0));
+            t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
+                                            _mm_mul_ps(prod_SSE1, lij3_SSE1));
+            t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
+                                            _mm_mul_ps(prod_SSE2, lij3_SSE2));
+            t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
+                                            _mm_mul_ps(prod_SSE3, lij3_SSE3));
             t1_SSE0            = _mm_sub_ps(t1_SSE0,
                                             _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE0,rinv_SSE0),
-                                                                  _mm_mul_ps(lij3_SSE0,dr_SSE0))));
+                                                       _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
+                                                                  _mm_mul_ps(lij3_SSE0, dr_SSE0))));
             t1_SSE1            = _mm_sub_ps(t1_SSE1,
                                             _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE1,rinv_SSE1),
-                                                                  _mm_mul_ps(lij3_SSE1,dr_SSE1))));
+                                                       _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
+                                                                  _mm_mul_ps(lij3_SSE1, dr_SSE1))));
             t1_SSE2            = _mm_sub_ps(t1_SSE2,
                                             _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE2,rinv_SSE2),
-                                                                  _mm_mul_ps(lij3_SSE2,dr_SSE2))));
+                                                       _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
+                                                                  _mm_mul_ps(lij3_SSE2, dr_SSE2))));
             t1_SSE3            = _mm_sub_ps(t1_SSE3,
                                             _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE3,rinv_SSE3),
-                                                                  _mm_mul_ps(lij3_SSE3,dr_SSE3))));
+                                                       _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
+                                                                  _mm_mul_ps(lij3_SSE3, dr_SSE3))));
             t2_SSE0            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE0,rinv_SSE0),
-                                                       _mm_mul_ps(uij3_SSE0,dr_SSE0)));
+                                            _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
+                                                       _mm_mul_ps(uij3_SSE0, dr_SSE0)));
             t2_SSE1            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE1,rinv_SSE1),
-                                                       _mm_mul_ps(uij3_SSE1,dr_SSE1)));
+                                            _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
+                                                       _mm_mul_ps(uij3_SSE1, dr_SSE1)));
             t2_SSE2            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE2,rinv_SSE2),
-                                                       _mm_mul_ps(uij3_SSE2,dr_SSE2)));
+                                            _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
+                                                       _mm_mul_ps(uij3_SSE2, dr_SSE2)));
             t2_SSE3            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE3,rinv_SSE3),
-                                                       _mm_mul_ps(uij3_SSE3,dr_SSE3)));
+                                            _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
+                                                       _mm_mul_ps(uij3_SSE3, dr_SSE3)));
             t2_SSE0            = _mm_sub_ps(t2_SSE0,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE0),
-                                                       _mm_mul_ps(prod_SSE0,uij3_SSE0)));
+                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
+                                                       _mm_mul_ps(prod_SSE0, uij3_SSE0)));
             t2_SSE1            = _mm_sub_ps(t2_SSE1,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE1),
-                                                       _mm_mul_ps(prod_SSE1,uij3_SSE1)));
+                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
+                                                       _mm_mul_ps(prod_SSE1, uij3_SSE1)));
             t2_SSE2            = _mm_sub_ps(t2_SSE2,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE2),
-                                                       _mm_mul_ps(prod_SSE2,uij3_SSE2)));
+                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
+                                                       _mm_mul_ps(prod_SSE2, uij3_SSE2)));
             t2_SSE3            = _mm_sub_ps(t2_SSE3,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE3),
-                                                       _mm_mul_ps(prod_SSE3,uij3_SSE3)));
-            
-            t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE0),
-                                            _mm_mul_ps(rinv_SSE0,rinv_SSE0));
-            t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE1),
-                                            _mm_mul_ps(rinv_SSE1,rinv_SSE1));
-            t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE2),
-                                            _mm_mul_ps(rinv_SSE2,rinv_SSE2));
-            t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE3),
-                                            _mm_mul_ps(rinv_SSE3,rinv_SSE3));
-            
+                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
+                                                       _mm_mul_ps(prod_SSE3, uij3_SSE3)));
+
+            t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
+                                            _mm_mul_ps(rinv_SSE0, rinv_SSE0));
+            t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
+                                            _mm_mul_ps(rinv_SSE1, rinv_SSE1));
+            t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
+                                            _mm_mul_ps(rinv_SSE2, rinv_SSE2));
+            t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
+                                            _mm_mul_ps(rinv_SSE3, rinv_SSE3));
+
             t3_SSE0            = _mm_sub_ps(t3_SSE0,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE0,oneeighth_SSE),
+                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
                                                        _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE0,rinv_SSE0))));
+                                                                  _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
             t3_SSE1            = _mm_sub_ps(t3_SSE1,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE1,oneeighth_SSE),
+                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
                                                        _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE1,rinv_SSE1))));
+                                                                  _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
             t3_SSE2            = _mm_sub_ps(t3_SSE2,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE2,oneeighth_SSE),
+                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
                                                        _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE2,rinv_SSE2))));
+                                                                  _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
             t3_SSE3            = _mm_sub_ps(t3_SSE3,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE3,oneeighth_SSE),
+                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
                                                        _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE3,rinv_SSE3))));
-            
-            
+                                                                  _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
+
+
             t1_SSE0            = _mm_mul_ps(rinv_SSE0,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE0,t1_SSE0),
-                                                       _mm_add_ps(t2_SSE0,t3_SSE0)));
+                                            _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
+                                                       _mm_add_ps(t2_SSE0, t3_SSE0)));
             t1_SSE1            = _mm_mul_ps(rinv_SSE1,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE1,t1_SSE1),
-                                                       _mm_add_ps(t2_SSE1,t3_SSE1)));
+                                            _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
+                                                       _mm_add_ps(t2_SSE1, t3_SSE1)));
             t1_SSE2            = _mm_mul_ps(rinv_SSE2,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE2,t1_SSE2),
-                                                       _mm_add_ps(t2_SSE2,t3_SSE2)));
+                                            _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
+                                                       _mm_add_ps(t2_SSE2, t3_SSE2)));
             t1_SSE3            = _mm_mul_ps(rinv_SSE3,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE3,t1_SSE3),
-                                                       _mm_add_ps(t2_SSE3,t3_SSE3)));
+                                            _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
+                                                       _mm_add_ps(t2_SSE3, t3_SSE3)));
 
-            _mm_store_ps(dadx,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
+            _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
             dadx += 4;
-            _mm_store_ps(dadx,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
+            _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
             dadx += 4;
-            _mm_store_ps(dadx,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
+            _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
             dadx += 4;
-            _mm_store_ps(dadx,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
+            _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
             dadx += 4;
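+            /* The four vectors just stored are dI/dr for this i-row/j-block;
+             * a later chain-rule pass (presumably the matching
+             * genborn_allvsall_calc_chainrule kernel) re-reads dadx in the
+             * same order to assemble the Born forces. */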
-            
+
         }
-        
+
         /* Main part, no exclusions */
-        for(j=nj1; j<nj2; j+=UNROLLJ)
-        {                      
+        for (j = nj1; j < nj2; j += UNROLLJ)
+        {
             /* load j atom coordinates */
             jx_SSE            = _mm_load_ps(x_align+j);
             jy_SSE            = _mm_load_ps(y_align+j);
             jz_SSE            = _mm_load_ps(z_align+j);
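+            /* x_align/y_align/z_align appear to be 16-byte aligned, padded
+             * copies of the coordinates, so the aligned _mm_load_ps is safe
+             * even for the final partial block. */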
-            
+
             /* Calculate distance */
-            dx_SSE0            = _mm_sub_ps(ix_SSE0,jx_SSE);
-            dy_SSE0            = _mm_sub_ps(iy_SSE0,jy_SSE);
-            dz_SSE0            = _mm_sub_ps(iz_SSE0,jz_SSE);
-            dx_SSE1            = _mm_sub_ps(ix_SSE1,jx_SSE);
-            dy_SSE1            = _mm_sub_ps(iy_SSE1,jy_SSE);
-            dz_SSE1            = _mm_sub_ps(iz_SSE1,jz_SSE);
-            dx_SSE2            = _mm_sub_ps(ix_SSE2,jx_SSE);
-            dy_SSE2            = _mm_sub_ps(iy_SSE2,jy_SSE);
-            dz_SSE2            = _mm_sub_ps(iz_SSE2,jz_SSE);
-            dx_SSE3            = _mm_sub_ps(ix_SSE3,jx_SSE);
-            dy_SSE3            = _mm_sub_ps(iy_SSE3,jy_SSE);
-            dz_SSE3            = _mm_sub_ps(iz_SSE3,jz_SSE);
-            
+            dx_SSE0            = _mm_sub_ps(ix_SSE0, jx_SSE);
+            dy_SSE0            = _mm_sub_ps(iy_SSE0, jy_SSE);
+            dz_SSE0            = _mm_sub_ps(iz_SSE0, jz_SSE);
+            dx_SSE1            = _mm_sub_ps(ix_SSE1, jx_SSE);
+            dy_SSE1            = _mm_sub_ps(iy_SSE1, jy_SSE);
+            dz_SSE1            = _mm_sub_ps(iz_SSE1, jz_SSE);
+            dx_SSE2            = _mm_sub_ps(ix_SSE2, jx_SSE);
+            dy_SSE2            = _mm_sub_ps(iy_SSE2, jy_SSE);
+            dz_SSE2            = _mm_sub_ps(iz_SSE2, jz_SSE);
+            dx_SSE3            = _mm_sub_ps(ix_SSE3, jx_SSE);
+            dy_SSE3            = _mm_sub_ps(iy_SSE3, jy_SSE);
+            dz_SSE3            = _mm_sub_ps(iz_SSE3, jz_SSE);
+
             /* rsq = dx*dx+dy*dy+dz*dz */
-            rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
-            rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
-            rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
-            rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);
-            
+            rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
+            rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
+            rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
+            rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
+
             /* Calculate 1/r */
             rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
             rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
             rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
             rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
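+            /* gmx_mm_invsqrt_ps presumably wraps rsqrtps plus one
+             * Newton-Raphson refinement step, the usual single-precision
+             * trade-off between speed and accuracy. */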
-            
+
             /* Apply mask */
-            rinv_SSE0          = _mm_and_ps(rinv_SSE0,imask_SSE0);
-            rinv_SSE1          = _mm_and_ps(rinv_SSE1,imask_SSE1);
-            rinv_SSE2          = _mm_and_ps(rinv_SSE2,imask_SSE2);
-            rinv_SSE3          = _mm_and_ps(rinv_SSE3,imask_SSE3);
-            
-            dr_SSE0            = _mm_mul_ps(rsq_SSE0,rinv_SSE0);
-            dr_SSE1            = _mm_mul_ps(rsq_SSE1,rinv_SSE1);
-            dr_SSE2            = _mm_mul_ps(rsq_SSE2,rinv_SSE2);
-            dr_SSE3            = _mm_mul_ps(rsq_SSE3,rinv_SSE3);
-            
+            rinv_SSE0          = _mm_and_ps(rinv_SSE0, imask_SSE0);
+            rinv_SSE1          = _mm_and_ps(rinv_SSE1, imask_SSE1);
+            rinv_SSE2          = _mm_and_ps(rinv_SSE2, imask_SSE2);
+            rinv_SSE3          = _mm_and_ps(rinv_SSE3, imask_SSE3);
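+            /* imask zeroes rinv in the invalid lanes; since obc_mask1 is also
+             * and-ed with imask below, those lanes can never contribute to
+             * sum_ai or dadx. */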
+
+            dr_SSE0            = _mm_mul_ps(rsq_SSE0, rinv_SSE0);
+            dr_SSE1            = _mm_mul_ps(rsq_SSE1, rinv_SSE1);
+            dr_SSE2            = _mm_mul_ps(rsq_SSE2, rinv_SSE2);
+            dr_SSE3            = _mm_mul_ps(rsq_SSE3, rinv_SSE3);
+
             sk_aj_SSE          = _mm_load_ps(obc_param+j);
             raj_SSE            = _mm_load_ps(gb_radius+j);
 
             raj_inv_SSE        = gmx_mm_inv_ps(raj_SSE);
-            
+
             /* Evaluate influence of atom aj -> ai */
-            t1_SSE0            = _mm_add_ps(dr_SSE0,sk_aj_SSE);
-            t1_SSE1            = _mm_add_ps(dr_SSE1,sk_aj_SSE);
-            t1_SSE2            = _mm_add_ps(dr_SSE2,sk_aj_SSE);
-            t1_SSE3            = _mm_add_ps(dr_SSE3,sk_aj_SSE);
-            t2_SSE0            = _mm_sub_ps(dr_SSE0,sk_aj_SSE);
-            t2_SSE1            = _mm_sub_ps(dr_SSE1,sk_aj_SSE);
-            t2_SSE2            = _mm_sub_ps(dr_SSE2,sk_aj_SSE);
-            t2_SSE3            = _mm_sub_ps(dr_SSE3,sk_aj_SSE);
-            t3_SSE0            = _mm_sub_ps(sk_aj_SSE,dr_SSE0);
-            t3_SSE1            = _mm_sub_ps(sk_aj_SSE,dr_SSE1);
-            t3_SSE2            = _mm_sub_ps(sk_aj_SSE,dr_SSE2);
-            t3_SSE3            = _mm_sub_ps(sk_aj_SSE,dr_SSE3);
-            
+            t1_SSE0            = _mm_add_ps(dr_SSE0, sk_aj_SSE);
+            t1_SSE1            = _mm_add_ps(dr_SSE1, sk_aj_SSE);
+            t1_SSE2            = _mm_add_ps(dr_SSE2, sk_aj_SSE);
+            t1_SSE3            = _mm_add_ps(dr_SSE3, sk_aj_SSE);
+            t2_SSE0            = _mm_sub_ps(dr_SSE0, sk_aj_SSE);
+            t2_SSE1            = _mm_sub_ps(dr_SSE1, sk_aj_SSE);
+            t2_SSE2            = _mm_sub_ps(dr_SSE2, sk_aj_SSE);
+            t2_SSE3            = _mm_sub_ps(dr_SSE3, sk_aj_SSE);
+            t3_SSE0            = _mm_sub_ps(sk_aj_SSE, dr_SSE0);
+            t3_SSE1            = _mm_sub_ps(sk_aj_SSE, dr_SSE1);
+            t3_SSE2            = _mm_sub_ps(sk_aj_SSE, dr_SSE2);
+            t3_SSE3            = _mm_sub_ps(sk_aj_SSE, dr_SSE3);
+
             obc_mask1_SSE0     = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
             obc_mask1_SSE1     = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
             obc_mask1_SSE2     = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
@@ -2198,221 +2197,221 @@ genborn_allvsall_calc_hct_obc_radii_sse2_single(t_forcerec *           fr,
             obc_mask3_SSE1     = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
             obc_mask3_SSE2     = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
             obc_mask3_SSE3     = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
-            obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0,imask_SSE0);
-            obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1,imask_SSE1);
-            obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2,imask_SSE2);
-            obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3,imask_SSE3);
-            
+            obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0, imask_SSE0);
+            obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1, imask_SSE1);
+            obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2, imask_SSE2);
+            obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3, imask_SSE3);
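+            /* The three masks classify the pair geometry: obc_mask1
+             * (rai < r+sk) selects pairs where sphere j descreens atom i at
+             * all; obc_mask2 (presumably rai < r-sk, computed just above)
+             * selects atoms i lying fully outside sphere j, so the lower
+             * bound is 1/(r-sk) instead of 1/rai; obc_mask3 (rai < sk-r)
+             * flags atoms i engulfed by sphere j, which enables the t4
+             * correction term further down. */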
+
             uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
             uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
             uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
             uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
-            lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0,gmx_mm_inv_ps(t2_SSE0)),
-                                           _mm_andnot_ps(obc_mask2_SSE0,rai_inv_SSE0));
-            lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1,gmx_mm_inv_ps(t2_SSE1)),
-                                           _mm_andnot_ps(obc_mask2_SSE1,rai_inv_SSE1));
-            lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2,gmx_mm_inv_ps(t2_SSE2)),
-                                           _mm_andnot_ps(obc_mask2_SSE2,rai_inv_SSE2));
-            lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3,gmx_mm_inv_ps(t2_SSE3)),
-                                           _mm_andnot_ps(obc_mask2_SSE3,rai_inv_SSE3));
-            dlij_SSE0          = _mm_and_ps(one_SSE,obc_mask2_SSE0);
-            dlij_SSE1          = _mm_and_ps(one_SSE,obc_mask2_SSE1);
-            dlij_SSE2          = _mm_and_ps(one_SSE,obc_mask2_SSE2);
-            dlij_SSE3          = _mm_and_ps(one_SSE,obc_mask2_SSE3);
-            
+            lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
+                                              _mm_andnot_ps(obc_mask2_SSE0, rai_inv_SSE0));
+            lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
+                                              _mm_andnot_ps(obc_mask2_SSE1, rai_inv_SSE1));
+            lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
+                                              _mm_andnot_ps(obc_mask2_SSE2, rai_inv_SSE2));
+            lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
+                                              _mm_andnot_ps(obc_mask2_SSE3, rai_inv_SSE3));
+            dlij_SSE0          = _mm_and_ps(one_SSE, obc_mask2_SSE0);
+            dlij_SSE1          = _mm_and_ps(one_SSE, obc_mask2_SSE1);
+            dlij_SSE2          = _mm_and_ps(one_SSE, obc_mask2_SSE2);
+            dlij_SSE3          = _mm_and_ps(one_SSE, obc_mask2_SSE3);
+
             uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
             uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
             uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
             uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
-            uij3_SSE0          = _mm_mul_ps(uij2_SSE0,uij_SSE0);
-            uij3_SSE1          = _mm_mul_ps(uij2_SSE1,uij_SSE1);
-            uij3_SSE2          = _mm_mul_ps(uij2_SSE2,uij_SSE2);
-            uij3_SSE3          = _mm_mul_ps(uij2_SSE3,uij_SSE3);
+            uij3_SSE0          = _mm_mul_ps(uij2_SSE0, uij_SSE0);
+            uij3_SSE1          = _mm_mul_ps(uij2_SSE1, uij_SSE1);
+            uij3_SSE2          = _mm_mul_ps(uij2_SSE2, uij_SSE2);
+            uij3_SSE3          = _mm_mul_ps(uij2_SSE3, uij_SSE3);
             lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
             lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
             lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
             lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
-            lij3_SSE0          = _mm_mul_ps(lij2_SSE0,lij_SSE0);
-            lij3_SSE1          = _mm_mul_ps(lij2_SSE1,lij_SSE1);
-            lij3_SSE2          = _mm_mul_ps(lij2_SSE2,lij_SSE2);
-            lij3_SSE3          = _mm_mul_ps(lij2_SSE3,lij_SSE3);
-            
-            diff2_SSE0         = _mm_sub_ps(uij2_SSE0,lij2_SSE0);
-            diff2_SSE1         = _mm_sub_ps(uij2_SSE1,lij2_SSE1);
-            diff2_SSE2         = _mm_sub_ps(uij2_SSE2,lij2_SSE2);
-            diff2_SSE3         = _mm_sub_ps(uij2_SSE3,lij2_SSE3);
+            lij3_SSE0          = _mm_mul_ps(lij2_SSE0, lij_SSE0);
+            lij3_SSE1          = _mm_mul_ps(lij2_SSE1, lij_SSE1);
+            lij3_SSE2          = _mm_mul_ps(lij2_SSE2, lij_SSE2);
+            lij3_SSE3          = _mm_mul_ps(lij2_SSE3, lij_SSE3);
+
+            diff2_SSE0         = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
+            diff2_SSE1         = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
+            diff2_SSE2         = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
+            diff2_SSE3         = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
             lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
             lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
             lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
             lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
-            sk2_aj_SSE         = _mm_mul_ps(sk_aj_SSE,sk_aj_SSE);
-            sk2_rinv_SSE0      = _mm_mul_ps(sk2_aj_SSE,rinv_SSE0);
-            sk2_rinv_SSE1      = _mm_mul_ps(sk2_aj_SSE,rinv_SSE1);
-            sk2_rinv_SSE2      = _mm_mul_ps(sk2_aj_SSE,rinv_SSE2);
-            sk2_rinv_SSE3      = _mm_mul_ps(sk2_aj_SSE,rinv_SSE3);
-            prod_SSE0          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE0);
-            prod_SSE1          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE1);
-            prod_SSE2          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE2);
-            prod_SSE3          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE3);
-            
-            logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0,lij_inv_SSE0));
-            logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1,lij_inv_SSE1));
-            logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2,lij_inv_SSE2));
-            logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3,lij_inv_SSE3));
-            
-            t1_SSE0            = _mm_sub_ps(lij_SSE0,uij_SSE0);
-            t1_SSE1            = _mm_sub_ps(lij_SSE1,uij_SSE1);
-            t1_SSE2            = _mm_sub_ps(lij_SSE2,uij_SSE2);
-            t1_SSE3            = _mm_sub_ps(lij_SSE3,uij_SSE3);
+            sk2_aj_SSE         = _mm_mul_ps(sk_aj_SSE, sk_aj_SSE);
+            sk2_rinv_SSE0      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE0);
+            sk2_rinv_SSE1      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE1);
+            sk2_rinv_SSE2      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE2);
+            sk2_rinv_SSE3      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE3);
+            prod_SSE0          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
+            prod_SSE1          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
+            prod_SSE2          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
+            prod_SSE3          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
+
+            logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
+            logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
+            logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
+            logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
+
+            t1_SSE0            = _mm_sub_ps(lij_SSE0, uij_SSE0);
+            t1_SSE1            = _mm_sub_ps(lij_SSE1, uij_SSE1);
+            t1_SSE2            = _mm_sub_ps(lij_SSE2, uij_SSE2);
+            t1_SSE3            = _mm_sub_ps(lij_SSE3, uij_SSE3);
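+            /* t1..t4 assemble the pairwise HCT descreening integral; in
+             * scalar form the term accumulated below is
+             *   0.5*( (lij-uij) + (uij^2-lij^2)*(r/4 - sk^2/(4r))
+             *         + log(uij/lij)/(2r) + [2*(1/rai - lij) if engulfed] ),
+             * where lij and uij are the inverse lower/upper integration
+             * bounds computed above. */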
             t2_SSE0            = _mm_mul_ps(diff2_SSE0,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE0),
+                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
                                                        prod_SSE0));
             t2_SSE1            = _mm_mul_ps(diff2_SSE1,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE1),
+                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
                                                        prod_SSE1));
             t2_SSE2            = _mm_mul_ps(diff2_SSE2,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE2),
+                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
                                                        prod_SSE2));
             t2_SSE3            = _mm_mul_ps(diff2_SSE3,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE3),
+                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
                                                        prod_SSE3));
-            
-            t3_SSE0            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE0,logterm_SSE0));
-            t3_SSE1            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE1,logterm_SSE1));
-            t3_SSE2            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE2,logterm_SSE2));
-            t3_SSE3            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE3,logterm_SSE3));
-            t1_SSE0            = _mm_add_ps(t1_SSE0,_mm_add_ps(t2_SSE0,t3_SSE0));
-            t1_SSE1            = _mm_add_ps(t1_SSE1,_mm_add_ps(t2_SSE1,t3_SSE1));
-            t1_SSE2            = _mm_add_ps(t1_SSE2,_mm_add_ps(t2_SSE2,t3_SSE2));
-            t1_SSE3            = _mm_add_ps(t1_SSE3,_mm_add_ps(t2_SSE3,t3_SSE3));
-            t4_SSE0            = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE0,lij_SSE0));
-            t4_SSE1            = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE1,lij_SSE1));
-            t4_SSE2            = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE2,lij_SSE2));
-            t4_SSE3            = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE3,lij_SSE3));
-            t4_SSE0            = _mm_and_ps(t4_SSE0,obc_mask3_SSE0);
-            t4_SSE1            = _mm_and_ps(t4_SSE1,obc_mask3_SSE1);
-            t4_SSE2            = _mm_and_ps(t4_SSE2,obc_mask3_SSE2);
-            t4_SSE3            = _mm_and_ps(t4_SSE3,obc_mask3_SSE3);
-            t1_SSE0            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE0,t4_SSE0));
-            t1_SSE1            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE1,t4_SSE1));
-            t1_SSE2            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE2,t4_SSE2));
-            t1_SSE3            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE3,t4_SSE3));
-            
-            sum_ai_SSE0        = _mm_add_ps(sum_ai_SSE0,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
-            sum_ai_SSE1        = _mm_add_ps(sum_ai_SSE1,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
-            sum_ai_SSE2        = _mm_add_ps(sum_ai_SSE2,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
-            sum_ai_SSE3        = _mm_add_ps(sum_ai_SSE3,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
-            
-            t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE0),
-                                            _mm_mul_ps(prod_SSE0,lij3_SSE0));
-            t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE1),
-                                            _mm_mul_ps(prod_SSE1,lij3_SSE1));
-            t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE2),
-                                            _mm_mul_ps(prod_SSE2,lij3_SSE2));
-            t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE3),
-                                            _mm_mul_ps(prod_SSE3,lij3_SSE3));
+
+            t3_SSE0            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
+            t3_SSE1            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
+            t3_SSE2            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
+            t3_SSE3            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
+            t1_SSE0            = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
+            t1_SSE1            = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
+            t1_SSE2            = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
+            t1_SSE3            = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
+            t4_SSE0            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE0, lij_SSE0));
+            t4_SSE1            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE1, lij_SSE1));
+            t4_SSE2            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE2, lij_SSE2));
+            t4_SSE3            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE3, lij_SSE3));
+            t4_SSE0            = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
+            t4_SSE1            = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
+            t4_SSE2            = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
+            t4_SSE3            = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
+            t1_SSE0            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
+            t1_SSE1            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
+            t1_SSE2            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
+            t1_SSE3            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
+
+            sum_ai_SSE0        = _mm_add_ps(sum_ai_SSE0, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
+            sum_ai_SSE1        = _mm_add_ps(sum_ai_SSE1, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
+            sum_ai_SSE2        = _mm_add_ps(sum_ai_SSE2, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
+            sum_ai_SSE3        = _mm_add_ps(sum_ai_SSE3, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
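+            /* sum_ai accumulates sum_j I_ij for the four i atoms; the OBC
+             * radius of atom i is derived from this sum once all j blocks
+             * have been processed. */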
+
+            t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
+                                            _mm_mul_ps(prod_SSE0, lij3_SSE0));
+            t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
+                                            _mm_mul_ps(prod_SSE1, lij3_SSE1));
+            t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
+                                            _mm_mul_ps(prod_SSE2, lij3_SSE2));
+            t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
+                                            _mm_mul_ps(prod_SSE3, lij3_SSE3));
             t1_SSE0            = _mm_sub_ps(t1_SSE0,
                                             _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE0,rinv_SSE0),
-                                                                  _mm_mul_ps(lij3_SSE0,dr_SSE0))));
+                                                       _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
+                                                                  _mm_mul_ps(lij3_SSE0, dr_SSE0))));
             t1_SSE1            = _mm_sub_ps(t1_SSE1,
                                             _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE1,rinv_SSE1),
-                                                                  _mm_mul_ps(lij3_SSE1,dr_SSE1))));
+                                                       _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
+                                                                  _mm_mul_ps(lij3_SSE1, dr_SSE1))));
             t1_SSE2            = _mm_sub_ps(t1_SSE2,
                                             _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE2,rinv_SSE2),
-                                                                  _mm_mul_ps(lij3_SSE2,dr_SSE2))));
+                                                       _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
+                                                                  _mm_mul_ps(lij3_SSE2, dr_SSE2))));
             t1_SSE3            = _mm_sub_ps(t1_SSE3,
                                             _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE3,rinv_SSE3),
-                                                                  _mm_mul_ps(lij3_SSE3,dr_SSE3))));
-            
+                                                       _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
+                                                                  _mm_mul_ps(lij3_SSE3, dr_SSE3))));
+
             t2_SSE0            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE0,rinv_SSE0),
-                                                       _mm_mul_ps(uij3_SSE0,dr_SSE0)));
+                                            _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
+                                                       _mm_mul_ps(uij3_SSE0, dr_SSE0)));
             t2_SSE1            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE1,rinv_SSE1),
-                                                       _mm_mul_ps(uij3_SSE1,dr_SSE1)));
+                                            _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
+                                                       _mm_mul_ps(uij3_SSE1, dr_SSE1)));
             t2_SSE2            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE2,rinv_SSE2),
-                                                       _mm_mul_ps(uij3_SSE2,dr_SSE2)));
+                                            _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
+                                                       _mm_mul_ps(uij3_SSE2, dr_SSE2)));
             t2_SSE3            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE3,rinv_SSE3),
-                                                       _mm_mul_ps(uij3_SSE3,dr_SSE3)));
+                                            _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
+                                                       _mm_mul_ps(uij3_SSE3, dr_SSE3)));
             t2_SSE0            = _mm_sub_ps(t2_SSE0,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE0),
-                                                       _mm_mul_ps(prod_SSE0,uij3_SSE0)));
+                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
+                                                       _mm_mul_ps(prod_SSE0, uij3_SSE0)));
             t2_SSE1            = _mm_sub_ps(t2_SSE1,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE1),
-                                                       _mm_mul_ps(prod_SSE1,uij3_SSE1)));
+                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
+                                                       _mm_mul_ps(prod_SSE1, uij3_SSE1)));
             t2_SSE2            = _mm_sub_ps(t2_SSE2,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE2),
-                                                       _mm_mul_ps(prod_SSE2,uij3_SSE2)));
+                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
+                                                       _mm_mul_ps(prod_SSE2, uij3_SSE2)));
             t2_SSE3            = _mm_sub_ps(t2_SSE3,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE3),
-                                                       _mm_mul_ps(prod_SSE3,uij3_SSE3)));
-            t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE0),
-                                            _mm_mul_ps(rinv_SSE0,rinv_SSE0));
-            t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE1),
-                                            _mm_mul_ps(rinv_SSE1,rinv_SSE1));
-            t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE2),
-                                            _mm_mul_ps(rinv_SSE2,rinv_SSE2));
-            t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE3),
-                                            _mm_mul_ps(rinv_SSE3,rinv_SSE3));
+                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
+                                                       _mm_mul_ps(prod_SSE3, uij3_SSE3)));
+            t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
+                                            _mm_mul_ps(rinv_SSE0, rinv_SSE0));
+            t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
+                                            _mm_mul_ps(rinv_SSE1, rinv_SSE1));
+            t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
+                                            _mm_mul_ps(rinv_SSE2, rinv_SSE2));
+            t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
+                                            _mm_mul_ps(rinv_SSE3, rinv_SSE3));
             t3_SSE0            = _mm_sub_ps(t3_SSE0,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE0,oneeighth_SSE),
+                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
                                                        _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE0,rinv_SSE0))));
+                                                                  _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
             t3_SSE1            = _mm_sub_ps(t3_SSE1,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE1,oneeighth_SSE),
+                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
                                                        _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE1,rinv_SSE1))));
+                                                                  _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
             t3_SSE2            = _mm_sub_ps(t3_SSE2,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE2,oneeighth_SSE),
+                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
                                                        _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE2,rinv_SSE2))));
+                                                                  _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
             t3_SSE3            = _mm_sub_ps(t3_SSE3,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE3,oneeighth_SSE),
+                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
                                                        _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE3,rinv_SSE3))));
-            
+                                                                  _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
+
             t1_SSE0            = _mm_mul_ps(rinv_SSE0,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE0,t1_SSE0),
-                                                       _mm_add_ps(t2_SSE0,t3_SSE0)));
+                                            _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
+                                                       _mm_add_ps(t2_SSE0, t3_SSE0)));
             t1_SSE1            = _mm_mul_ps(rinv_SSE1,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE1,t1_SSE1),
-                                                       _mm_add_ps(t2_SSE1,t3_SSE1)));
+                                            _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
+                                                       _mm_add_ps(t2_SSE1, t3_SSE1)));
             t1_SSE2            = _mm_mul_ps(rinv_SSE2,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE2,t1_SSE2),
-                                                       _mm_add_ps(t2_SSE2,t3_SSE2)));
+                                            _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
+                                                       _mm_add_ps(t2_SSE2, t3_SSE2)));
             t1_SSE3            = _mm_mul_ps(rinv_SSE3,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE3,t1_SSE3),
-                                                       _mm_add_ps(t2_SSE3,t3_SSE3)));
-            
-            _mm_store_ps(dadx,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
+                                            _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
+                                                       _mm_add_ps(t2_SSE3, t3_SSE3)));
+
+            _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
             dadx += 4;
-            _mm_store_ps(dadx,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
+            _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
             dadx += 4;
-            _mm_store_ps(dadx,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
+            _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
             dadx += 4;
-            _mm_store_ps(dadx,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
+            _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
             dadx += 4;
-            
+
             /* Evaluate influence of atom ai -> aj */
-            t1_SSE0            = _mm_add_ps(dr_SSE0,sk_ai_SSE0);
-            t1_SSE1            = _mm_add_ps(dr_SSE1,sk_ai_SSE1);
-            t1_SSE2            = _mm_add_ps(dr_SSE2,sk_ai_SSE2);
-            t1_SSE3            = _mm_add_ps(dr_SSE3,sk_ai_SSE3);
-            t2_SSE0            = _mm_sub_ps(dr_SSE0,sk_ai_SSE0);
-            t2_SSE1            = _mm_sub_ps(dr_SSE1,sk_ai_SSE1);
-            t2_SSE2            = _mm_sub_ps(dr_SSE2,sk_ai_SSE2);
-            t2_SSE3            = _mm_sub_ps(dr_SSE3,sk_ai_SSE3);
-            t3_SSE0            = _mm_sub_ps(sk_ai_SSE0,dr_SSE0);
-            t3_SSE1            = _mm_sub_ps(sk_ai_SSE1,dr_SSE1);
-            t3_SSE2            = _mm_sub_ps(sk_ai_SSE2,dr_SSE2);
-            t3_SSE3            = _mm_sub_ps(sk_ai_SSE3,dr_SSE3);
-            
+            t1_SSE0            = _mm_add_ps(dr_SSE0, sk_ai_SSE0);
+            t1_SSE1            = _mm_add_ps(dr_SSE1, sk_ai_SSE1);
+            t1_SSE2            = _mm_add_ps(dr_SSE2, sk_ai_SSE2);
+            t1_SSE3            = _mm_add_ps(dr_SSE3, sk_ai_SSE3);
+            t2_SSE0            = _mm_sub_ps(dr_SSE0, sk_ai_SSE0);
+            t2_SSE1            = _mm_sub_ps(dr_SSE1, sk_ai_SSE1);
+            t2_SSE2            = _mm_sub_ps(dr_SSE2, sk_ai_SSE2);
+            t2_SSE3            = _mm_sub_ps(dr_SSE3, sk_ai_SSE3);
+            t3_SSE0            = _mm_sub_ps(sk_ai_SSE0, dr_SSE0);
+            t3_SSE1            = _mm_sub_ps(sk_ai_SSE1, dr_SSE1);
+            t3_SSE2            = _mm_sub_ps(sk_ai_SSE2, dr_SSE2);
+            t3_SSE3            = _mm_sub_ps(sk_ai_SSE3, dr_SSE3);
+
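+            /* Same integral with the roles swapped: sphere i (scale sk_ai)
+             * now descreens atom j (radius raj), and the result is scattered
+             * into work[j] below rather than accumulated in sum_ai. */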
             obc_mask1_SSE0     = _mm_cmplt_ps(raj_SSE, t1_SSE0);
             obc_mask1_SSE1     = _mm_cmplt_ps(raj_SSE, t1_SSE1);
             obc_mask1_SSE2     = _mm_cmplt_ps(raj_SSE, t1_SSE2);
@@ -2425,286 +2424,286 @@ genborn_allvsall_calc_hct_obc_radii_sse2_single(t_forcerec *           fr,
             obc_mask3_SSE1     = _mm_cmplt_ps(raj_SSE, t3_SSE1);
             obc_mask3_SSE2     = _mm_cmplt_ps(raj_SSE, t3_SSE2);
             obc_mask3_SSE3     = _mm_cmplt_ps(raj_SSE, t3_SSE3);
-            obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0,imask_SSE0);
-            obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1,imask_SSE1);
-            obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2,imask_SSE2);
-            obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3,imask_SSE3);
-            
+            obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0, imask_SSE0);
+            obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1, imask_SSE1);
+            obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2, imask_SSE2);
+            obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3, imask_SSE3);
+
             uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
             uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
             uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
             uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
-            lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0,gmx_mm_inv_ps(t2_SSE0)),
-                                           _mm_andnot_ps(obc_mask2_SSE0,raj_inv_SSE));
-            lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1,gmx_mm_inv_ps(t2_SSE1)),
-                                           _mm_andnot_ps(obc_mask2_SSE1,raj_inv_SSE));
-            lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2,gmx_mm_inv_ps(t2_SSE2)),
-                                           _mm_andnot_ps(obc_mask2_SSE2,raj_inv_SSE));
-            lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3,gmx_mm_inv_ps(t2_SSE3)),
-                                           _mm_andnot_ps(obc_mask2_SSE3,raj_inv_SSE));
-            dlij_SSE0          = _mm_and_ps(one_SSE,obc_mask2_SSE0);
-            dlij_SSE1          = _mm_and_ps(one_SSE,obc_mask2_SSE1);
-            dlij_SSE2          = _mm_and_ps(one_SSE,obc_mask2_SSE2);
-            dlij_SSE3          = _mm_and_ps(one_SSE,obc_mask2_SSE3);
-            
+            lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
+                                              _mm_andnot_ps(obc_mask2_SSE0, raj_inv_SSE));
+            lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
+                                              _mm_andnot_ps(obc_mask2_SSE1, raj_inv_SSE));
+            lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
+                                              _mm_andnot_ps(obc_mask2_SSE2, raj_inv_SSE));
+            lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
+                                              _mm_andnot_ps(obc_mask2_SSE3, raj_inv_SSE));
+            dlij_SSE0          = _mm_and_ps(one_SSE, obc_mask2_SSE0);
+            dlij_SSE1          = _mm_and_ps(one_SSE, obc_mask2_SSE1);
+            dlij_SSE2          = _mm_and_ps(one_SSE, obc_mask2_SSE2);
+            dlij_SSE3          = _mm_and_ps(one_SSE, obc_mask2_SSE3);
+
             uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
             uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
             uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
             uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
-            uij3_SSE0          = _mm_mul_ps(uij2_SSE0,uij_SSE0);
-            uij3_SSE1          = _mm_mul_ps(uij2_SSE1,uij_SSE1);
-            uij3_SSE2          = _mm_mul_ps(uij2_SSE2,uij_SSE2);
-            uij3_SSE3          = _mm_mul_ps(uij2_SSE3,uij_SSE3);
+            uij3_SSE0          = _mm_mul_ps(uij2_SSE0, uij_SSE0);
+            uij3_SSE1          = _mm_mul_ps(uij2_SSE1, uij_SSE1);
+            uij3_SSE2          = _mm_mul_ps(uij2_SSE2, uij_SSE2);
+            uij3_SSE3          = _mm_mul_ps(uij2_SSE3, uij_SSE3);
             lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
             lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
             lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
             lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
-            lij3_SSE0          = _mm_mul_ps(lij2_SSE0,lij_SSE0);
-            lij3_SSE1          = _mm_mul_ps(lij2_SSE1,lij_SSE1);
-            lij3_SSE2          = _mm_mul_ps(lij2_SSE2,lij_SSE2);
-            lij3_SSE3          = _mm_mul_ps(lij2_SSE3,lij_SSE3);
-            
-            diff2_SSE0         = _mm_sub_ps(uij2_SSE0,lij2_SSE0);
-            diff2_SSE1         = _mm_sub_ps(uij2_SSE1,lij2_SSE1);
-            diff2_SSE2         = _mm_sub_ps(uij2_SSE2,lij2_SSE2);
-            diff2_SSE3         = _mm_sub_ps(uij2_SSE3,lij2_SSE3);
+            lij3_SSE0          = _mm_mul_ps(lij2_SSE0, lij_SSE0);
+            lij3_SSE1          = _mm_mul_ps(lij2_SSE1, lij_SSE1);
+            lij3_SSE2          = _mm_mul_ps(lij2_SSE2, lij_SSE2);
+            lij3_SSE3          = _mm_mul_ps(lij2_SSE3, lij_SSE3);
+
+            diff2_SSE0         = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
+            diff2_SSE1         = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
+            diff2_SSE2         = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
+            diff2_SSE3         = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
             lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
             lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
             lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
             lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
-            sk2_rinv_SSE0      = _mm_mul_ps(sk2_ai_SSE0,rinv_SSE0);
-            sk2_rinv_SSE1      = _mm_mul_ps(sk2_ai_SSE1,rinv_SSE1);
-            sk2_rinv_SSE2      = _mm_mul_ps(sk2_ai_SSE2,rinv_SSE2);
-            sk2_rinv_SSE3      = _mm_mul_ps(sk2_ai_SSE3,rinv_SSE3);
-            prod_SSE0          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE0);
-            prod_SSE1          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE1);
-            prod_SSE2          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE2);
-            prod_SSE3          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE3);
-            
-            logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0,lij_inv_SSE0));
-            logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1,lij_inv_SSE1));
-            logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2,lij_inv_SSE2));
-            logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3,lij_inv_SSE3));
-            t1_SSE0            = _mm_sub_ps(lij_SSE0,uij_SSE0);
-            t1_SSE1            = _mm_sub_ps(lij_SSE1,uij_SSE1);
-            t1_SSE2            = _mm_sub_ps(lij_SSE2,uij_SSE2);
-            t1_SSE3            = _mm_sub_ps(lij_SSE3,uij_SSE3);
+            sk2_rinv_SSE0      = _mm_mul_ps(sk2_ai_SSE0, rinv_SSE0);
+            sk2_rinv_SSE1      = _mm_mul_ps(sk2_ai_SSE1, rinv_SSE1);
+            sk2_rinv_SSE2      = _mm_mul_ps(sk2_ai_SSE2, rinv_SSE2);
+            sk2_rinv_SSE3      = _mm_mul_ps(sk2_ai_SSE3, rinv_SSE3);
+            prod_SSE0          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
+            prod_SSE1          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
+            prod_SSE2          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
+            prod_SSE3          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
+
+            logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
+            logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
+            logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
+            logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
+            t1_SSE0            = _mm_sub_ps(lij_SSE0, uij_SSE0);
+            t1_SSE1            = _mm_sub_ps(lij_SSE1, uij_SSE1);
+            t1_SSE2            = _mm_sub_ps(lij_SSE2, uij_SSE2);
+            t1_SSE3            = _mm_sub_ps(lij_SSE3, uij_SSE3);
             t2_SSE0            = _mm_mul_ps(diff2_SSE0,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE0),
+                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
                                                        prod_SSE0));
             t2_SSE1            = _mm_mul_ps(diff2_SSE1,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE1),
+                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
                                                        prod_SSE1));
             t2_SSE2            = _mm_mul_ps(diff2_SSE2,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE2),
+                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
                                                        prod_SSE2));
             t2_SSE3            = _mm_mul_ps(diff2_SSE3,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE3),
+                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
                                                        prod_SSE3));
-            t3_SSE0            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE0,logterm_SSE0));
-            t3_SSE1            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE1,logterm_SSE1));
-            t3_SSE2            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE2,logterm_SSE2));
-            t3_SSE3            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE3,logterm_SSE3));
-            t1_SSE0            = _mm_add_ps(t1_SSE0,_mm_add_ps(t2_SSE0,t3_SSE0));
-            t1_SSE1            = _mm_add_ps(t1_SSE1,_mm_add_ps(t2_SSE1,t3_SSE1));
-            t1_SSE2            = _mm_add_ps(t1_SSE2,_mm_add_ps(t2_SSE2,t3_SSE2));
-            t1_SSE3            = _mm_add_ps(t1_SSE3,_mm_add_ps(t2_SSE3,t3_SSE3));
-            t4_SSE0            = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE0));
-            t4_SSE1            = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE1));
-            t4_SSE2            = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE2));
-            t4_SSE3            = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE3));
-            t4_SSE0            = _mm_and_ps(t4_SSE0,obc_mask3_SSE0);
-            t4_SSE1            = _mm_and_ps(t4_SSE1,obc_mask3_SSE1);
-            t4_SSE2            = _mm_and_ps(t4_SSE2,obc_mask3_SSE2);
-            t4_SSE3            = _mm_and_ps(t4_SSE3,obc_mask3_SSE3);
-            t1_SSE0            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE0,t4_SSE0));
-            t1_SSE1            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE1,t4_SSE1));
-            t1_SSE2            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE2,t4_SSE2));
-            t1_SSE3            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE3,t4_SSE3));
-            
+            t3_SSE0            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
+            t3_SSE1            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
+            t3_SSE2            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
+            t3_SSE3            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
+            t1_SSE0            = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
+            t1_SSE1            = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
+            t1_SSE2            = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
+            t1_SSE3            = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
+            t4_SSE0            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE0));
+            t4_SSE1            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE1));
+            t4_SSE2            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE2));
+            t4_SSE3            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE3));
+            t4_SSE0            = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
+            t4_SSE1            = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
+            t4_SSE2            = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
+            t4_SSE3            = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
+            t1_SSE0            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
+            t1_SSE1            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
+            t1_SSE2            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
+            t1_SSE3            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
+
             _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
-                                            gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0,obc_mask1_SSE0),
-                                                        _mm_and_ps(t1_SSE1,obc_mask1_SSE1),
-                                                        _mm_and_ps(t1_SSE2,obc_mask1_SSE2),
-                                                        _mm_and_ps(t1_SSE3,obc_mask1_SSE3))));
-            
-            t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE0),
-                                            _mm_mul_ps(prod_SSE0,lij3_SSE0));
-            t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE1),
-                                            _mm_mul_ps(prod_SSE1,lij3_SSE1));
-            t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE2),
-                                            _mm_mul_ps(prod_SSE2,lij3_SSE2));
-            t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE3),
-                                            _mm_mul_ps(prod_SSE3,lij3_SSE3));
+                                            gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0, obc_mask1_SSE0),
+                                                           _mm_and_ps(t1_SSE1, obc_mask1_SSE1),
+                                                           _mm_and_ps(t1_SSE2, obc_mask1_SSE2),
+                                                           _mm_and_ps(t1_SSE3, obc_mask1_SSE3))));
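+            /* gmx_mm_sum4_ps presumably adds its four arguments element-wise,
+             * so the contributions of all four i atoms to the same four j
+             * atoms are folded into the single aligned load/store of work+j
+             * above. */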
+
+            t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
+                                            _mm_mul_ps(prod_SSE0, lij3_SSE0));
+            t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
+                                            _mm_mul_ps(prod_SSE1, lij3_SSE1));
+            t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
+                                            _mm_mul_ps(prod_SSE2, lij3_SSE2));
+            t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
+                                            _mm_mul_ps(prod_SSE3, lij3_SSE3));
             t1_SSE0            = _mm_sub_ps(t1_SSE0,
                                             _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE0,rinv_SSE0),
-                                                                  _mm_mul_ps(lij3_SSE0,dr_SSE0))));
+                                                       _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
+                                                                  _mm_mul_ps(lij3_SSE0, dr_SSE0))));
             t1_SSE1            = _mm_sub_ps(t1_SSE1,
                                             _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE1,rinv_SSE1),
-                                                                  _mm_mul_ps(lij3_SSE1,dr_SSE1))));
+                                                       _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
+                                                                  _mm_mul_ps(lij3_SSE1, dr_SSE1))));
             t1_SSE2            = _mm_sub_ps(t1_SSE2,
                                             _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE2,rinv_SSE2),
-                                                                  _mm_mul_ps(lij3_SSE2,dr_SSE2))));
+                                                       _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
+                                                                  _mm_mul_ps(lij3_SSE2, dr_SSE2))));
             t1_SSE3            = _mm_sub_ps(t1_SSE3,
                                             _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE3,rinv_SSE3),
-                                                                  _mm_mul_ps(lij3_SSE3,dr_SSE3))));
+                                                       _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
+                                                                  _mm_mul_ps(lij3_SSE3, dr_SSE3))));
             t2_SSE0            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE0,rinv_SSE0),
-                                                       _mm_mul_ps(uij3_SSE0,dr_SSE0)));
+                                            _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
+                                                       _mm_mul_ps(uij3_SSE0, dr_SSE0)));
             t2_SSE1            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE1,rinv_SSE1),
-                                                       _mm_mul_ps(uij3_SSE1,dr_SSE1)));
+                                            _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
+                                                       _mm_mul_ps(uij3_SSE1, dr_SSE1)));
             t2_SSE2            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE2,rinv_SSE2),
-                                                       _mm_mul_ps(uij3_SSE2,dr_SSE2)));
+                                            _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
+                                                       _mm_mul_ps(uij3_SSE2, dr_SSE2)));
             t2_SSE3            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE3,rinv_SSE3),
-                                                       _mm_mul_ps(uij3_SSE3,dr_SSE3)));
+                                            _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
+                                                       _mm_mul_ps(uij3_SSE3, dr_SSE3)));
             t2_SSE0            = _mm_sub_ps(t2_SSE0,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE0),
-                                                       _mm_mul_ps(prod_SSE0,uij3_SSE0)));
+                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
+                                                       _mm_mul_ps(prod_SSE0, uij3_SSE0)));
             t2_SSE1            = _mm_sub_ps(t2_SSE1,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE1),
-                                                       _mm_mul_ps(prod_SSE1,uij3_SSE1)));
+                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
+                                                       _mm_mul_ps(prod_SSE1, uij3_SSE1)));
             t2_SSE2            = _mm_sub_ps(t2_SSE2,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE2),
-                                                       _mm_mul_ps(prod_SSE2,uij3_SSE2)));
+                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
+                                                       _mm_mul_ps(prod_SSE2, uij3_SSE2)));
             t2_SSE3            = _mm_sub_ps(t2_SSE3,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE3),
-                                                       _mm_mul_ps(prod_SSE3,uij3_SSE3)));
-            
-            t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE0),
-                                            _mm_mul_ps(rinv_SSE0,rinv_SSE0));
-            t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE1),
-                                            _mm_mul_ps(rinv_SSE1,rinv_SSE1));
-            t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE2),
-                                            _mm_mul_ps(rinv_SSE2,rinv_SSE2));
-            t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE3),
-                                            _mm_mul_ps(rinv_SSE3,rinv_SSE3));
-            
+                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
+                                                       _mm_mul_ps(prod_SSE3, uij3_SSE3)));
+
+            t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
+                                            _mm_mul_ps(rinv_SSE0, rinv_SSE0));
+            t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
+                                            _mm_mul_ps(rinv_SSE1, rinv_SSE1));
+            t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
+                                            _mm_mul_ps(rinv_SSE2, rinv_SSE2));
+            t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
+                                            _mm_mul_ps(rinv_SSE3, rinv_SSE3));
+
             t3_SSE0            = _mm_sub_ps(t3_SSE0,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE0,oneeighth_SSE),
+                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
                                                        _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE0,rinv_SSE0))));
+                                                                  _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
             t3_SSE1            = _mm_sub_ps(t3_SSE1,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE1,oneeighth_SSE),
+                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
                                                        _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE1,rinv_SSE1))));
+                                                                  _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
             t3_SSE2            = _mm_sub_ps(t3_SSE2,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE2,oneeighth_SSE),
+                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
                                                        _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE2,rinv_SSE2))));
+                                                                  _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
             t3_SSE3            = _mm_sub_ps(t3_SSE3,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE3,oneeighth_SSE),
+                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
                                                        _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE3,rinv_SSE3))));
-            
+                                                                  _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
+
             t1_SSE0            = _mm_mul_ps(rinv_SSE0,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE0,t1_SSE0),
-                                                       _mm_add_ps(t2_SSE0,t3_SSE0)));
+                                            _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
+                                                       _mm_add_ps(t2_SSE0, t3_SSE0)));
             t1_SSE1            = _mm_mul_ps(rinv_SSE1,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE1,t1_SSE1),
-                                                       _mm_add_ps(t2_SSE1,t3_SSE1)));
+                                            _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
+                                                       _mm_add_ps(t2_SSE1, t3_SSE1)));
             t1_SSE2            = _mm_mul_ps(rinv_SSE2,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE2,t1_SSE2),
-                                                       _mm_add_ps(t2_SSE2,t3_SSE2)));
+                                            _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
+                                                       _mm_add_ps(t2_SSE2, t3_SSE2)));
             t1_SSE3            = _mm_mul_ps(rinv_SSE3,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE3,t1_SSE3),
-                                                       _mm_add_ps(t2_SSE3,t3_SSE3)));
-            
-            _mm_store_ps(dadx,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
+                                            _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
+                                                       _mm_add_ps(t2_SSE3, t3_SSE3)));
+
+            _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
             dadx += 4;
-            _mm_store_ps(dadx,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
+            _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
             dadx += 4;
-            _mm_store_ps(dadx,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
+            _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
             dadx += 4;
-            _mm_store_ps(dadx,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
+            _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
             dadx += 4;
         }
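/*
 * A minimal scalar sketch of the pair term the SIMD loop above evaluates,
 * written out for one (i,j) pair.  Illustrative only: the function name and
 * signature are invented for this sketch and are not GROMACS API.  It assumes
 * the standard HCT/OBC descreening integral that the t1/t2/t3/t4 registers
 * implement: uij and lij are the inverse upper/lower integration limits, and
 * the term only contributes when the j sphere actually descreens i.
 */
#include <math.h>

static float hct_obc_pair_sketch(float r, float rai, float rai_inv, float sk,
                                 float *dadx_out)
{
    if (!(rai < r + sk))                          /* obc_mask1: no overlap */
    {
        *dadx_out = 0.0f;
        return 0.0f;
    }
    float rinv  = 1.0f/r;
    float uij   = 1.0f/(r + sk);                  /* upper limit           */
    int   in    = (rai < r - sk);                 /* obc_mask2             */
    float lij   = in ? 1.0f/(r - sk) : rai_inv;   /* lower limit           */
    float dlij  = in ? 1.0f : 0.0f;
    float uij2  = uij*uij,  uij3 = uij2*uij;
    float lij2  = lij*lij,  lij3 = lij2*lij;
    float diff2 = uij2 - lij2;
    float prod  = 0.25f*sk*sk*rinv;
    float logt  = logf(uij/lij);

    /* Born-sum contribution (accumulated into sum_ai in the SIMD code) */
    float t1  = lij - uij;
    float t2  = diff2*(0.25f*r - prod);
    float t3  = 0.5f*rinv*logt;
    float t4  = (rai < sk - r) ? 2.0f*(rai_inv - lij) : 0.0f; /* obc_mask3 */
    float sum = 0.5f*(t1 + t2 + t3 + t4);

    /* Derivative factor (the value streamed out through dadx) */
    float d1 = 0.5f*lij2 + prod*lij3 - 0.25f*(lij*rinv + lij3*r);
    float d2 = 0.25f*(uij*rinv + uij3*r) - (0.5f*uij2 + prod*uij3);
    float d3 = 0.25f*logt*rinv*rinv
             - 0.125f*diff2*(1.0f + sk*sk*rinv*rinv);
    *dadx_out = rinv*(dlij*d1 + d2 + d3);

    return sum;
}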
 
         /* Epilogue part, including exclusion mask */
-        for(j=nj2; j<nj3; j+=UNROLLJ)
-        {                      
+        for (j = nj2; j < nj3; j += UNROLLJ)
+        {
             jmask_SSE0 = _mm_load_ps((real *)emask0);
             jmask_SSE1 = _mm_load_ps((real *)emask1);
             jmask_SSE2 = _mm_load_ps((real *)emask2);
             jmask_SSE3 = _mm_load_ps((real *)emask3);
-            emask0 += UNROLLJ;
-            emask1 += UNROLLJ;
-            emask2 += UNROLLJ;
-            emask3 += UNROLLJ;
-            
+            emask0    += UNROLLJ;
+            emask1    += UNROLLJ;
+            emask2    += UNROLLJ;
+            emask3    += UNROLLJ;
+
             /* load j atom coordinates */
             jx_SSE            = _mm_load_ps(x_align+j);
             jy_SSE            = _mm_load_ps(y_align+j);
             jz_SSE            = _mm_load_ps(z_align+j);
-            
+
             /* Calculate distance */
-            dx_SSE0            = _mm_sub_ps(ix_SSE0,jx_SSE);
-            dy_SSE0            = _mm_sub_ps(iy_SSE0,jy_SSE);
-            dz_SSE0            = _mm_sub_ps(iz_SSE0,jz_SSE);
-            dx_SSE1            = _mm_sub_ps(ix_SSE1,jx_SSE);
-            dy_SSE1            = _mm_sub_ps(iy_SSE1,jy_SSE);
-            dz_SSE1            = _mm_sub_ps(iz_SSE1,jz_SSE);
-            dx_SSE2            = _mm_sub_ps(ix_SSE2,jx_SSE);
-            dy_SSE2            = _mm_sub_ps(iy_SSE2,jy_SSE);
-            dz_SSE2            = _mm_sub_ps(iz_SSE2,jz_SSE);
-            dx_SSE3            = _mm_sub_ps(ix_SSE3,jx_SSE);
-            dy_SSE3            = _mm_sub_ps(iy_SSE3,jy_SSE);
-            dz_SSE3            = _mm_sub_ps(iz_SSE3,jz_SSE);
-            
+            dx_SSE0            = _mm_sub_ps(ix_SSE0, jx_SSE);
+            dy_SSE0            = _mm_sub_ps(iy_SSE0, jy_SSE);
+            dz_SSE0            = _mm_sub_ps(iz_SSE0, jz_SSE);
+            dx_SSE1            = _mm_sub_ps(ix_SSE1, jx_SSE);
+            dy_SSE1            = _mm_sub_ps(iy_SSE1, jy_SSE);
+            dz_SSE1            = _mm_sub_ps(iz_SSE1, jz_SSE);
+            dx_SSE2            = _mm_sub_ps(ix_SSE2, jx_SSE);
+            dy_SSE2            = _mm_sub_ps(iy_SSE2, jy_SSE);
+            dz_SSE2            = _mm_sub_ps(iz_SSE2, jz_SSE);
+            dx_SSE3            = _mm_sub_ps(ix_SSE3, jx_SSE);
+            dy_SSE3            = _mm_sub_ps(iy_SSE3, jy_SSE);
+            dz_SSE3            = _mm_sub_ps(iz_SSE3, jz_SSE);
+
             /* rsq = dx*dx+dy*dy+dz*dz */
-            rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
-            rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
-            rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
-            rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);
-            
+            rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
+            rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
+            rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
+            rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
+
             /* Combine masks */
-            jmask_SSE0         = _mm_and_ps(jmask_SSE0,imask_SSE0);
-            jmask_SSE1         = _mm_and_ps(jmask_SSE1,imask_SSE1);
-            jmask_SSE2         = _mm_and_ps(jmask_SSE2,imask_SSE2);
-            jmask_SSE3         = _mm_and_ps(jmask_SSE3,imask_SSE3);
-            
+            jmask_SSE0         = _mm_and_ps(jmask_SSE0, imask_SSE0);
+            jmask_SSE1         = _mm_and_ps(jmask_SSE1, imask_SSE1);
+            jmask_SSE2         = _mm_and_ps(jmask_SSE2, imask_SSE2);
+            jmask_SSE3         = _mm_and_ps(jmask_SSE3, imask_SSE3);
+
             /* Calculate 1/r and 1/r2 */
             rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
             rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
             rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
             rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
-            
+
             /* Apply mask */
-            rinv_SSE0          = _mm_and_ps(rinv_SSE0,jmask_SSE0);
-            rinv_SSE1          = _mm_and_ps(rinv_SSE1,jmask_SSE1);
-            rinv_SSE2          = _mm_and_ps(rinv_SSE2,jmask_SSE2);
-            rinv_SSE3          = _mm_and_ps(rinv_SSE3,jmask_SSE3);
-            
-            dr_SSE0            = _mm_mul_ps(rsq_SSE0,rinv_SSE0);
-            dr_SSE1            = _mm_mul_ps(rsq_SSE1,rinv_SSE1);
-            dr_SSE2            = _mm_mul_ps(rsq_SSE2,rinv_SSE2);
-            dr_SSE3            = _mm_mul_ps(rsq_SSE3,rinv_SSE3);
-            
+            rinv_SSE0          = _mm_and_ps(rinv_SSE0, jmask_SSE0);
+            rinv_SSE1          = _mm_and_ps(rinv_SSE1, jmask_SSE1);
+            rinv_SSE2          = _mm_and_ps(rinv_SSE2, jmask_SSE2);
+            rinv_SSE3          = _mm_and_ps(rinv_SSE3, jmask_SSE3);
+
+            dr_SSE0            = _mm_mul_ps(rsq_SSE0, rinv_SSE0);
+            dr_SSE1            = _mm_mul_ps(rsq_SSE1, rinv_SSE1);
+            dr_SSE2            = _mm_mul_ps(rsq_SSE2, rinv_SSE2);
+            dr_SSE3            = _mm_mul_ps(rsq_SSE3, rinv_SSE3);
+
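/*
 * Side note on the idiom above, as a sketch (helper name invented for
 * illustration, not GROMACS API): rinv comes from a fast inverse square
 * root, so the distance dr is recovered as rsq*(1/r) instead of paying
 * for a separate sqrt per lane.
 */
#include <emmintrin.h>

static __m128 distance_from_rinv(__m128 rsq, __m128 rinv)
{
    return _mm_mul_ps(rsq, rinv);   /* rsq * (1/r) == r */
}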
             sk_aj_SSE          = _mm_load_ps(obc_param+j);
             raj_SSE            = _mm_load_ps(gb_radius+j);
 
             raj_inv_SSE        = gmx_mm_inv_ps(raj_SSE);
-            
+
             /* Evaluate influence of atom aj -> ai */
-            t1_SSE0            = _mm_add_ps(dr_SSE0,sk_aj_SSE);
-            t1_SSE1            = _mm_add_ps(dr_SSE1,sk_aj_SSE);
-            t1_SSE2            = _mm_add_ps(dr_SSE2,sk_aj_SSE);
-            t1_SSE3            = _mm_add_ps(dr_SSE3,sk_aj_SSE);
-            t2_SSE0            = _mm_sub_ps(dr_SSE0,sk_aj_SSE);
-            t2_SSE1            = _mm_sub_ps(dr_SSE1,sk_aj_SSE);
-            t2_SSE2            = _mm_sub_ps(dr_SSE2,sk_aj_SSE);
-            t2_SSE3            = _mm_sub_ps(dr_SSE3,sk_aj_SSE);
-            t3_SSE0            = _mm_sub_ps(sk_aj_SSE,dr_SSE0);
-            t3_SSE1            = _mm_sub_ps(sk_aj_SSE,dr_SSE1);
-            t3_SSE2            = _mm_sub_ps(sk_aj_SSE,dr_SSE2);
-            t3_SSE3            = _mm_sub_ps(sk_aj_SSE,dr_SSE3);
-            
+            t1_SSE0            = _mm_add_ps(dr_SSE0, sk_aj_SSE);
+            t1_SSE1            = _mm_add_ps(dr_SSE1, sk_aj_SSE);
+            t1_SSE2            = _mm_add_ps(dr_SSE2, sk_aj_SSE);
+            t1_SSE3            = _mm_add_ps(dr_SSE3, sk_aj_SSE);
+            t2_SSE0            = _mm_sub_ps(dr_SSE0, sk_aj_SSE);
+            t2_SSE1            = _mm_sub_ps(dr_SSE1, sk_aj_SSE);
+            t2_SSE2            = _mm_sub_ps(dr_SSE2, sk_aj_SSE);
+            t2_SSE3            = _mm_sub_ps(dr_SSE3, sk_aj_SSE);
+            t3_SSE0            = _mm_sub_ps(sk_aj_SSE, dr_SSE0);
+            t3_SSE1            = _mm_sub_ps(sk_aj_SSE, dr_SSE1);
+            t3_SSE2            = _mm_sub_ps(sk_aj_SSE, dr_SSE2);
+            t3_SSE3            = _mm_sub_ps(sk_aj_SSE, dr_SSE3);
+
             obc_mask1_SSE0     = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
             obc_mask1_SSE1     = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
             obc_mask1_SSE2     = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
@@ -2717,221 +2716,221 @@ genborn_allvsall_calc_hct_obc_radii_sse2_single(t_forcerec *           fr,
             obc_mask3_SSE1     = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
             obc_mask3_SSE2     = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
             obc_mask3_SSE3     = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
-            obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0,jmask_SSE0);
-            obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1,jmask_SSE1);
-            obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2,jmask_SSE2);
-            obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3,jmask_SSE3);
-            
+            obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0);
+            obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1);
+            obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2);
+            obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3);
+
             uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
             uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
             uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
             uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
-            lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0,gmx_mm_inv_ps(t2_SSE0)),
-                                           _mm_andnot_ps(obc_mask2_SSE0,rai_inv_SSE0));
-            lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1,gmx_mm_inv_ps(t2_SSE1)),
-                                           _mm_andnot_ps(obc_mask2_SSE1,rai_inv_SSE1));
-            lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2,gmx_mm_inv_ps(t2_SSE2)),
-                                           _mm_andnot_ps(obc_mask2_SSE2,rai_inv_SSE2));
-            lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3,gmx_mm_inv_ps(t2_SSE3)),
-                                           _mm_andnot_ps(obc_mask2_SSE3,rai_inv_SSE3));
-            dlij_SSE0          = _mm_and_ps(one_SSE,obc_mask2_SSE0);
-            dlij_SSE1          = _mm_and_ps(one_SSE,obc_mask2_SSE1);
-            dlij_SSE2          = _mm_and_ps(one_SSE,obc_mask2_SSE2);
-            dlij_SSE3          = _mm_and_ps(one_SSE,obc_mask2_SSE3);
-            
+            lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
+                                              _mm_andnot_ps(obc_mask2_SSE0, rai_inv_SSE0));
+            lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
+                                              _mm_andnot_ps(obc_mask2_SSE1, rai_inv_SSE1));
+            lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
+                                              _mm_andnot_ps(obc_mask2_SSE2, rai_inv_SSE2));
+            lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
+                                              _mm_andnot_ps(obc_mask2_SSE3, rai_inv_SSE3));
+            dlij_SSE0          = _mm_and_ps(one_SSE, obc_mask2_SSE0);
+            dlij_SSE1          = _mm_and_ps(one_SSE, obc_mask2_SSE1);
+            dlij_SSE2          = _mm_and_ps(one_SSE, obc_mask2_SSE2);
+            dlij_SSE3          = _mm_and_ps(one_SSE, obc_mask2_SSE3);
+
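/*
 * Sketch of the SSE2 select idiom used for lij above (helper invented for
 * illustration, not GROMACS API): obc_mask2 is an all-ones or all-zeros
 * lane mask from _mm_cmplt_ps, so OR-ing the two AND results gives a
 * branch-free "mask ? a : b" per lane.  dlij reuses the same mask AND'ed
 * with 1.0f, yielding derivative weight 1 where the 1/(dr-sk) branch was
 * taken and 0 where the 1/rai fallback applies.
 */
#include <emmintrin.h>

static __m128 sse2_blend(__m128 mask, __m128 a, __m128 b)
{
    return _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b));
}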
             uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
             uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
             uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
             uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
-            uij3_SSE0          = _mm_mul_ps(uij2_SSE0,uij_SSE0);
-            uij3_SSE1          = _mm_mul_ps(uij2_SSE1,uij_SSE1);
-            uij3_SSE2          = _mm_mul_ps(uij2_SSE2,uij_SSE2);
-            uij3_SSE3          = _mm_mul_ps(uij2_SSE3,uij_SSE3);
+            uij3_SSE0          = _mm_mul_ps(uij2_SSE0, uij_SSE0);
+            uij3_SSE1          = _mm_mul_ps(uij2_SSE1, uij_SSE1);
+            uij3_SSE2          = _mm_mul_ps(uij2_SSE2, uij_SSE2);
+            uij3_SSE3          = _mm_mul_ps(uij2_SSE3, uij_SSE3);
             lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
             lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
             lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
             lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
-            lij3_SSE0          = _mm_mul_ps(lij2_SSE0,lij_SSE0);
-            lij3_SSE1          = _mm_mul_ps(lij2_SSE1,lij_SSE1);
-            lij3_SSE2          = _mm_mul_ps(lij2_SSE2,lij_SSE2);
-            lij3_SSE3          = _mm_mul_ps(lij2_SSE3,lij_SSE3);
-            
-            diff2_SSE0         = _mm_sub_ps(uij2_SSE0,lij2_SSE0);
-            diff2_SSE1         = _mm_sub_ps(uij2_SSE1,lij2_SSE1);
-            diff2_SSE2         = _mm_sub_ps(uij2_SSE2,lij2_SSE2);
-            diff2_SSE3         = _mm_sub_ps(uij2_SSE3,lij2_SSE3);
+            lij3_SSE0          = _mm_mul_ps(lij2_SSE0, lij_SSE0);
+            lij3_SSE1          = _mm_mul_ps(lij2_SSE1, lij_SSE1);
+            lij3_SSE2          = _mm_mul_ps(lij2_SSE2, lij_SSE2);
+            lij3_SSE3          = _mm_mul_ps(lij2_SSE3, lij_SSE3);
+
+            diff2_SSE0         = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
+            diff2_SSE1         = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
+            diff2_SSE2         = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
+            diff2_SSE3         = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
             lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
             lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
             lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
             lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
-            sk2_aj_SSE         = _mm_mul_ps(sk_aj_SSE,sk_aj_SSE);
-            sk2_rinv_SSE0      = _mm_mul_ps(sk2_aj_SSE,rinv_SSE0);
-            sk2_rinv_SSE1      = _mm_mul_ps(sk2_aj_SSE,rinv_SSE1);
-            sk2_rinv_SSE2      = _mm_mul_ps(sk2_aj_SSE,rinv_SSE2);
-            sk2_rinv_SSE3      = _mm_mul_ps(sk2_aj_SSE,rinv_SSE3);
-            prod_SSE0          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE0);
-            prod_SSE1          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE1);
-            prod_SSE2          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE2);
-            prod_SSE3          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE3);
-            
-            logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0,lij_inv_SSE0));
-            logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1,lij_inv_SSE1));
-            logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2,lij_inv_SSE2));
-            logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3,lij_inv_SSE3));
-            
-            t1_SSE0            = _mm_sub_ps(lij_SSE0,uij_SSE0);
-            t1_SSE1            = _mm_sub_ps(lij_SSE1,uij_SSE1);
-            t1_SSE2            = _mm_sub_ps(lij_SSE2,uij_SSE2);
-            t1_SSE3            = _mm_sub_ps(lij_SSE3,uij_SSE3);
+            sk2_aj_SSE         = _mm_mul_ps(sk_aj_SSE, sk_aj_SSE);
+            sk2_rinv_SSE0      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE0);
+            sk2_rinv_SSE1      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE1);
+            sk2_rinv_SSE2      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE2);
+            sk2_rinv_SSE3      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE3);
+            prod_SSE0          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
+            prod_SSE1          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
+            prod_SSE2          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
+            prod_SSE3          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
+
+            logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
+            logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
+            logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
+            logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
+
+            t1_SSE0            = _mm_sub_ps(lij_SSE0, uij_SSE0);
+            t1_SSE1            = _mm_sub_ps(lij_SSE1, uij_SSE1);
+            t1_SSE2            = _mm_sub_ps(lij_SSE2, uij_SSE2);
+            t1_SSE3            = _mm_sub_ps(lij_SSE3, uij_SSE3);
             t2_SSE0            = _mm_mul_ps(diff2_SSE0,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE0),
+                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
                                                        prod_SSE0));
             t2_SSE1            = _mm_mul_ps(diff2_SSE1,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE1),
+                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
                                                        prod_SSE1));
             t2_SSE2            = _mm_mul_ps(diff2_SSE2,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE2),
+                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
                                                        prod_SSE2));
             t2_SSE3            = _mm_mul_ps(diff2_SSE3,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE3),
+                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
                                                        prod_SSE3));
-            
-            t3_SSE0            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE0,logterm_SSE0));
-            t3_SSE1            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE1,logterm_SSE1));
-            t3_SSE2            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE2,logterm_SSE2));
-            t3_SSE3            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE3,logterm_SSE3));
-            t1_SSE0            = _mm_add_ps(t1_SSE0,_mm_add_ps(t2_SSE0,t3_SSE0));
-            t1_SSE1            = _mm_add_ps(t1_SSE1,_mm_add_ps(t2_SSE1,t3_SSE1));
-            t1_SSE2            = _mm_add_ps(t1_SSE2,_mm_add_ps(t2_SSE2,t3_SSE2));
-            t1_SSE3            = _mm_add_ps(t1_SSE3,_mm_add_ps(t2_SSE3,t3_SSE3));
-            t4_SSE0            = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE0,lij_SSE0));
-            t4_SSE1            = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE1,lij_SSE1));
-            t4_SSE2            = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE2,lij_SSE2));
-            t4_SSE3            = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE3,lij_SSE3));
-            t4_SSE0            = _mm_and_ps(t4_SSE0,obc_mask3_SSE0);
-            t4_SSE1            = _mm_and_ps(t4_SSE1,obc_mask3_SSE1);
-            t4_SSE2            = _mm_and_ps(t4_SSE2,obc_mask3_SSE2);
-            t4_SSE3            = _mm_and_ps(t4_SSE3,obc_mask3_SSE3);
-            t1_SSE0            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE0,t4_SSE0));
-            t1_SSE1            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE1,t4_SSE1));
-            t1_SSE2            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE2,t4_SSE2));
-            t1_SSE3            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE3,t4_SSE3));
-            
-            sum_ai_SSE0        = _mm_add_ps(sum_ai_SSE0,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
-            sum_ai_SSE1        = _mm_add_ps(sum_ai_SSE1,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
-            sum_ai_SSE2        = _mm_add_ps(sum_ai_SSE2,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
-            sum_ai_SSE3        = _mm_add_ps(sum_ai_SSE3,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
-            
-            t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE0),
-                                            _mm_mul_ps(prod_SSE0,lij3_SSE0));
-            t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE1),
-                                            _mm_mul_ps(prod_SSE1,lij3_SSE1));
-            t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE2),
-                                            _mm_mul_ps(prod_SSE2,lij3_SSE2));
-            t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE3),
-                                            _mm_mul_ps(prod_SSE3,lij3_SSE3));
+
+            t3_SSE0            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
+            t3_SSE1            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
+            t3_SSE2            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
+            t3_SSE3            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
+            t1_SSE0            = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
+            t1_SSE1            = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
+            t1_SSE2            = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
+            t1_SSE3            = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
+            t4_SSE0            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE0, lij_SSE0));
+            t4_SSE1            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE1, lij_SSE1));
+            t4_SSE2            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE2, lij_SSE2));
+            t4_SSE3            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE3, lij_SSE3));
+            t4_SSE0            = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
+            t4_SSE1            = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
+            t4_SSE2            = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
+            t4_SSE3            = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
+            t1_SSE0            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
+            t1_SSE1            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
+            t1_SSE2            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
+            t1_SSE3            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
+
+            sum_ai_SSE0        = _mm_add_ps(sum_ai_SSE0, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
+            sum_ai_SSE1        = _mm_add_ps(sum_ai_SSE1, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
+            sum_ai_SSE2        = _mm_add_ps(sum_ai_SSE2, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
+            sum_ai_SSE3        = _mm_add_ps(sum_ai_SSE3, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
+
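/*
 * Sketch of the masked accumulation above (helper invented for
 * illustration, not GROMACS API): AND-ing the term with obc_mask1 zeroes
 * every lane where rai >= dr + sk, so excluded or non-descreening pairs
 * add exactly 0.0f to the running Born sum.
 */
#include <emmintrin.h>

static __m128 add_masked(__m128 acc, __m128 term, __m128 mask)
{
    return _mm_add_ps(acc, _mm_and_ps(term, mask));
}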
+            t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
+                                            _mm_mul_ps(prod_SSE0, lij3_SSE0));
+            t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
+                                            _mm_mul_ps(prod_SSE1, lij3_SSE1));
+            t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
+                                            _mm_mul_ps(prod_SSE2, lij3_SSE2));
+            t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
+                                            _mm_mul_ps(prod_SSE3, lij3_SSE3));
             t1_SSE0            = _mm_sub_ps(t1_SSE0,
                                             _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE0,rinv_SSE0),
-                                                                  _mm_mul_ps(lij3_SSE0,dr_SSE0))));
+                                                       _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
+                                                                  _mm_mul_ps(lij3_SSE0, dr_SSE0))));
             t1_SSE1            = _mm_sub_ps(t1_SSE1,
                                             _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE1,rinv_SSE1),
-                                                                  _mm_mul_ps(lij3_SSE1,dr_SSE1))));
+                                                       _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
+                                                                  _mm_mul_ps(lij3_SSE1, dr_SSE1))));
             t1_SSE2            = _mm_sub_ps(t1_SSE2,
                                             _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE2,rinv_SSE2),
-                                                                  _mm_mul_ps(lij3_SSE2,dr_SSE2))));
+                                                       _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
+                                                                  _mm_mul_ps(lij3_SSE2, dr_SSE2))));
             t1_SSE3            = _mm_sub_ps(t1_SSE3,
                                             _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE3,rinv_SSE3),
-                                                                  _mm_mul_ps(lij3_SSE3,dr_SSE3))));
-            
+                                                       _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
+                                                                  _mm_mul_ps(lij3_SSE3, dr_SSE3))));
+
             t2_SSE0            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE0,rinv_SSE0),
-                                                       _mm_mul_ps(uij3_SSE0,dr_SSE0)));
+                                            _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
+                                                       _mm_mul_ps(uij3_SSE0, dr_SSE0)));
             t2_SSE1            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE1,rinv_SSE1),
-                                                       _mm_mul_ps(uij3_SSE1,dr_SSE1)));
+                                            _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
+                                                       _mm_mul_ps(uij3_SSE1, dr_SSE1)));
             t2_SSE2            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE2,rinv_SSE2),
-                                                       _mm_mul_ps(uij3_SSE2,dr_SSE2)));
+                                            _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
+                                                       _mm_mul_ps(uij3_SSE2, dr_SSE2)));
             t2_SSE3            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE3,rinv_SSE3),
-                                                       _mm_mul_ps(uij3_SSE3,dr_SSE3)));
+                                            _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
+                                                       _mm_mul_ps(uij3_SSE3, dr_SSE3)));
             t2_SSE0            = _mm_sub_ps(t2_SSE0,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE0),
-                                                       _mm_mul_ps(prod_SSE0,uij3_SSE0)));
+                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
+                                                       _mm_mul_ps(prod_SSE0, uij3_SSE0)));
             t2_SSE1            = _mm_sub_ps(t2_SSE1,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE1),
-                                                       _mm_mul_ps(prod_SSE1,uij3_SSE1)));
+                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
+                                                       _mm_mul_ps(prod_SSE1, uij3_SSE1)));
             t2_SSE2            = _mm_sub_ps(t2_SSE2,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE2),
-                                                       _mm_mul_ps(prod_SSE2,uij3_SSE2)));
+                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
+                                                       _mm_mul_ps(prod_SSE2, uij3_SSE2)));
             t2_SSE3            = _mm_sub_ps(t2_SSE3,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE3),
-                                                       _mm_mul_ps(prod_SSE3,uij3_SSE3)));
-            t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE0),
-                                            _mm_mul_ps(rinv_SSE0,rinv_SSE0));
-            t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE1),
-                                            _mm_mul_ps(rinv_SSE1,rinv_SSE1));
-            t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE2),
-                                            _mm_mul_ps(rinv_SSE2,rinv_SSE2));
-            t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE3),
-                                            _mm_mul_ps(rinv_SSE3,rinv_SSE3));
+                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
+                                                       _mm_mul_ps(prod_SSE3, uij3_SSE3)));
+            t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
+                                            _mm_mul_ps(rinv_SSE0, rinv_SSE0));
+            t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
+                                            _mm_mul_ps(rinv_SSE1, rinv_SSE1));
+            t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
+                                            _mm_mul_ps(rinv_SSE2, rinv_SSE2));
+            t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
+                                            _mm_mul_ps(rinv_SSE3, rinv_SSE3));
             t3_SSE0            = _mm_sub_ps(t3_SSE0,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE0,oneeighth_SSE),
+                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
                                                        _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE0,rinv_SSE0))));
+                                                                  _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
             t3_SSE1            = _mm_sub_ps(t3_SSE1,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE1,oneeighth_SSE),
+                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
                                                        _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE1,rinv_SSE1))));
+                                                                  _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
             t3_SSE2            = _mm_sub_ps(t3_SSE2,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE2,oneeighth_SSE),
+                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
                                                        _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE2,rinv_SSE2))));
+                                                                  _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
             t3_SSE3            = _mm_sub_ps(t3_SSE3,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE3,oneeighth_SSE),
+                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
                                                        _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE3,rinv_SSE3))));
-            
+                                                                  _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
+
             t1_SSE0            = _mm_mul_ps(rinv_SSE0,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE0,t1_SSE0),
-                                                       _mm_add_ps(t2_SSE0,t3_SSE0)));
+                                            _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
+                                                       _mm_add_ps(t2_SSE0, t3_SSE0)));
             t1_SSE1            = _mm_mul_ps(rinv_SSE1,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE1,t1_SSE1),
-                                                       _mm_add_ps(t2_SSE1,t3_SSE1)));
+                                            _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
+                                                       _mm_add_ps(t2_SSE1, t3_SSE1)));
             t1_SSE2            = _mm_mul_ps(rinv_SSE2,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE2,t1_SSE2),
-                                                       _mm_add_ps(t2_SSE2,t3_SSE2)));
+                                            _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
+                                                       _mm_add_ps(t2_SSE2, t3_SSE2)));
             t1_SSE3            = _mm_mul_ps(rinv_SSE3,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE3,t1_SSE3),
-                                                       _mm_add_ps(t2_SSE3,t3_SSE3)));
-            
-            _mm_store_ps(dadx,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
+                                            _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
+                                                       _mm_add_ps(t2_SSE3, t3_SSE3)));
+
+            _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
             dadx += 4;
-            _mm_store_ps(dadx,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
+            _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
             dadx += 4;
-            _mm_store_ps(dadx,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
+            _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
             dadx += 4;
-            _mm_store_ps(dadx,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
+            _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
             dadx += 4;
-            
+
             /* Evaluate influence of atom ai -> aj */
-            t1_SSE0            = _mm_add_ps(dr_SSE0,sk_ai_SSE0);
-            t1_SSE1            = _mm_add_ps(dr_SSE1,sk_ai_SSE1);
-            t1_SSE2            = _mm_add_ps(dr_SSE2,sk_ai_SSE2);
-            t1_SSE3            = _mm_add_ps(dr_SSE3,sk_ai_SSE3);
-            t2_SSE0            = _mm_sub_ps(dr_SSE0,sk_ai_SSE0);
-            t2_SSE1            = _mm_sub_ps(dr_SSE1,sk_ai_SSE1);
-            t2_SSE2            = _mm_sub_ps(dr_SSE2,sk_ai_SSE2);
-            t2_SSE3            = _mm_sub_ps(dr_SSE3,sk_ai_SSE3);
-            t3_SSE0            = _mm_sub_ps(sk_ai_SSE0,dr_SSE0);
-            t3_SSE1            = _mm_sub_ps(sk_ai_SSE1,dr_SSE1);
-            t3_SSE2            = _mm_sub_ps(sk_ai_SSE2,dr_SSE2);
-            t3_SSE3            = _mm_sub_ps(sk_ai_SSE3,dr_SSE3);
-            
+            t1_SSE0            = _mm_add_ps(dr_SSE0, sk_ai_SSE0);
+            t1_SSE1            = _mm_add_ps(dr_SSE1, sk_ai_SSE1);
+            t1_SSE2            = _mm_add_ps(dr_SSE2, sk_ai_SSE2);
+            t1_SSE3            = _mm_add_ps(dr_SSE3, sk_ai_SSE3);
+            t2_SSE0            = _mm_sub_ps(dr_SSE0, sk_ai_SSE0);
+            t2_SSE1            = _mm_sub_ps(dr_SSE1, sk_ai_SSE1);
+            t2_SSE2            = _mm_sub_ps(dr_SSE2, sk_ai_SSE2);
+            t2_SSE3            = _mm_sub_ps(dr_SSE3, sk_ai_SSE3);
+            t3_SSE0            = _mm_sub_ps(sk_ai_SSE0, dr_SSE0);
+            t3_SSE1            = _mm_sub_ps(sk_ai_SSE1, dr_SSE1);
+            t3_SSE2            = _mm_sub_ps(sk_ai_SSE2, dr_SSE2);
+            t3_SSE3            = _mm_sub_ps(sk_ai_SSE3, dr_SSE3);
+
             obc_mask1_SSE0     = _mm_cmplt_ps(raj_SSE, t1_SSE0);
             obc_mask1_SSE1     = _mm_cmplt_ps(raj_SSE, t1_SSE1);
             obc_mask1_SSE2     = _mm_cmplt_ps(raj_SSE, t1_SSE2);
@@ -2944,276 +2943,271 @@ genborn_allvsall_calc_hct_obc_radii_sse2_single(t_forcerec *           fr,
             obc_mask3_SSE1     = _mm_cmplt_ps(raj_SSE, t3_SSE1);
             obc_mask3_SSE2     = _mm_cmplt_ps(raj_SSE, t3_SSE2);
             obc_mask3_SSE3     = _mm_cmplt_ps(raj_SSE, t3_SSE3);
-            obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0,jmask_SSE0);
-            obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1,jmask_SSE1);
-            obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2,jmask_SSE2);
-            obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3,jmask_SSE3);
-            
+            obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0);
+            obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1);
+            obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2);
+            obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3);
+
             uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
             uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
             uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
             uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
-            lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0,gmx_mm_inv_ps(t2_SSE0)),
-                                           _mm_andnot_ps(obc_mask2_SSE0,raj_inv_SSE));
-            lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1,gmx_mm_inv_ps(t2_SSE1)),
-                                           _mm_andnot_ps(obc_mask2_SSE1,raj_inv_SSE));
-            lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2,gmx_mm_inv_ps(t2_SSE2)),
-                                           _mm_andnot_ps(obc_mask2_SSE2,raj_inv_SSE));
-            lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3,gmx_mm_inv_ps(t2_SSE3)),
-                                           _mm_andnot_ps(obc_mask2_SSE3,raj_inv_SSE));
-            dlij_SSE0          = _mm_and_ps(one_SSE,obc_mask2_SSE0);
-            dlij_SSE1          = _mm_and_ps(one_SSE,obc_mask2_SSE1);
-            dlij_SSE2          = _mm_and_ps(one_SSE,obc_mask2_SSE2);
-            dlij_SSE3          = _mm_and_ps(one_SSE,obc_mask2_SSE3);
-            
+            lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
+                                              _mm_andnot_ps(obc_mask2_SSE0, raj_inv_SSE));
+            lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
+                                              _mm_andnot_ps(obc_mask2_SSE1, raj_inv_SSE));
+            lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
+                                              _mm_andnot_ps(obc_mask2_SSE2, raj_inv_SSE));
+            lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
+                                              _mm_andnot_ps(obc_mask2_SSE3, raj_inv_SSE));
+            dlij_SSE0          = _mm_and_ps(one_SSE, obc_mask2_SSE0);
+            dlij_SSE1          = _mm_and_ps(one_SSE, obc_mask2_SSE1);
+            dlij_SSE2          = _mm_and_ps(one_SSE, obc_mask2_SSE2);
+            dlij_SSE3          = _mm_and_ps(one_SSE, obc_mask2_SSE3);
+
             uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
             uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
             uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
             uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
-            uij3_SSE0          = _mm_mul_ps(uij2_SSE0,uij_SSE0);
-            uij3_SSE1          = _mm_mul_ps(uij2_SSE1,uij_SSE1);
-            uij3_SSE2          = _mm_mul_ps(uij2_SSE2,uij_SSE2);
-            uij3_SSE3          = _mm_mul_ps(uij2_SSE3,uij_SSE3);
+            uij3_SSE0          = _mm_mul_ps(uij2_SSE0, uij_SSE0);
+            uij3_SSE1          = _mm_mul_ps(uij2_SSE1, uij_SSE1);
+            uij3_SSE2          = _mm_mul_ps(uij2_SSE2, uij_SSE2);
+            uij3_SSE3          = _mm_mul_ps(uij2_SSE3, uij_SSE3);
             lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
             lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
             lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
             lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
-            lij3_SSE0          = _mm_mul_ps(lij2_SSE0,lij_SSE0);
-            lij3_SSE1          = _mm_mul_ps(lij2_SSE1,lij_SSE1);
-            lij3_SSE2          = _mm_mul_ps(lij2_SSE2,lij_SSE2);
-            lij3_SSE3          = _mm_mul_ps(lij2_SSE3,lij_SSE3);
-            
-            diff2_SSE0         = _mm_sub_ps(uij2_SSE0,lij2_SSE0);
-            diff2_SSE1         = _mm_sub_ps(uij2_SSE1,lij2_SSE1);
-            diff2_SSE2         = _mm_sub_ps(uij2_SSE2,lij2_SSE2);
-            diff2_SSE3         = _mm_sub_ps(uij2_SSE3,lij2_SSE3);
+            lij3_SSE0          = _mm_mul_ps(lij2_SSE0, lij_SSE0);
+            lij3_SSE1          = _mm_mul_ps(lij2_SSE1, lij_SSE1);
+            lij3_SSE2          = _mm_mul_ps(lij2_SSE2, lij_SSE2);
+            lij3_SSE3          = _mm_mul_ps(lij2_SSE3, lij_SSE3);
+
+            diff2_SSE0         = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
+            diff2_SSE1         = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
+            diff2_SSE2         = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
+            diff2_SSE3         = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
             lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
             lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
             lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
             lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
-            sk2_rinv_SSE0      = _mm_mul_ps(sk2_ai_SSE0,rinv_SSE0);
-            sk2_rinv_SSE1      = _mm_mul_ps(sk2_ai_SSE1,rinv_SSE1);
-            sk2_rinv_SSE2      = _mm_mul_ps(sk2_ai_SSE2,rinv_SSE2);
-            sk2_rinv_SSE3      = _mm_mul_ps(sk2_ai_SSE3,rinv_SSE3);
-            prod_SSE0          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE0);
-            prod_SSE1          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE1);
-            prod_SSE2          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE2);
-            prod_SSE3          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE3);
-            
-            logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0,lij_inv_SSE0));
-            logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1,lij_inv_SSE1));
-            logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2,lij_inv_SSE2));
-            logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3,lij_inv_SSE3));
-            t1_SSE0            = _mm_sub_ps(lij_SSE0,uij_SSE0);
-            t1_SSE1            = _mm_sub_ps(lij_SSE1,uij_SSE1);
-            t1_SSE2            = _mm_sub_ps(lij_SSE2,uij_SSE2);
-            t1_SSE3            = _mm_sub_ps(lij_SSE3,uij_SSE3);
+            sk2_rinv_SSE0      = _mm_mul_ps(sk2_ai_SSE0, rinv_SSE0);
+            sk2_rinv_SSE1      = _mm_mul_ps(sk2_ai_SSE1, rinv_SSE1);
+            sk2_rinv_SSE2      = _mm_mul_ps(sk2_ai_SSE2, rinv_SSE2);
+            sk2_rinv_SSE3      = _mm_mul_ps(sk2_ai_SSE3, rinv_SSE3);
+            prod_SSE0          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
+            prod_SSE1          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
+            prod_SSE2          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
+            prod_SSE3          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
+
+            logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
+            logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
+            logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
+            logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
+            t1_SSE0            = _mm_sub_ps(lij_SSE0, uij_SSE0);
+            t1_SSE1            = _mm_sub_ps(lij_SSE1, uij_SSE1);
+            t1_SSE2            = _mm_sub_ps(lij_SSE2, uij_SSE2);
+            t1_SSE3            = _mm_sub_ps(lij_SSE3, uij_SSE3);
             t2_SSE0            = _mm_mul_ps(diff2_SSE0,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE0),
+                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
                                                        prod_SSE0));
             t2_SSE1            = _mm_mul_ps(diff2_SSE1,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE1),
+                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
                                                        prod_SSE1));
             t2_SSE2            = _mm_mul_ps(diff2_SSE2,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE2),
+                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
                                                        prod_SSE2));
             t2_SSE3            = _mm_mul_ps(diff2_SSE3,
-                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE3),
+                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
                                                        prod_SSE3));
-            t3_SSE0            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE0,logterm_SSE0));
-            t3_SSE1            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE1,logterm_SSE1));
-            t3_SSE2            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE2,logterm_SSE2));
-            t3_SSE3            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE3,logterm_SSE3));
-            t1_SSE0            = _mm_add_ps(t1_SSE0,_mm_add_ps(t2_SSE0,t3_SSE0));
-            t1_SSE1            = _mm_add_ps(t1_SSE1,_mm_add_ps(t2_SSE1,t3_SSE1));
-            t1_SSE2            = _mm_add_ps(t1_SSE2,_mm_add_ps(t2_SSE2,t3_SSE2));
-            t1_SSE3            = _mm_add_ps(t1_SSE3,_mm_add_ps(t2_SSE3,t3_SSE3));
-            t4_SSE0            = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE0));
-            t4_SSE1            = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE1));
-            t4_SSE2            = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE2));
-            t4_SSE3            = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE3));
-            t4_SSE0            = _mm_and_ps(t4_SSE0,obc_mask3_SSE0);
-            t4_SSE1            = _mm_and_ps(t4_SSE1,obc_mask3_SSE1);
-            t4_SSE2            = _mm_and_ps(t4_SSE2,obc_mask3_SSE2);
-            t4_SSE3            = _mm_and_ps(t4_SSE3,obc_mask3_SSE3);
-            t1_SSE0            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE0,t4_SSE0));
-            t1_SSE1            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE1,t4_SSE1));
-            t1_SSE2            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE2,t4_SSE2));
-            t1_SSE3            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE3,t4_SSE3));
-            
+            t3_SSE0            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
+            t3_SSE1            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
+            t3_SSE2            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
+            t3_SSE3            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
+            t1_SSE0            = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
+            t1_SSE1            = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
+            t1_SSE2            = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
+            t1_SSE3            = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
+            t4_SSE0            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE0));
+            t4_SSE1            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE1));
+            t4_SSE2            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE2));
+            t4_SSE3            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE3));
+            t4_SSE0            = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
+            t4_SSE1            = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
+            t4_SSE2            = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
+            t4_SSE3            = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
+            t1_SSE0            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
+            t1_SSE1            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
+            t1_SSE2            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
+            t1_SSE3            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
+
             _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
-                                            gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0,obc_mask1_SSE0),
-                                                        _mm_and_ps(t1_SSE1,obc_mask1_SSE1),
-                                                        _mm_and_ps(t1_SSE2,obc_mask1_SSE2),
-                                                        _mm_and_ps(t1_SSE3,obc_mask1_SSE3))));
-            
-            t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE0),
-                                            _mm_mul_ps(prod_SSE0,lij3_SSE0));
-            t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE1),
-                                            _mm_mul_ps(prod_SSE1,lij3_SSE1));
-            t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE2),
-                                            _mm_mul_ps(prod_SSE2,lij3_SSE2));
-            t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE3),
-                                            _mm_mul_ps(prod_SSE3,lij3_SSE3));
+                                            gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0, obc_mask1_SSE0),
+                                                           _mm_and_ps(t1_SSE1, obc_mask1_SSE1),
+                                                           _mm_and_ps(t1_SSE2, obc_mask1_SSE2),
+                                                           _mm_and_ps(t1_SSE3, obc_mask1_SSE3))));
+
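/*
 * Sketch of the symmetric update above (helper invented for illustration,
 * not GROMACS API): the four i-rows each hold the ai -> aj contribution
 * for the same four j atoms, so they are summed register-wise (which is
 * what gmx_mm_sum4_ps amounts to here) and added to work[j..j+3] with a
 * single aligned load/store pair.
 */
#include <emmintrin.h>

static void scatter_four_rows(float *work_j,   /* 16-byte aligned */
                              __m128 r0, __m128 r1, __m128 r2, __m128 r3)
{
    __m128 sum = _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));
    _mm_store_ps(work_j, _mm_add_ps(_mm_load_ps(work_j), sum));
}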
+            t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
+                                            _mm_mul_ps(prod_SSE0, lij3_SSE0));
+            t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
+                                            _mm_mul_ps(prod_SSE1, lij3_SSE1));
+            t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
+                                            _mm_mul_ps(prod_SSE2, lij3_SSE2));
+            t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
+                                            _mm_mul_ps(prod_SSE3, lij3_SSE3));
             t1_SSE0            = _mm_sub_ps(t1_SSE0,
                                             _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE0,rinv_SSE0),
-                                                                  _mm_mul_ps(lij3_SSE0,dr_SSE0))));
+                                                       _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
+                                                                  _mm_mul_ps(lij3_SSE0, dr_SSE0))));
             t1_SSE1            = _mm_sub_ps(t1_SSE1,
                                             _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE1,rinv_SSE1),
-                                                                  _mm_mul_ps(lij3_SSE1,dr_SSE1))));
+                                                       _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
+                                                                  _mm_mul_ps(lij3_SSE1, dr_SSE1))));
             t1_SSE2            = _mm_sub_ps(t1_SSE2,
                                             _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE2,rinv_SSE2),
-                                                                  _mm_mul_ps(lij3_SSE2,dr_SSE2))));
+                                                       _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
+                                                                  _mm_mul_ps(lij3_SSE2, dr_SSE2))));
             t1_SSE3            = _mm_sub_ps(t1_SSE3,
                                             _mm_mul_ps(onefourth_SSE,
-                                                       _mm_add_ps(_mm_mul_ps(lij_SSE3,rinv_SSE3),
-                                                                  _mm_mul_ps(lij3_SSE3,dr_SSE3))));
+                                                       _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
+                                                                  _mm_mul_ps(lij3_SSE3, dr_SSE3))));
             t2_SSE0            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE0,rinv_SSE0),
-                                                       _mm_mul_ps(uij3_SSE0,dr_SSE0)));
+                                            _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
+                                                       _mm_mul_ps(uij3_SSE0, dr_SSE0)));
             t2_SSE1            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE1,rinv_SSE1),
-                                                       _mm_mul_ps(uij3_SSE1,dr_SSE1)));
+                                            _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
+                                                       _mm_mul_ps(uij3_SSE1, dr_SSE1)));
             t2_SSE2            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE2,rinv_SSE2),
-                                                       _mm_mul_ps(uij3_SSE2,dr_SSE2)));
+                                            _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
+                                                       _mm_mul_ps(uij3_SSE2, dr_SSE2)));
             t2_SSE3            = _mm_mul_ps(onefourth_SSE,
-                                            _mm_add_ps(_mm_mul_ps(uij_SSE3,rinv_SSE3),
-                                                       _mm_mul_ps(uij3_SSE3,dr_SSE3)));
+                                            _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
+                                                       _mm_mul_ps(uij3_SSE3, dr_SSE3)));
             t2_SSE0            = _mm_sub_ps(t2_SSE0,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE0),
-                                                       _mm_mul_ps(prod_SSE0,uij3_SSE0)));
+                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
+                                                       _mm_mul_ps(prod_SSE0, uij3_SSE0)));
             t2_SSE1            = _mm_sub_ps(t2_SSE1,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE1),
-                                                       _mm_mul_ps(prod_SSE1,uij3_SSE1)));
+                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
+                                                       _mm_mul_ps(prod_SSE1, uij3_SSE1)));
             t2_SSE2            = _mm_sub_ps(t2_SSE2,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE2),
-                                                       _mm_mul_ps(prod_SSE2,uij3_SSE2)));
+                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
+                                                       _mm_mul_ps(prod_SSE2, uij3_SSE2)));
             t2_SSE3            = _mm_sub_ps(t2_SSE3,
-                                            _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE3),
-                                                       _mm_mul_ps(prod_SSE3,uij3_SSE3)));
-            
-            t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE0),
-                                            _mm_mul_ps(rinv_SSE0,rinv_SSE0));
-            t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE1),
-                                            _mm_mul_ps(rinv_SSE1,rinv_SSE1));
-            t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE2),
-                                            _mm_mul_ps(rinv_SSE2,rinv_SSE2));
-            t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE3),
-                                            _mm_mul_ps(rinv_SSE3,rinv_SSE3));
-            
+                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
+                                                       _mm_mul_ps(prod_SSE3, uij3_SSE3)));
+
+            t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
+                                            _mm_mul_ps(rinv_SSE0, rinv_SSE0));
+            t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
+                                            _mm_mul_ps(rinv_SSE1, rinv_SSE1));
+            t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
+                                            _mm_mul_ps(rinv_SSE2, rinv_SSE2));
+            t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
+                                            _mm_mul_ps(rinv_SSE3, rinv_SSE3));
+
             t3_SSE0            = _mm_sub_ps(t3_SSE0,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE0,oneeighth_SSE),
+                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
                                                        _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE0,rinv_SSE0))));
+                                                                  _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
             t3_SSE1            = _mm_sub_ps(t3_SSE1,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE1,oneeighth_SSE),
+                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
                                                        _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE1,rinv_SSE1))));
+                                                                  _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
             t3_SSE2            = _mm_sub_ps(t3_SSE2,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE2,oneeighth_SSE),
+                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
                                                        _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE2,rinv_SSE2))));
+                                                                  _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
             t3_SSE3            = _mm_sub_ps(t3_SSE3,
-                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE3,oneeighth_SSE),
+                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
                                                        _mm_add_ps(one_SSE,
-                                                                  _mm_mul_ps(sk2_rinv_SSE3,rinv_SSE3))));
-            
-            
+                                                                  _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
+
+
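+            /* Final chain-rule derivative: dadx = rinv*(dlij*t1 + t2 + t3) */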
             t1_SSE0            = _mm_mul_ps(rinv_SSE0,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE0,t1_SSE0),
-                                                       _mm_add_ps(t2_SSE0,t3_SSE0)));
+                                            _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
+                                                       _mm_add_ps(t2_SSE0, t3_SSE0)));
             t1_SSE1            = _mm_mul_ps(rinv_SSE1,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE1,t1_SSE1),
-                                                       _mm_add_ps(t2_SSE1,t3_SSE1)));
+                                            _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
+                                                       _mm_add_ps(t2_SSE1, t3_SSE1)));
             t1_SSE2            = _mm_mul_ps(rinv_SSE2,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE2,t1_SSE2),
-                                                       _mm_add_ps(t2_SSE2,t3_SSE2)));
+                                            _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
+                                                       _mm_add_ps(t2_SSE2, t3_SSE2)));
             t1_SSE3            = _mm_mul_ps(rinv_SSE3,
-                                            _mm_add_ps(_mm_mul_ps(dlij_SSE3,t1_SSE3),
-                                                       _mm_add_ps(t2_SSE3,t3_SSE3)));
-            
-            _mm_store_ps(dadx,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
+                                            _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
+                                                       _mm_add_ps(t2_SSE3, t3_SSE3)));
+
+            _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
             dadx += 4;
-            _mm_store_ps(dadx,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
+            _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
             dadx += 4;
-            _mm_store_ps(dadx,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
+            _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
             dadx += 4;
-            _mm_store_ps(dadx,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
+            _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
             dadx += 4;
         }
-        _MM_TRANSPOSE4_PS(sum_ai_SSE0,sum_ai_SSE1,sum_ai_SSE2,sum_ai_SSE3);
-        sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0,sum_ai_SSE1);
-        sum_ai_SSE2 = _mm_add_ps(sum_ai_SSE2,sum_ai_SSE3);
-        sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0,sum_ai_SSE2);
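+        /* Transpose and add to horizontally sum each i atom's Born integral */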
+        _MM_TRANSPOSE4_PS(sum_ai_SSE0, sum_ai_SSE1, sum_ai_SSE2, sum_ai_SSE3);
+        sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0, sum_ai_SSE1);
+        sum_ai_SSE2 = _mm_add_ps(sum_ai_SSE2, sum_ai_SSE3);
+        sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0, sum_ai_SSE2);
         _mm_store_ps(work+i, _mm_add_ps(sum_ai_SSE0, _mm_load_ps(work+i)));
-       }    
-    
-    
-    for(i=0;i<natoms/2+1;i++)
+    }
+
+
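+    /* The work array is padded to 2*natoms; fold the image half back onto */
+    /* the real atoms (only the first natoms/2+1 entries pick up images). */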
+    for (i = 0; i < natoms/2+1; i++)
     {
         work[i] += work[natoms+i];
     }
-    
-    /* Parallel summations */
-    
-       if(PARTDECOMP(cr))
-       {
-               gmx_sum(natoms,work, cr);
-       }
-       
-    if(gb_algorithm==egbHCT)
+
+    /* Parallel summations would go here if ever implemented with DD */
+
+    if (gb_algorithm == egbHCT)
     {
         /* HCT */
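+        /* Born radius: 1/R_i = 1/(r_i - doffset) - I_i, clamped from below */
+        /* at the unshifted radius. */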
-        for(i=0;i<natoms;i++)
+        for (i = 0; i < natoms; i++)
         {
-            if(born->use[i] != 0)
+            if (born->use[i] != 0)
             {
-                rai     = top->atomtypes.gb_radius[mdatoms->typeA[i]]-born->gb_doffset; 
+                rai     = top->atomtypes.gb_radius[mdatoms->typeA[i]]-born->gb_doffset;
                 sum_ai  = 1.0/rai - work[i];
                 min_rad = rai + born->gb_doffset;
-                rad     = 1.0/sum_ai; 
-                
+                rad     = 1.0/sum_ai;
+
                 born->bRad[i]   = rad > min_rad ? rad : min_rad;
                 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
             }
         }
-        
+
     }
     else
     {
         /* OBC */
-        
-       /* Calculate the radii */
-        for(i=0;i<natoms;i++)
+
+        /* Calculate the radii */
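+        /* OBC rescaling: psi = (r_i - doffset)*I_i, then */
+        /* 1/R_i = 1/(r_i - doffset) - tanh(alpha*psi - beta*psi^2 + gamma*psi^3)/r_i; */
+        /* drobc keeps the tanh chain factor needed later for the forces. */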
+        for (i = 0; i < natoms; i++)
         {
-            
-            if(born->use[i] != 0)
+
+            if (born->use[i] != 0)
             {
                 rai        = top->atomtypes.gb_radius[mdatoms->typeA[i]];
                 rai_inv2   = 1.0/rai;
-                rai        = rai-born->gb_doffset; 
+                rai        = rai-born->gb_doffset;
                 rai_inv    = 1.0/rai;
                 sum_ai     = rai * work[i];
                 sum_ai2    = sum_ai  * sum_ai;
                 sum_ai3    = sum_ai2 * sum_ai;
-                
-                tsum    = tanh(born->obc_alpha*sum_ai-born->obc_beta*sum_ai2+born->obc_gamma*sum_ai3);
+
+                tsum          = tanh(born->obc_alpha*sum_ai-born->obc_beta*sum_ai2+born->obc_gamma*sum_ai3);
                 born->bRad[i] = rai_inv - tsum*rai_inv2;
                 born->bRad[i] = 1.0 / born->bRad[i];
-                
-                fr->invsqrta[i]=gmx_invsqrt(born->bRad[i]);
-                
-                tchain  = rai * (born->obc_alpha-2*born->obc_beta*sum_ai+3*born->obc_gamma*sum_ai2);
+
+                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
+
+                tchain         = rai * (born->obc_alpha-2*born->obc_beta*sum_ai+3*born->obc_gamma*sum_ai2);
                 born->drobc[i] = (1.0-tsum*tsum)*tchain*rai_inv2;
             }
         }
     }
-       
-       return 0;
+
+    return 0;
 }
 
 
@@ -3232,270 +3226,270 @@ genborn_allvsall_calc_chainrule_sse2_single(t_forcerec *           fr,
                                             int                    gb_algorithm,
                                             void *                 paadata)
 {
-       gmx_allvsallgb2_data_t *aadata;
-       int        natoms;
-       int        ni0,ni1;
-       int        nj0,nj1,nj2,nj3;
-       int        i,j,k,n;
-    int        idx;
-    int *      mask;
-    int *      pmask0;
-    int *      emask0;
-    int *      jindex;
-
-    real       ix,iy,iz;
-    real       fix,fiy,fiz;
-    real       jx,jy,jz;
-    real       dx,dy,dz;
-    real       tx,ty,tz;
-    real       rbai,rbaj,fgb,fgb_ai,rbi;
-    real *     rb;
-    real *     dadx;
-    real *     x_align;
-    real *     y_align;
-    real *     z_align;
-    real *     fx_align;
-    real *     fy_align;
-    real *     fz_align;
-    real       tmpsum[4];
-    
-    __m128     jmask_SSE0,jmask_SSE1,jmask_SSE2,jmask_SSE3;
-    __m128     ix_SSE0,iy_SSE0,iz_SSE0;
-    __m128     ix_SSE1,iy_SSE1,iz_SSE1;
-    __m128     ix_SSE2,iy_SSE2,iz_SSE2;
-    __m128     ix_SSE3,iy_SSE3,iz_SSE3;
-    __m128     fix_SSE0,fiy_SSE0,fiz_SSE0;
-    __m128     fix_SSE1,fiy_SSE1,fiz_SSE1;
-    __m128     fix_SSE2,fiy_SSE2,fiz_SSE2;
-    __m128     fix_SSE3,fiy_SSE3,fiz_SSE3;
-    __m128     rbai_SSE0,rbai_SSE1,rbai_SSE2,rbai_SSE3;
-    __m128     imask_SSE0,imask_SSE1,imask_SSE2,imask_SSE3;
-    __m128     jx_SSE,jy_SSE,jz_SSE,rbaj_SSE;
-    __m128     dx_SSE0,dy_SSE0,dz_SSE0;
-    __m128     dx_SSE1,dy_SSE1,dz_SSE1;
-    __m128     dx_SSE2,dy_SSE2,dz_SSE2;
-    __m128     dx_SSE3,dy_SSE3,dz_SSE3;
-    __m128     fgb_SSE0,fgb_ai_SSE0;
-    __m128     fgb_SSE1,fgb_ai_SSE1;
-    __m128     fgb_SSE2,fgb_ai_SSE2;
-    __m128     fgb_SSE3,fgb_ai_SSE3;
-    __m128     tx_SSE0,ty_SSE0,tz_SSE0;
-    __m128     tx_SSE1,ty_SSE1,tz_SSE1;
-    __m128     tx_SSE2,ty_SSE2,tz_SSE2;
-    __m128     tx_SSE3,ty_SSE3,tz_SSE3;
-    __m128     t1,t2;
-    
+    gmx_allvsallgb2_data_t *aadata;
+    int                     natoms;
+    int                     ni0, ni1;
+    int                     nj0, nj1, nj2, nj3;
+    int                     i, j, k, n;
+    int                     idx;
+    int                    *mask;
+    int                    *pmask0;
+    int                    *emask0;
+    int                    *jindex;
+
+    real                    ix, iy, iz;
+    real                    fix, fiy, fiz;
+    real                    jx, jy, jz;
+    real                    dx, dy, dz;
+    real                    tx, ty, tz;
+    real                    rbai, rbaj, fgb, fgb_ai, rbi;
+    real                   *rb;
+    real                   *dadx;
+    real                   *x_align;
+    real                   *y_align;
+    real                   *z_align;
+    real                   *fx_align;
+    real                   *fy_align;
+    real                   *fz_align;
+    real                    tmpsum[4];
+
+    __m128                  jmask_SSE0, jmask_SSE1, jmask_SSE2, jmask_SSE3;
+    __m128                  ix_SSE0, iy_SSE0, iz_SSE0;
+    __m128                  ix_SSE1, iy_SSE1, iz_SSE1;
+    __m128                  ix_SSE2, iy_SSE2, iz_SSE2;
+    __m128                  ix_SSE3, iy_SSE3, iz_SSE3;
+    __m128                  fix_SSE0, fiy_SSE0, fiz_SSE0;
+    __m128                  fix_SSE1, fiy_SSE1, fiz_SSE1;
+    __m128                  fix_SSE2, fiy_SSE2, fiz_SSE2;
+    __m128                  fix_SSE3, fiy_SSE3, fiz_SSE3;
+    __m128                  rbai_SSE0, rbai_SSE1, rbai_SSE2, rbai_SSE3;
+    __m128                  imask_SSE0, imask_SSE1, imask_SSE2, imask_SSE3;
+    __m128                  jx_SSE, jy_SSE, jz_SSE, rbaj_SSE;
+    __m128                  dx_SSE0, dy_SSE0, dz_SSE0;
+    __m128                  dx_SSE1, dy_SSE1, dz_SSE1;
+    __m128                  dx_SSE2, dy_SSE2, dz_SSE2;
+    __m128                  dx_SSE3, dy_SSE3, dz_SSE3;
+    __m128                  fgb_SSE0, fgb_ai_SSE0;
+    __m128                  fgb_SSE1, fgb_ai_SSE1;
+    __m128                  fgb_SSE2, fgb_ai_SSE2;
+    __m128                  fgb_SSE3, fgb_ai_SSE3;
+    __m128                  tx_SSE0, ty_SSE0, tz_SSE0;
+    __m128                  tx_SSE1, ty_SSE1, tz_SSE1;
+    __m128                  tx_SSE2, ty_SSE2, tz_SSE2;
+    __m128                  tx_SSE3, ty_SSE3, tz_SSE3;
+    __m128                  t1, t2;
+
     natoms              = mdatoms->nr;
-       ni0                 = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
-       ni1                 = mdatoms->start+mdatoms->homenr;
+    ni0                 = 0;
+    ni1                 = mdatoms->homenr;
     dadx                = fr->dadx;
-    
+
     aadata = (gmx_allvsallgb2_data_t *)paadata;
 
-    x_align = aadata->x_align;
-       y_align = aadata->y_align;
-       z_align = aadata->z_align;
+    x_align  = aadata->x_align;
+    y_align  = aadata->y_align;
+    z_align  = aadata->z_align;
     fx_align = aadata->fx_align;
-       fy_align = aadata->fy_align;
-       fz_align = aadata->fz_align;
-    
+    fy_align = aadata->fy_align;
+    fz_align = aadata->fz_align;
+
     jindex    = aadata->jindex_gb;
     dadx      = fr->dadx;
 
-    n = 0;
+    n  = 0;
     rb = aadata->work;
-    
-       /* Loop to get the proper form for the Born radius term */
-       if(gb_algorithm==egbSTILL) 
-       {
-               for(i=0;i<natoms;i++)
-               {
-                       rbi   = born->bRad[i];
-                       rb[i] = (2 * rbi * rbi * fr->dvda[i])/ONE_4PI_EPS0;
-               }
-       }
-       else if(gb_algorithm==egbHCT) 
-       {
-               for(i=0;i<natoms;i++)
-               {
-                       rbi   = born->bRad[i];
-                       rb[i] = rbi * rbi * fr->dvda[i];
-               }
-       }
-       else if(gb_algorithm==egbOBC) 
-       {
-               for(idx=0;idx<natoms;idx++)
-               {
-                       rbi   = born->bRad[idx];
-                       rb[idx] = rbi * rbi * born->drobc[idx] * fr->dvda[idx];
-               }
-       }
-    
-    for(i=0;i<2*natoms;i++)
-       {
-               fx_align[i]       = 0;
-               fy_align[i]       = 0;
-               fz_align[i]       = 0;
-    }        
-    
-    
-    for(i=0;i<natoms;i++)
+
+    /* Loop to get the proper form for the Born radius term */
+    if (gb_algorithm == egbSTILL)
+    {
+        for (i = 0; i < natoms; i++)
+        {
+            rbi   = born->bRad[i];
+            rb[i] = (2 * rbi * rbi * fr->dvda[i])/ONE_4PI_EPS0;
+        }
+    }
+    else if (gb_algorithm == egbHCT)
+    {
+        for (i = 0; i < natoms; i++)
+        {
+            rbi   = born->bRad[i];
+            rb[i] = rbi * rbi * fr->dvda[i];
+        }
+    }
+    else if (gb_algorithm == egbOBC)
+    {
+        for (idx = 0; idx < natoms; idx++)
+        {
+            rbi     = born->bRad[idx];
+            rb[idx] = rbi * rbi * born->drobc[idx] * fr->dvda[idx];
+        }
+    }
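+    /* rb[i] now weights the stored dadx derivatives: each pair force below */
+    /* is (rb_i*dadx_ij + rb_j*dadx_ji) along the connecting vector. */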
+
+    for (i = 0; i < 2*natoms; i++)
     {
-        rb[i+natoms]=rb[i];
+        fx_align[i]       = 0;
+        fy_align[i]       = 0;
+        fz_align[i]       = 0;
     }
 
-    for(i=ni0; i<ni1; i+=UNROLLI)
-       {
-               /* We assume shifts are NOT used for all-vs-all interactions */
-               
-               /* Load i atom data */
-               ix_SSE0          = _mm_load1_ps(x_align+i);
-               iy_SSE0          = _mm_load1_ps(y_align+i);
-               iz_SSE0          = _mm_load1_ps(z_align+i);
-               ix_SSE1          = _mm_load1_ps(x_align+i+1);
-               iy_SSE1          = _mm_load1_ps(y_align+i+1);
-               iz_SSE1          = _mm_load1_ps(z_align+i+1);
-               ix_SSE2          = _mm_load1_ps(x_align+i+2);
-               iy_SSE2          = _mm_load1_ps(y_align+i+2);
-               iz_SSE2          = _mm_load1_ps(z_align+i+2);
-               ix_SSE3          = _mm_load1_ps(x_align+i+3);
-               iy_SSE3          = _mm_load1_ps(y_align+i+3);
-               iz_SSE3          = _mm_load1_ps(z_align+i+3);
-        
-               fix_SSE0         = _mm_setzero_ps();
-               fiy_SSE0         = _mm_setzero_ps();
-               fiz_SSE0         = _mm_setzero_ps();
-               fix_SSE1         = _mm_setzero_ps();
-               fiy_SSE1         = _mm_setzero_ps();
-               fiz_SSE1         = _mm_setzero_ps();
-               fix_SSE2         = _mm_setzero_ps();
-               fiy_SSE2         = _mm_setzero_ps();
-               fiz_SSE2         = _mm_setzero_ps();
-               fix_SSE3         = _mm_setzero_ps();
-               fiy_SSE3         = _mm_setzero_ps();
-               fiz_SSE3         = _mm_setzero_ps();
-        
+
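+    /* Mirror rb into the padded upper half so the j loop can run past natoms */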
+    for (i = 0; i < natoms; i++)
+    {
+        rb[i+natoms] = rb[i];
+    }
+
+    for (i = ni0; i < ni1; i += UNROLLI)
+    {
+        /* We assume shifts are NOT used for all-vs-all interactions */
+
+        /* Load i atom data */
+        ix_SSE0          = _mm_load1_ps(x_align+i);
+        iy_SSE0          = _mm_load1_ps(y_align+i);
+        iz_SSE0          = _mm_load1_ps(z_align+i);
+        ix_SSE1          = _mm_load1_ps(x_align+i+1);
+        iy_SSE1          = _mm_load1_ps(y_align+i+1);
+        iz_SSE1          = _mm_load1_ps(z_align+i+1);
+        ix_SSE2          = _mm_load1_ps(x_align+i+2);
+        iy_SSE2          = _mm_load1_ps(y_align+i+2);
+        iz_SSE2          = _mm_load1_ps(z_align+i+2);
+        ix_SSE3          = _mm_load1_ps(x_align+i+3);
+        iy_SSE3          = _mm_load1_ps(y_align+i+3);
+        iz_SSE3          = _mm_load1_ps(z_align+i+3);
+
+        fix_SSE0         = _mm_setzero_ps();
+        fiy_SSE0         = _mm_setzero_ps();
+        fiz_SSE0         = _mm_setzero_ps();
+        fix_SSE1         = _mm_setzero_ps();
+        fiy_SSE1         = _mm_setzero_ps();
+        fiz_SSE1         = _mm_setzero_ps();
+        fix_SSE2         = _mm_setzero_ps();
+        fiy_SSE2         = _mm_setzero_ps();
+        fiz_SSE2         = _mm_setzero_ps();
+        fix_SSE3         = _mm_setzero_ps();
+        fiy_SSE3         = _mm_setzero_ps();
+        fiz_SSE3         = _mm_setzero_ps();
+
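+        /* Broadcast each i atom's rb weight to all four SSE lanes */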
         rbai_SSE0        = _mm_load1_ps(rb+i);
         rbai_SSE1        = _mm_load1_ps(rb+i+1);
         rbai_SSE2        = _mm_load1_ps(rb+i+2);
         rbai_SSE3        = _mm_load1_ps(rb+i+3);
-        
-               /* Load limits for loop over neighbors */
-               nj0              = jindex[4*i];
-               nj3              = jindex[4*i+3];
+
+        /* Load limits for loop over neighbors */
+        nj0              = jindex[4*i];
+        nj3              = jindex[4*i+3];
 
         /* No masks necessary, since the stored chain rule derivatives will be zero in those cases! */
-        for(j=nj0; j<nj3; j+=UNROLLJ)
-        {          
+        for (j = nj0; j < nj3; j += UNROLLJ)
+        {
             /* load j atom coordinates */
             jx_SSE           = _mm_load_ps(x_align+j);
             jy_SSE           = _mm_load_ps(y_align+j);
             jz_SSE           = _mm_load_ps(z_align+j);
-            
+
             /* Calculate distance */
-            dx_SSE0          = _mm_sub_ps(ix_SSE0,jx_SSE);
-            dy_SSE0          = _mm_sub_ps(iy_SSE0,jy_SSE);
-            dz_SSE0          = _mm_sub_ps(iz_SSE0,jz_SSE);
-            dx_SSE1          = _mm_sub_ps(ix_SSE1,jx_SSE);
-            dy_SSE1          = _mm_sub_ps(iy_SSE1,jy_SSE);
-            dz_SSE1          = _mm_sub_ps(iz_SSE1,jz_SSE);
-            dx_SSE2          = _mm_sub_ps(ix_SSE2,jx_SSE);
-            dy_SSE2          = _mm_sub_ps(iy_SSE2,jy_SSE);
-            dz_SSE2          = _mm_sub_ps(iz_SSE2,jz_SSE);
-            dx_SSE3          = _mm_sub_ps(ix_SSE3,jx_SSE);
-            dy_SSE3          = _mm_sub_ps(iy_SSE3,jy_SSE);
-            dz_SSE3          = _mm_sub_ps(iz_SSE3,jz_SSE);
-            
+            dx_SSE0          = _mm_sub_ps(ix_SSE0, jx_SSE);
+            dy_SSE0          = _mm_sub_ps(iy_SSE0, jy_SSE);
+            dz_SSE0          = _mm_sub_ps(iz_SSE0, jz_SSE);
+            dx_SSE1          = _mm_sub_ps(ix_SSE1, jx_SSE);
+            dy_SSE1          = _mm_sub_ps(iy_SSE1, jy_SSE);
+            dz_SSE1          = _mm_sub_ps(iz_SSE1, jz_SSE);
+            dx_SSE2          = _mm_sub_ps(ix_SSE2, jx_SSE);
+            dy_SSE2          = _mm_sub_ps(iy_SSE2, jy_SSE);
+            dz_SSE2          = _mm_sub_ps(iz_SSE2, jz_SSE);
+            dx_SSE3          = _mm_sub_ps(ix_SSE3, jx_SSE);
+            dy_SSE3          = _mm_sub_ps(iy_SSE3, jy_SSE);
+            dz_SSE3          = _mm_sub_ps(iz_SSE3, jz_SSE);
+
             rbaj_SSE         = _mm_load_ps(rb+j);
-            
-            fgb_SSE0         = _mm_mul_ps(rbai_SSE0,_mm_load_ps(dadx));            
-            dadx += 4;
-            fgb_SSE1         = _mm_mul_ps(rbai_SSE1,_mm_load_ps(dadx));
-            dadx += 4;
-            fgb_SSE2         = _mm_mul_ps(rbai_SSE2,_mm_load_ps(dadx));
-            dadx += 4;
-            fgb_SSE3         = _mm_mul_ps(rbai_SSE3,_mm_load_ps(dadx));
-            dadx += 4;
-                        
-            fgb_ai_SSE0      = _mm_mul_ps(rbaj_SSE,_mm_load_ps(dadx));
-            dadx +=4;
-            fgb_ai_SSE1      = _mm_mul_ps(rbaj_SSE,_mm_load_ps(dadx));
-            dadx +=4;
-            fgb_ai_SSE2      = _mm_mul_ps(rbaj_SSE,_mm_load_ps(dadx));
-            dadx +=4;
-            fgb_ai_SSE3      = _mm_mul_ps(rbaj_SSE,_mm_load_ps(dadx));
-            dadx +=4;
-            
+
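+            /* dadx is consumed in the order it was stored: four packets of */
+            /* d(a_i)/dr followed by four packets of d(a_j)/dr for this tile */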
+            fgb_SSE0         = _mm_mul_ps(rbai_SSE0, _mm_load_ps(dadx));
+            dadx            += 4;
+            fgb_SSE1         = _mm_mul_ps(rbai_SSE1, _mm_load_ps(dadx));
+            dadx            += 4;
+            fgb_SSE2         = _mm_mul_ps(rbai_SSE2, _mm_load_ps(dadx));
+            dadx            += 4;
+            fgb_SSE3         = _mm_mul_ps(rbai_SSE3, _mm_load_ps(dadx));
+            dadx            += 4;
+
+            fgb_ai_SSE0      = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx));
+            dadx            += 4;
+            fgb_ai_SSE1      = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx));
+            dadx            += 4;
+            fgb_ai_SSE2      = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx));
+            dadx            += 4;
+            fgb_ai_SSE3      = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx));
+            dadx            += 4;
+
             /* Total force between ai and aj is the sum of ai->aj and aj->ai */
-            fgb_SSE0         = _mm_add_ps(fgb_SSE0,fgb_ai_SSE0);
-            fgb_SSE1         = _mm_add_ps(fgb_SSE1,fgb_ai_SSE1);
-            fgb_SSE2         = _mm_add_ps(fgb_SSE2,fgb_ai_SSE2);
-            fgb_SSE3         = _mm_add_ps(fgb_SSE3,fgb_ai_SSE3);
-                            
+            fgb_SSE0         = _mm_add_ps(fgb_SSE0, fgb_ai_SSE0);
+            fgb_SSE1         = _mm_add_ps(fgb_SSE1, fgb_ai_SSE1);
+            fgb_SSE2         = _mm_add_ps(fgb_SSE2, fgb_ai_SSE2);
+            fgb_SSE3         = _mm_add_ps(fgb_SSE3, fgb_ai_SSE3);
+
             /* Calculate temporary vectorial force */
-            tx_SSE0            = _mm_mul_ps(fgb_SSE0,dx_SSE0);
-            ty_SSE0            = _mm_mul_ps(fgb_SSE0,dy_SSE0);
-            tz_SSE0            = _mm_mul_ps(fgb_SSE0,dz_SSE0);
-            tx_SSE1            = _mm_mul_ps(fgb_SSE1,dx_SSE1);
-            ty_SSE1            = _mm_mul_ps(fgb_SSE1,dy_SSE1);
-            tz_SSE1            = _mm_mul_ps(fgb_SSE1,dz_SSE1);
-            tx_SSE2            = _mm_mul_ps(fgb_SSE2,dx_SSE2);
-            ty_SSE2            = _mm_mul_ps(fgb_SSE2,dy_SSE2);
-            tz_SSE2            = _mm_mul_ps(fgb_SSE2,dz_SSE2);
-            tx_SSE3            = _mm_mul_ps(fgb_SSE3,dx_SSE3);
-            ty_SSE3            = _mm_mul_ps(fgb_SSE3,dy_SSE3);
-            tz_SSE3            = _mm_mul_ps(fgb_SSE3,dz_SSE3);
-            
+            tx_SSE0            = _mm_mul_ps(fgb_SSE0, dx_SSE0);
+            ty_SSE0            = _mm_mul_ps(fgb_SSE0, dy_SSE0);
+            tz_SSE0            = _mm_mul_ps(fgb_SSE0, dz_SSE0);
+            tx_SSE1            = _mm_mul_ps(fgb_SSE1, dx_SSE1);
+            ty_SSE1            = _mm_mul_ps(fgb_SSE1, dy_SSE1);
+            tz_SSE1            = _mm_mul_ps(fgb_SSE1, dz_SSE1);
+            tx_SSE2            = _mm_mul_ps(fgb_SSE2, dx_SSE2);
+            ty_SSE2            = _mm_mul_ps(fgb_SSE2, dy_SSE2);
+            tz_SSE2            = _mm_mul_ps(fgb_SSE2, dz_SSE2);
+            tx_SSE3            = _mm_mul_ps(fgb_SSE3, dx_SSE3);
+            ty_SSE3            = _mm_mul_ps(fgb_SSE3, dy_SSE3);
+            tz_SSE3            = _mm_mul_ps(fgb_SSE3, dz_SSE3);
+
             /* Increment i atom force */
-            fix_SSE0          = _mm_add_ps(fix_SSE0,tx_SSE0);
-            fiy_SSE0          = _mm_add_ps(fiy_SSE0,ty_SSE0);
-            fiz_SSE0          = _mm_add_ps(fiz_SSE0,tz_SSE0);
-            fix_SSE1          = _mm_add_ps(fix_SSE1,tx_SSE1);
-            fiy_SSE1          = _mm_add_ps(fiy_SSE1,ty_SSE1);
-            fiz_SSE1          = _mm_add_ps(fiz_SSE1,tz_SSE1);
-            fix_SSE2          = _mm_add_ps(fix_SSE2,tx_SSE2);
-            fiy_SSE2          = _mm_add_ps(fiy_SSE2,ty_SSE2);
-            fiz_SSE2          = _mm_add_ps(fiz_SSE2,tz_SSE2);
-            fix_SSE3          = _mm_add_ps(fix_SSE3,tx_SSE3);
-            fiy_SSE3          = _mm_add_ps(fiy_SSE3,ty_SSE3);
-            fiz_SSE3          = _mm_add_ps(fiz_SSE3,tz_SSE3);
-            
+            fix_SSE0          = _mm_add_ps(fix_SSE0, tx_SSE0);
+            fiy_SSE0          = _mm_add_ps(fiy_SSE0, ty_SSE0);
+            fiz_SSE0          = _mm_add_ps(fiz_SSE0, tz_SSE0);
+            fix_SSE1          = _mm_add_ps(fix_SSE1, tx_SSE1);
+            fiy_SSE1          = _mm_add_ps(fiy_SSE1, ty_SSE1);
+            fiz_SSE1          = _mm_add_ps(fiz_SSE1, tz_SSE1);
+            fix_SSE2          = _mm_add_ps(fix_SSE2, tx_SSE2);
+            fiy_SSE2          = _mm_add_ps(fiy_SSE2, ty_SSE2);
+            fiz_SSE2          = _mm_add_ps(fiz_SSE2, tz_SSE2);
+            fix_SSE3          = _mm_add_ps(fix_SSE3, tx_SSE3);
+            fiy_SSE3          = _mm_add_ps(fiy_SSE3, ty_SSE3);
+            fiz_SSE3          = _mm_add_ps(fiz_SSE3, tz_SSE3);
+
             /* Decrement j atom force */
             _mm_store_ps(fx_align+j,
-                         _mm_sub_ps( _mm_load_ps(fx_align+j) , gmx_mm_sum4_ps(tx_SSE0,tx_SSE1,tx_SSE2,tx_SSE3) ));
+                         _mm_sub_ps( _mm_load_ps(fx_align+j), gmx_mm_sum4_ps(tx_SSE0, tx_SSE1, tx_SSE2, tx_SSE3) ));
             _mm_store_ps(fy_align+j,
-                         _mm_sub_ps( _mm_load_ps(fy_align+j) , gmx_mm_sum4_ps(ty_SSE0,ty_SSE1,ty_SSE2,ty_SSE3) ));
+                         _mm_sub_ps( _mm_load_ps(fy_align+j), gmx_mm_sum4_ps(ty_SSE0, ty_SSE1, ty_SSE2, ty_SSE3) ));
             _mm_store_ps(fz_align+j,
-                         _mm_sub_ps( _mm_load_ps(fz_align+j) , gmx_mm_sum4_ps(tz_SSE0,tz_SSE1,tz_SSE2,tz_SSE3) ));
+                         _mm_sub_ps( _mm_load_ps(fz_align+j), gmx_mm_sum4_ps(tz_SSE0, tz_SSE1, tz_SSE2, tz_SSE3) ));
         }
-               /* Add i forces to mem and shifted force list */
-        _MM_TRANSPOSE4_PS(fix_SSE0,fix_SSE1,fix_SSE2,fix_SSE3);
-        fix_SSE0 = _mm_add_ps(fix_SSE0,fix_SSE1);
-        fix_SSE2 = _mm_add_ps(fix_SSE2,fix_SSE3);
-        fix_SSE0 = _mm_add_ps(fix_SSE0,fix_SSE2);
+        /* Add i forces to mem and shifted force list */
+        _MM_TRANSPOSE4_PS(fix_SSE0, fix_SSE1, fix_SSE2, fix_SSE3);
+        fix_SSE0 = _mm_add_ps(fix_SSE0, fix_SSE1);
+        fix_SSE2 = _mm_add_ps(fix_SSE2, fix_SSE3);
+        fix_SSE0 = _mm_add_ps(fix_SSE0, fix_SSE2);
         _mm_store_ps(fx_align+i, _mm_add_ps(fix_SSE0, _mm_load_ps(fx_align+i)));
-        
-        _MM_TRANSPOSE4_PS(fiy_SSE0,fiy_SSE1,fiy_SSE2,fiy_SSE3);
-        fiy_SSE0 = _mm_add_ps(fiy_SSE0,fiy_SSE1);
-        fiy_SSE2 = _mm_add_ps(fiy_SSE2,fiy_SSE3);
-        fiy_SSE0 = _mm_add_ps(fiy_SSE0,fiy_SSE2);
+
+        _MM_TRANSPOSE4_PS(fiy_SSE0, fiy_SSE1, fiy_SSE2, fiy_SSE3);
+        fiy_SSE0 = _mm_add_ps(fiy_SSE0, fiy_SSE1);
+        fiy_SSE2 = _mm_add_ps(fiy_SSE2, fiy_SSE3);
+        fiy_SSE0 = _mm_add_ps(fiy_SSE0, fiy_SSE2);
         _mm_store_ps(fy_align+i, _mm_add_ps(fiy_SSE0, _mm_load_ps(fy_align+i)));
-        
-        _MM_TRANSPOSE4_PS(fiz_SSE0,fiz_SSE1,fiz_SSE2,fiz_SSE3);
-        fiz_SSE0 = _mm_add_ps(fiz_SSE0,fiz_SSE1);
-        fiz_SSE2 = _mm_add_ps(fiz_SSE2,fiz_SSE3);
-        fiz_SSE0 = _mm_add_ps(fiz_SSE0,fiz_SSE2);
-        _mm_store_ps(fz_align+i, _mm_add_ps(fiz_SSE0, _mm_load_ps(fz_align+i)));               
-       }    
-       
-    for(i=0;i<natoms;i++)
-       {
-               f[3*i]       += fx_align[i] + fx_align[natoms+i];
-               f[3*i+1]     += fy_align[i] + fy_align[natoms+i];
-               f[3*i+2]     += fz_align[i] + fz_align[natoms+i];
+
+        _MM_TRANSPOSE4_PS(fiz_SSE0, fiz_SSE1, fiz_SSE2, fiz_SSE3);
+        fiz_SSE0 = _mm_add_ps(fiz_SSE0, fiz_SSE1);
+        fiz_SSE2 = _mm_add_ps(fiz_SSE2, fiz_SSE3);
+        fiz_SSE0 = _mm_add_ps(fiz_SSE0, fiz_SSE2);
+        _mm_store_ps(fz_align+i, _mm_add_ps(fiz_SSE0, _mm_load_ps(fz_align+i)));
     }
-    
-       return 0;
+
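+    /* Fold the padded image forces back onto the real atom array */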
+    for (i = 0; i < natoms; i++)
+    {
+        f[3*i]       += fx_align[i] + fx_align[natoms+i];
+        f[3*i+1]     += fy_align[i] + fy_align[natoms+i];
+        f[3*i+2]     += fz_align[i] + fz_align[natoms+i];
+    }
+
+    return 0;
 }
 
 #else
@@ -3504,4 +3498,3 @@ int genborn_allvsall_sse2_single_dummy;
 
 
 #endif
-