From ea99912691d0385f4d614c0383aefed35f67adc1 Mon Sep 17 00:00:00 2001
From: Roland Schulz <roland@utk.edu>
Date: Fri, 17 Jan 2014 23:47:30 -0500
Subject: [PATCH] Uncrustify all files

Add a script that can be used to do this again if necessary.  The script
also supports listing the files based on the filter attribute, and
checking the copyright; can be extended if/when there is need.

Some manual reformatting in gpu_utils.cu, nbnxn_cuda_data_mgmt.cu, and
nbnxn_cuda_kernel.cuh.  Updated the Sparc kernel generator to generate
code that is invariant under uncrustify, like the other kernels already
do.

Copyright not updated, except for fixing readpull.c such that the script
doesn't complain.

Part of #845

Change-Id: Ia77738ec781f75f1c4e7a264734aac884321f3e5
---
 .gitattributes                                |    2 +-
 admin/reformat_all.sh                         |  118 ++
 src/gromacs/fileio/gmxfio_int.h               |   12 +-
 src/gromacs/fileio/mdoutf.c                   |   12 +-
 src/gromacs/fileio/timecontrol.c              |    2 +-
 src/gromacs/fileio/tngio.cpp                  |    6 +-
 src/gromacs/fileio/tngio_for_tools.cpp        |   50 +-
 src/gromacs/gmxana/cmat.c                     |    8 +-
 src/gromacs/gmxlib/cuda_tools/copyrite_gpu.cu |   10 +-
 src/gromacs/gmxlib/cuda_tools/cudautils.cu    |   36 +-
 src/gromacs/gmxlib/cuda_tools/cudautils.cuh   |   22 +-
 src/gromacs/gmxlib/cuda_tools/pmalloc_cuda.cu |   24 +-
 src/gromacs/gmxlib/cuda_tools/vectype_ops.cuh |   14 +-
 src/gromacs/gmxlib/gmx_detect_hardware.c      |    2 +-
 src/gromacs/gmxlib/gmx_fatal.c                |   28 +-
 src/gromacs/gmxlib/gpu_utils/gpu_utils.cu     |  245 ++--
 .../kernelutil_sparc64_hpc_ace_double.h       | 1034 ++++++++---------
 .../make_nb_kernel_sparc64_hpc_ace_double.py  |    4 +-
 .../nb_kernel_sparc64_hpc_ace_double.c        |    4 +-
 .../nb_kernel_sparc64_hpc_ace_double.h        |    4 +-
 src/gromacs/gmxpreprocess/gen_vsite.c         |   52 +-
 src/gromacs/gmxpreprocess/readpull.c          |   14 +-
 src/gromacs/gmxpreprocess/toppush.c           |   16 +-
 .../legacyheaders/thread_mpi/atomic/gcc_x86.h |    2 +-
 .../legacyheaders/thread_mpi/atomic/xlc_ppc.h |    2 +-
 src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu    |  108 +-
 .../mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu  |  214 ++--
 .../mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh    |   84 +-
 .../nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh    |   32 +-
 .../mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh   |   12 +-
 .../mdlib/nbnxn_cuda/nbnxn_cuda_types.h       |    8 +-
 .../nbnxn_kernels/nbnxn_kernel_simd_utils.h   |    2 +-
 .../nbnxn_kernel_simd_utils_x86_mic.h         |    2 +-
 .../simd_2xnn/nbnxn_kernel_simd_2xnn_inner.h  |    4 +-
 .../simd_4xn/nbnxn_kernel_simd_4xn_inner.h    |   10 +-
 src/gromacs/mdlib/ns.c                        |    4 +-
 src/gromacs/mdlib/pme_simd4.h                 |   22 +-
 src/gromacs/mdlib/qm_orca.c                   |    6 +-
 src/gromacs/simd/macros.h                     |    8 +-
 src/gromacs/simd/math_single.h                |    2 +-
 src/gromacs/tools/dump.c                      |   18 +-
 src/gromacs/utility/init.cpp                  |    4 +-
 .../mdrun/tests/compressed_x_output.cpp       |    2 +-
 src/programs/mdrun/tests/rerun.cpp            |    2 +-
 44 files changed, 1234 insertions(+), 1033 deletions(-)
 create mode 100755 admin/reformat_all.sh

diff --git a/.gitattributes b/.gitattributes
index af6711a0b6..1e09923537 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -14,7 +14,7 @@ CMakeLists.txt  filter=copyright
 *.tex           filter=copyright
 *.bm            filter=copyright
 # Exceptions: extra files to include
-admin/uncrustify.sh                     filter=copyright
+admin/*.sh                              filter=copyright
 admin/git-pre-commit                    filter=copyright
 # Exceptions: files to exclude
 *.pc.cmakein                            !filter
diff --git a/admin/reformat_all.sh b/admin/reformat_all.sh
new file mode 100755
index 0000000000..23611ea70a
--- /dev/null
+++ b/admin/reformat_all.sh
@@ -0,0 +1,118 @@
+#!/bin/bash
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2014, by the GROMACS development team, led by
+# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+# and including many others, as listed in the AUTHORS file in the
+# top-level source directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+
+function usage() {
+    echo "usage: reformat_all.sh [-f|--force]"
+    echo "           [--filter=(uncrustify|copyright)] [<action>]"
+    echo "<action>: (list-files|uncrustify*|copyright) (*=default)"
+}
+
+filter=default
+force=
+action=uncrustify
+for arg in "$@" ; do
+    if [[ "$arg" == "list-files" || "$arg" == "uncrustify" ||
+          "$arg" == "copyright" ]] ; then
+        action=$arg
+    elif [[ "$arg" == --filter=* ]] ; then
+        filter=${arg#--filter=}
+    elif [[ "$arg" == "-f" || "$arg" == "--force" ]] ; then
+        force=1
+    else
+        echo "Unknown option: $arg"
+        echo
+        usage
+        exit 2
+    fi
+done
+
+if [[ ! "$force" && "$action" != "list-files" ]] ; then
+    if ! git diff-files --quiet ; then
+        echo "Modified files found in work tree. Use -f to override."
+        exit 1
+    fi
+fi
+
+case "$action" in
+    list-files)
+        command=cat
+        ;;
+    uncrustify)
+        # Check that uncrustify is present
+        if [ -z "$UNCRUSTIFY" ] ; then
+            echo "Please set the path to uncrustify using UNCRUSTIFY."
+            echo "Note that you need a custom version of uncrustify."
+            echo "See comments in the script file for how to get one."
+            exit 2
+        fi
+        if ! which "$UNCRUSTIFY" 1>/dev/null ; then
+            echo "Uncrustify not found: $UNCRUSTIFY"
+            exit 2
+        fi
+        command="xargs $UNCRUSTIFY -c admin/uncrustify.cfg --no-backup"
+        ;;
+    copyright)
+        command="xargs admin/copyright.py --check"
+        ;;
+    *)
+        echo "Unknown action: $action"
+        exit 2
+esac
+
+if [[ "$filter" == "default" ]] ; then
+    filter=$action
+fi
+
+case "$filter" in
+    uncrustify)
+        filter_re="(uncrustify|uncrustify_only)"
+        ;;
+    copyright)
+        filter_re="(uncrustify|copyright)"
+        ;;
+    *)
+        echo "Unknown filter mode: $filter"
+        echo
+        usage
+        exit 2
+esac
+
+cd `git rev-parse --show-toplevel`
+
+if ! git ls-tree -r --name-only HEAD | git check-attr --stdin filter | \
+    sed -nEe "/${filter_re}$/ {s/:.*//;p;}" | $command ; then
+    echo "The reformatting command failed! Please check the output."
+    exit 1
+fi
diff --git a/src/gromacs/fileio/gmxfio_int.h b/src/gromacs/fileio/gmxfio_int.h
index b0ffbc3349..9399a966b9 100644
--- a/src/gromacs/fileio/gmxfio_int.h
+++ b/src/gromacs/fileio/gmxfio_int.h
@@ -85,14 +85,14 @@ struct t_fileio
                     bDebug,            /* the file ops should come with debug info */
                     bStdio,            /* the file is actually stdin or stdout */
                     bReadWrite;        /* the file is open for reading and writing */
-    char       *fn;                    /* the file name */
-    XDR        *xdr;                   /* the xdr data pointer */
-    enum xdr_op xdrmode;               /* the xdr mode */
-    int         iFTP;                  /* the file type identifier */
+    char        *fn;                   /* the file name */
+    XDR         *xdr;                  /* the xdr data pointer */
+    enum xdr_op  xdrmode;              /* the xdr mode */
+    int          iFTP;                 /* the file type identifier */
 
-    const char *comment;               /* a comment string for debugging */
+    const char  *comment;              /* a comment string for debugging */
 
-    t_fileio   *next, *prev;           /* next and previous file pointers in the
+    t_fileio    *next, *prev;          /* next and previous file pointers in the
                                           linked list */
     tMPI_Lock_t  mtx;                  /* content locking mutex. This is a fast lock
                                           for performance reasons: in some cases every
diff --git a/src/gromacs/fileio/mdoutf.c b/src/gromacs/fileio/mdoutf.c
index eb08027c55..3e17d3b111 100644
--- a/src/gromacs/fileio/mdoutf.c
+++ b/src/gromacs/fileio/mdoutf.c
@@ -86,10 +86,10 @@ gmx_mdoutf_t init_mdoutf(int nfile, const t_filenm fnm[], int mdrun_flags,
     of->fp_dhdl      = NULL;
     of->fp_field     = NULL;
 
-    of->eIntegrator     = ir->eI;
-    of->bExpanded       = ir->bExpanded;
-    of->elamstats       = ir->expandedvals->elamstats;
-    of->simulation_part = ir->simulation_part;
+    of->eIntegrator             = ir->eI;
+    of->bExpanded               = ir->bExpanded;
+    of->elamstats               = ir->expandedvals->elamstats;
+    of->simulation_part         = ir->simulation_part;
     of->x_compression_precision = ir->x_compression_precision;
 
     if (MASTER(cr))
@@ -190,8 +190,8 @@ gmx_mdoutf_t init_mdoutf(int nfile, const t_filenm fnm[], int mdrun_flags,
            trajectory-writing routines later. Also, XTC writing needs
            to know what (and how many) atoms might be in the XTC
            groups, and how to look up later which ones they are. */
-        of->natoms_global = top_global->natoms;
-        of->groups        = &top_global->groups;
+        of->natoms_global       = top_global->natoms;
+        of->groups              = &top_global->groups;
         of->natoms_x_compressed = 0;
         for (i = 0; (i < top_global->natoms); i++)
         {
diff --git a/src/gromacs/fileio/timecontrol.c b/src/gromacs/fileio/timecontrol.c
index 6d9aed6824..1f084a1a0d 100644
--- a/src/gromacs/fileio/timecontrol.c
+++ b/src/gromacs/fileio/timecontrol.c
@@ -54,7 +54,7 @@ typedef struct {
     gmx_bool bSet;
 } t_timecontrol;
 
-static t_timecontrol timecontrol[TNR] = {
+static t_timecontrol       timecontrol[TNR] = {
     { 0, FALSE },
     { 0, FALSE },
     { 0, FALSE }
diff --git a/src/gromacs/fileio/tngio.cpp b/src/gromacs/fileio/tngio.cpp
index 44f68752bf..62c274df62 100644
--- a/src/gromacs/fileio/tngio.cpp
+++ b/src/gromacs/fileio/tngio.cpp
@@ -692,9 +692,9 @@ void gmx_fwrite_tng(tng_trajectory_t tng,
 #else
     static write_data_func_pointer           write_data           = tng_util_generic_with_time_write;
 #endif
-    double                  elapsedSeconds = elapsedPicoSeconds * PICO;
-    gmx_int64_t             nParticles;
-    char                    compression;
+    double                                   elapsedSeconds = elapsedPicoSeconds * PICO;
+    gmx_int64_t                              nParticles;
+    char                                     compression;
 
 
     if (!tng)
diff --git a/src/gromacs/fileio/tngio_for_tools.cpp b/src/gromacs/fileio/tngio_for_tools.cpp
index 9784f890dc..eea391694a 100644
--- a/src/gromacs/fileio/tngio_for_tools.cpp
+++ b/src/gromacs/fileio/tngio_for_tools.cpp
@@ -527,22 +527,22 @@ gmx_bool gmx_read_next_tng_frame(tng_trajectory_t            input,
                 for (int i = 0; i < DIM; i++)
                 {
                     convert_array_to_real_array((char *)(values) + size * i * DIM,
-                                                    (real *) fr->box[i],
-                                                    getDistanceScaleFactor(input),
-                                                    1,
-                                                    DIM,
-                                                    datatype);
+                                                (real *) fr->box[i],
+                                                getDistanceScaleFactor(input),
+                                                1,
+                                                DIM,
+                                                datatype);
                 }
                 fr->bBox = TRUE;
                 break;
             case TNG_TRAJ_POSITIONS:
                 srenew(fr->x, fr->natoms);
                 convert_array_to_real_array(values,
-                                                (real *) fr->x,
-                                                getDistanceScaleFactor(input),
-                                                fr->natoms,
-                                                DIM,
-                                                datatype);
+                                            (real *) fr->x,
+                                            getDistanceScaleFactor(input),
+                                            fr->natoms,
+                                            DIM,
+                                            datatype);
                 fr->bX = TRUE;
                 tng_util_frame_current_compression_get(input, blockId, &codecId, &prec);
                 /* This must be updated if/when more lossy compression methods are added */
@@ -555,11 +555,11 @@ gmx_bool gmx_read_next_tng_frame(tng_trajectory_t            input,
             case TNG_TRAJ_VELOCITIES:
                 srenew(fr->v, fr->natoms);
                 convert_array_to_real_array(values,
-                                                (real *) fr->v,
-                                                getDistanceScaleFactor(input),
-                                                fr->natoms,
-                                                DIM,
-                                                datatype);
+                                            (real *) fr->v,
+                                            getDistanceScaleFactor(input),
+                                            fr->natoms,
+                                            DIM,
+                                            datatype);
                 fr->bV = TRUE;
                 tng_util_frame_current_compression_get(input, blockId, &codecId, &prec);
                 /* This must be updated if/when more lossy compression methods are added */
@@ -572,11 +572,11 @@ gmx_bool gmx_read_next_tng_frame(tng_trajectory_t            input,
             case TNG_TRAJ_FORCES:
                 srenew(fr->f, fr->natoms);
                 convert_array_to_real_array(values,
-                                                (real *) fr->f,
-                                                getDistanceScaleFactor(input),
-                                                fr->natoms,
-                                                DIM,
-                                                datatype);
+                                            (real *) fr->f,
+                                            getDistanceScaleFactor(input),
+                                            fr->natoms,
+                                            DIM,
+                                            datatype);
                 fr->bF = TRUE;
                 break;
             case TNG_GMX_LAMBDA:
@@ -824,11 +824,11 @@ gmx_bool gmx_get_tng_data_next_frame_of_block_type(tng_trajectory_t     input,
     }
     snew(*values, sizeof(real) * *nValuesPerFrame * *nAtoms);
     convert_array_to_real_array(data,
-                                    *values,
-                                    getDistanceScaleFactor(input),
-                                    *nAtoms,
-                                    *nValuesPerFrame,
-                                    datatype);
+                                *values,
+                                getDistanceScaleFactor(input),
+                                *nAtoms,
+                                *nValuesPerFrame,
+                                datatype);
 
     tng_util_frame_current_compression_get(input, blockId, &codecId, &localPrec);
 
diff --git a/src/gromacs/gmxana/cmat.c b/src/gromacs/gmxana/cmat.c
index ee8438f32b..c0029e27d5 100644
--- a/src/gromacs/gmxana/cmat.c
+++ b/src/gromacs/gmxana/cmat.c
@@ -72,19 +72,19 @@ void copy_t_mat(t_mat *dst, t_mat *src)
 
     if (dst->nn != src->nn)
     {
-        fprintf(stderr, "t_mat structures not identical in size dst %d src %d\n",dst->nn,src->nn);
+        fprintf(stderr, "t_mat structures not identical in size dst %d src %d\n", dst->nn, src->nn);
         return;
     }
     dst->maxrms = src->maxrms;
     dst->minrms = src->minrms;
     dst->sumrms = src->sumrms;
-    for(i = 0; (i < src->nn); i++)
+    for (i = 0; (i < src->nn); i++)
     {
-        for(j = 0; (j < src->nn); j++)
+        for (j = 0; (j < src->nn); j++)
         {
             dst->mat[i][j] = src->mat[i][j];
         }
-        dst->erow[i] = src->erow[i];
+        dst->erow[i]  = src->erow[i];
         dst->m_ind[i] = src->m_ind[i];
     }
 }
diff --git a/src/gromacs/gmxlib/cuda_tools/copyrite_gpu.cu b/src/gromacs/gmxlib/cuda_tools/copyrite_gpu.cu
index 26eda4ed65..aa204263fe 100644
--- a/src/gromacs/gmxlib/cuda_tools/copyrite_gpu.cu
+++ b/src/gromacs/gmxlib/cuda_tools/copyrite_gpu.cu
@@ -45,13 +45,13 @@
 
 void gmx_print_version_info_gpu(FILE *fp)
 {
-    int cuda_driver,cuda_runtime;
-    fprintf(fp, "CUDA compiler:      %s\n",CUDA_NVCC_COMPILER_INFO);
-    fprintf(fp, "CUDA compiler flags:%s\n",CUDA_NVCC_COMPILER_FLAGS);
+    int cuda_driver, cuda_runtime;
+    fprintf(fp, "CUDA compiler:      %s\n", CUDA_NVCC_COMPILER_INFO);
+    fprintf(fp, "CUDA compiler flags:%s\n", CUDA_NVCC_COMPILER_FLAGS);
     cuda_driver = 0;
     cudaDriverGetVersion(&cuda_driver);
     cuda_runtime = 0;
     cudaRuntimeGetVersion(&cuda_runtime);
-    fprintf(fp, "CUDA driver:        %d.%d\n",cuda_driver/1000, cuda_driver%100);
-    fprintf(fp, "CUDA runtime:       %d.%d\n",cuda_runtime/1000, cuda_runtime%100);
+    fprintf(fp, "CUDA driver:        %d.%d\n", cuda_driver/1000, cuda_driver%100);
+    fprintf(fp, "CUDA runtime:       %d.%d\n", cuda_runtime/1000, cuda_runtime%100);
 }
diff --git a/src/gromacs/gmxlib/cuda_tools/cudautils.cu b/src/gromacs/gmxlib/cuda_tools/cudautils.cu
index 467c3ce0bb..e8f32b2332 100644
--- a/src/gromacs/gmxlib/cuda_tools/cudautils.cu
+++ b/src/gromacs/gmxlib/cuda_tools/cudautils.cu
@@ -46,13 +46,15 @@
  *
  *  The copy is launched in stream s or if not specified, in stream 0.
  */
-static int cu_copy_D2H_generic(void * h_dest, void * d_src, size_t bytes, 
+static int cu_copy_D2H_generic(void * h_dest, void * d_src, size_t bytes,
                                bool bAsync = false, cudaStream_t s = 0)
 {
     cudaError_t stat;
-    
+
     if (h_dest == NULL || d_src == NULL || bytes == 0)
+    {
         return -1;
+    }
 
     if (bAsync)
     {
@@ -83,9 +85,11 @@ int cu_copy_D2H_async(void * h_dest, void * d_src, size_t bytes, cudaStream_t s
 }
 
 int cu_copy_D2H_alloc(void ** h_dest, void * d_src, size_t bytes)
-{ 
+{
     if (h_dest == NULL || d_src == NULL || bytes == 0)
+    {
         return -1;
+    }
 
     smalloc(*h_dest, bytes);
 
@@ -96,13 +100,15 @@ int cu_copy_D2H_alloc(void ** h_dest, void * d_src, size_t bytes)
  *
  *  The copy is launched in stream s or if not specified, in stream 0.
  */
-static int cu_copy_H2D_generic(void * d_dest, void * h_src, size_t bytes, 
+static int cu_copy_H2D_generic(void * d_dest, void * h_src, size_t bytes,
                                bool bAsync = false, cudaStream_t s = 0)
 {
     cudaError_t stat;
 
     if (d_dest == NULL || h_src == NULL || bytes == 0)
+    {
         return -1;
+    }
 
     if (bAsync)
     {
@@ -119,7 +125,7 @@ static int cu_copy_H2D_generic(void * d_dest, void * h_src, size_t bytes,
 }
 
 int cu_copy_H2D(void * d_dest, void * h_src, size_t bytes)
-{   
+{
     return cu_copy_H2D_generic(d_dest, h_src, bytes, false);
 }
 
@@ -127,7 +133,7 @@ int cu_copy_H2D(void * d_dest, void * h_src, size_t bytes)
  *  The copy is launched in stream s or if not specified, in stream 0.
  */
 int cu_copy_H2D_async(void * d_dest, void * h_src, size_t bytes, cudaStream_t s = 0)
-{   
+{
     return cu_copy_H2D_generic(d_dest, h_src, bytes, true, s);
 }
 
@@ -136,7 +142,9 @@ int cu_copy_H2D_alloc(void ** d_dest, void * h_src, size_t bytes)
     cudaError_t stat;
 
     if (d_dest == NULL || h_src == NULL || bytes == 0)
+    {
         return -1;
+    }
 
     stat = cudaMalloc(d_dest, bytes);
     CU_RET_ERR(stat, "cudaMalloc failed in cu_copy_H2D_alloc");
@@ -146,7 +154,7 @@ int cu_copy_H2D_alloc(void ** d_dest, void * h_src, size_t bytes)
 
 float cu_event_elapsed(cudaEvent_t start, cudaEvent_t end)
 {
-    float t = 0.0;
+    float       t = 0.0;
     cudaError_t stat;
 
     stat = cudaEventElapsedTime(&t, start, end);
@@ -165,10 +173,10 @@ int cu_wait_event(cudaEvent_t e)
     return 0;
 }
 
-/*! 
+/*!
  *  If time != NULL it also calculates the time elapsed between start and end and
  *  return this is milliseconds.
- */ 
+ */
 int cu_wait_event_time(cudaEvent_t end, cudaEvent_t start, float *time)
 {
     cudaError_t s;
@@ -211,11 +219,11 @@ void cu_free_buffered(void *d_ptr, int *n, int *nalloc)
 }
 
 /*!
- *  Reallocation of the memory pointed by d_ptr and copying of the data from 
- *  the location pointed by h_src host-side pointer is done. Allocation is 
- *  buffered and therefore freeing is only needed if the previously allocated 
+ *  Reallocation of the memory pointed by d_ptr and copying of the data from
+ *  the location pointed by h_src host-side pointer is done. Allocation is
+ *  buffered and therefore freeing is only needed if the previously allocated
  *  space is not enough.
- *  The H2D copy is launched in stream s and can be done synchronously or 
+ *  The H2D copy is launched in stream s and can be done synchronously or
  *  asynchronously (the default is the latter).
  */
 void cu_realloc_buffered(void **d_dest, void *h_src,
@@ -232,7 +240,7 @@ void cu_realloc_buffered(void **d_dest, void *h_src,
         return;
     }
 
-    /* reallocate only if the data does not fit = allocation size is smaller 
+    /* reallocate only if the data does not fit = allocation size is smaller
        than the current requested size */
     if (req_size > *curr_alloc_size)
     {
diff --git a/src/gromacs/gmxlib/cuda_tools/cudautils.cuh b/src/gromacs/gmxlib/cuda_tools/cudautils.cuh
index 6ede841f3d..2cccc16c67 100644
--- a/src/gromacs/gmxlib/cuda_tools/cudautils.cuh
+++ b/src/gromacs/gmxlib/cuda_tools/cudautils.cuh
@@ -46,16 +46,16 @@
    with them (e.g. expected warp size = 32, check against the dev_info->props.warpsize). */
 #define WARP_SIZE           32
 
-/* TODO error checking needs to be rewritten. We have 2 types of error checks needed 
-   based on where they occur in the code: 
-   - non performance-critical: these errors are unsafe to be ignored and must be 
+/* TODO error checking needs to be rewritten. We have 2 types of error checks needed
+   based on where they occur in the code:
+   - non performance-critical: these errors are unsafe to be ignored and must be
      _always_ checked for, e.g. initializations
    - performance critical: handling errors might hurt performance so care need to be taken
-     when/if we should check for them at all, e.g. in cu_upload_X. However, we should be 
+     when/if we should check for them at all, e.g. in cu_upload_X. However, we should be
      able to turn the check for these errors on!
 
-  Probably we'll need two sets of the macros below... 
- 
+   Probably we'll need two sets of the macros below...
+
  */
 #define CHECK_CUDA_ERRORS
 
@@ -79,8 +79,8 @@
         } \
     } while (0)
 
-/*! Check for any previously occurred uncaught CUDA error 
-  -- aimed at use after kernel calls. */
+/*! Check for any previously occurred uncaught CUDA error
+   -- aimed at use after kernel calls. */
 #define CU_LAUNCH_ERR(msg) \
     do { \
         cudaError_t _CU_LAUNCH_ERR_status = cudaGetLastError(); \
@@ -89,8 +89,8 @@
         } \
     } while (0)
 
-/*! Synchronize with GPU and check for any previously occurred uncaught CUDA error 
-  -- aimed at use after kernel calls. */
+/*! Synchronize with GPU and check for any previously occurred uncaught CUDA error
+   -- aimed at use after kernel calls. */
 #define CU_LAUNCH_ERR_SYNC(msg) \
     do { \
         cudaError_t _CU_SYNC_LAUNCH_ERR_status = cudaThreadSynchronize(); \
@@ -106,7 +106,7 @@
 #define CU_LAUNCH_ERR(msg)      do { } while (0)
 #define CU_LAUNCH_ERR_SYNC(msg) do { } while (0)
 
-#endif /* CHECK_CUDA_ERRORS */ 
+#endif /* CHECK_CUDA_ERRORS */
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/src/gromacs/gmxlib/cuda_tools/pmalloc_cuda.cu b/src/gromacs/gmxlib/cuda_tools/pmalloc_cuda.cu
index 4e2933dd46..2566f9c418 100644
--- a/src/gromacs/gmxlib/cuda_tools/pmalloc_cuda.cu
+++ b/src/gromacs/gmxlib/cuda_tools/pmalloc_cuda.cu
@@ -40,8 +40,8 @@
 #include "cudautils.cuh"
 #include "pmalloc_cuda.h"
 
-/*! Allocates nbytes of page-locked memory. 
- *  This memory should always be freed using pfree (or with the page-locked 
+/*! Allocates nbytes of page-locked memory.
+ *  This memory should always be freed using pfree (or with the page-locked
  *  free functions provied by the CUDA library).
  */
 void pmalloc(void **h_ptr, size_t nbytes)
@@ -58,13 +58,13 @@ void pmalloc(void **h_ptr, size_t nbytes)
 
     CU_CHECK_PREV_ERR();
 
-    stat = cudaMallocHost(h_ptr, nbytes, flag);    
+    stat = cudaMallocHost(h_ptr, nbytes, flag);
     sprintf(strbuf, "cudaMallocHost of size %d bytes failed", (int)nbytes);
-    CU_RET_ERR(stat, strbuf);  
+    CU_RET_ERR(stat, strbuf);
 }
 
-/*! Allocates nbytes of page-locked memory with write-combining. 
- *  This memory should always be freed using pfree (or with the page-locked 
+/*! Allocates nbytes of page-locked memory with write-combining.
+ *  This memory should always be freed using pfree (or with the page-locked
  *  free functions provied by the CUDA library).
  */
 void pmalloc_wc(void **h_ptr, size_t nbytes)
@@ -81,21 +81,21 @@ void pmalloc_wc(void **h_ptr, size_t nbytes)
 
     CU_CHECK_PREV_ERR();
 
-    stat = cudaMallocHost(h_ptr, nbytes, flag);    
+    stat = cudaMallocHost(h_ptr, nbytes, flag);
     sprintf(strbuf, "cudaMallocHost of size %d bytes failed", (int)nbytes);
-    CU_RET_ERR(stat, strbuf);  
+    CU_RET_ERR(stat, strbuf);
 }
 
 /*! Frees page locked memory allocated with pmalloc.
- *  This function can safely be called also with a pointer to a page-locked 
+ *  This function can safely be called also with a pointer to a page-locked
  *  memory allocated directly with CUDA API calls.
  */
-void pfree(void *h_ptr) 
+void pfree(void *h_ptr)
 {
-    cudaError_t stat; 
+    cudaError_t stat;
 
     if (h_ptr == NULL)
-    {        
+    {
         return;
     }
 
diff --git a/src/gromacs/gmxlib/cuda_tools/vectype_ops.cuh b/src/gromacs/gmxlib/cuda_tools/vectype_ops.cuh
index a1413c17b6..c7935510c7 100644
--- a/src/gromacs/gmxlib/cuda_tools/vectype_ops.cuh
+++ b/src/gromacs/gmxlib/cuda_tools/vectype_ops.cuh
@@ -65,15 +65,15 @@ inline __host__ __device__ float3 operator*(float k, float3 a)
 {
     return make_float3(k * a.x, k * a.y, k * a.z);
 }
-inline __host__ __device__ void operator+=(float3 &a, float3 b)
+inline __host__ __device__ void operator += (float3 &a, float3 b)
 {
     a.x += b.x; a.y += b.y; a.z += b.z;
 }
-inline __host__ __device__ void operator+=(float3 &a, float4 b)
+inline __host__ __device__ void operator += (float3 &a, float4 b)
 {
     a.x += b.x; a.y += b.y; a.z += b.z;
 }
-inline __host__ __device__ void operator-=(float3 &a, float3 b)
+inline __host__ __device__ void operator -= (float3 &a, float3 b)
 {
     a.x -= b.x; a.y -= b.y; a.z -= b.z;
 }
@@ -93,7 +93,7 @@ inline __host__ __device__ float3 operator*(float3 a, float3 b)
 {
     return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
 }
-inline __host__ __device__ void operator*=(float3 &a, float3 b)
+inline __host__ __device__ void operator *= (float3 &a, float3 b)
 {
     a.x *= b.x; a.y *= b.y; a.z *= b.z;
 }
@@ -130,15 +130,15 @@ inline __host__ __device__ float4 operator*(float4 a, float k)
 {
     return make_float4(k * a.x, k * a.y, k * a.z, k * a.w);
 }
-inline __host__ __device__ void operator+=(float4 &a, float4 b)
+inline __host__ __device__ void operator += (float4 &a, float4 b)
 {
     a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
 }
-inline __host__ __device__ void operator+=(float4 &a, float3 b)
+inline __host__ __device__ void operator += (float4 &a, float3 b)
 {
     a.x += b.x; a.y += b.y; a.z += b.z;
 }
-inline __host__ __device__ void operator-=(float4 &a, float3 b)
+inline __host__ __device__ void operator -= (float4 &a, float3 b)
 {
     a.x -= b.x; a.y -= b.y; a.z -= b.z;
 }
diff --git a/src/gromacs/gmxlib/gmx_detect_hardware.c b/src/gromacs/gmxlib/gmx_detect_hardware.c
index 4332d1a701..2e56119e47 100644
--- a/src/gromacs/gmxlib/gmx_detect_hardware.c
+++ b/src/gromacs/gmxlib/gmx_detect_hardware.c
@@ -307,7 +307,7 @@ void gmx_check_hw_runconf_consistency(FILE                *fplog,
         ngpu_comp = hwinfo->gpu_info.ncuda_dev_compatible;
         ngpu_use  = hw_opt->gpu_opt.ncuda_dev_use;
 
-        sprintf(gpu_comp_plural, "%s", (ngpu_comp> 1) ? "s" : "");
+        sprintf(gpu_comp_plural, "%s", (ngpu_comp > 1) ? "s" : "");
         sprintf(gpu_use_plural,  "%s", (ngpu_use > 1) ? "s" : "");
 
         /* number of tMPI threads auto-adjusted */
diff --git a/src/gromacs/gmxlib/gmx_fatal.c b/src/gromacs/gmxlib/gmx_fatal.c
index 58e980b327..1481bde2ee 100644
--- a/src/gromacs/gmxlib/gmx_fatal.c
+++ b/src/gromacs/gmxlib/gmx_fatal.c
@@ -57,9 +57,9 @@
 
 #include "gromacs/legacyheaders/thread_mpi/threads.h"
 
-static gmx_bool bDebug         = FALSE;
-static char    *fatal_tmp_file = NULL;
-static FILE    *log_file       = NULL;
+static gmx_bool            bDebug         = FALSE;
+static char               *fatal_tmp_file = NULL;
+static FILE               *log_file       = NULL;
 
 static tMPI_Thread_mutex_t debug_mutex     = TMPI_THREAD_MUTEX_INITIALIZER;
 static tMPI_Thread_mutex_t where_mutex     = TMPI_THREAD_MUTEX_INITIALIZER;
@@ -299,10 +299,10 @@ void gmx_fatal_collective(int f_errno, const char *file, int line,
                           const char *fmt, ...)
 {
     gmx_bool    bFinalize;
-    va_list ap;
-    char    msg[STRLEN];
+    va_list     ap;
+    char        msg[STRLEN];
 #ifdef GMX_MPI
-    int     result;
+    int         result;
 #endif
 
     bFinalize = TRUE;
@@ -388,8 +388,8 @@ void _unexpected_eof(const char *fn, int line, const char *srcfn, int srcline)
  * 0 to 3 of these filed are redirected to /dev/null
  *
  */
-FILE *debug           = NULL;
-gmx_bool gmx_debug_at = FALSE;
+FILE    *debug           = NULL;
+gmx_bool gmx_debug_at    = FALSE;
 
 void init_debug(const int dbglevel, const char *dbgfile)
 {
@@ -454,7 +454,7 @@ void doexceptions(void)
 
 static const char *gmxuser = "Please report this to the mailing list (gmx-users@gromacs.org)";
 
-static void (*gmx_error_handler)(const char *msg) = quit_gmx;
+static void        (*gmx_error_handler)(const char *msg) = quit_gmx;
 
 void set_gmx_error_handler(void (*func)(const char *msg))
 {
@@ -483,8 +483,8 @@ char *gmx_strerror(const char *key)
         { "range",  "Range checking error" }
     };
 #define NMSG asize(msg)
-    char buf[1024];
-    size_t i;
+    char        buf[1024];
+    size_t      i;
 
     if (key == NULL)
     {
@@ -514,9 +514,9 @@ char *gmx_strerror(const char *key)
 
 void _gmx_error(const char *key, const char *msg, const char *file, int line)
 {
-    char buf[10240], errerrbuf[1024];
+    char        buf[10240], errerrbuf[1024];
     const char *llines = "-------------------------------------------------------";
-    char *strerr;
+    char       *strerr;
 
     /* protect the audience from suggestive discussions */
 
@@ -564,7 +564,7 @@ void _range_check(int n, int n_min, int n_max, const char *warn_str,
 void gmx_warning(const char *fmt, ...)
 {
     va_list ap;
-    char msg[STRLEN];
+    char    msg[STRLEN];
 
     va_start(ap, fmt);
     vsprintf(msg, fmt, ap);
diff --git a/src/gromacs/gmxlib/gpu_utils/gpu_utils.cu b/src/gromacs/gmxlib/gpu_utils/gpu_utils.cu
index 03d9ac2ea9..5c80ac2f2d 100644
--- a/src/gromacs/gmxlib/gpu_utils/gpu_utils.cu
+++ b/src/gromacs/gmxlib/gpu_utils/gpu_utils.cu
@@ -45,57 +45,67 @@
 #include "../cuda_tools/cudautils.cuh"
 #include "memtestG80_core.h"
 
+/** Amount of memory to be used in quick memtest. */
+#define QUICK_MEM       250
+/** Bit flag with type of tests to run in quick memtest. */
+#define QUICK_TESTS     MOD_20_32BIT | LOGIC_4_ITER_SHMEM | RANDOM_BLOCKS
+/** Number of iterations in quick memtest. */
+#define QUICK_ITER      3
 
-#define QUICK_MEM       250 /*!< Amount of memory to be used in quick memtest. */
-#define QUICK_TESTS     MOD_20_32BIT | LOGIC_4_ITER_SHMEM | RANDOM_BLOCKS /*!< Bit flag with type of tests
-                                                                            to run in quick memtest. */
-#define QUICK_ITER      3 /*!< Number of iterations in quick memtest. */
+/** Bitflag with all test set on for full memetest. */
+#define FULL_TESTS      0x3FFF
+/** Number of iterations in full memtest. */
+#define FULL_ITER       25
 
-#define FULL_TESTS      0x3FFF /*!<  Bitflag with all test set on for full memetest. */
-#define FULL_ITER       25 /*!< Number of iterations in full memtest. */
+/** Bit flag with type of tests to run in time constrained memtest. */
+#define TIMED_TESTS     MOD_20_32BIT | LOGIC_4_ITER_SHMEM | RANDOM_BLOCKS
 
-#define TIMED_TESTS     MOD_20_32BIT | LOGIC_4_ITER_SHMEM | RANDOM_BLOCKS /*!< Bit flag with type of tests to
-                                                                            run in time constrained memtest. */
-
-static int cuda_max_device_count = 32; /*! Max number of devices supported by CUDA (for consistency checking).
-                                           In reality it 16 with CUDA <=v5.0, but let's stay on the safe side. */
+/*! \brief
+ * Max number of devices supported by CUDA (for consistency checking).
+ *
+ * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
+ */
+static int cuda_max_device_count = 32;
 
-/*! Dummy kernel used for sanity checking. */
-__global__ void k_dummy_test(){}
+/** Dummy kernel used for sanity checking. */
+__global__ void k_dummy_test()
+{
+}
 
 
-/*! Bit-flags which refer to memtestG80 test types and are used in do_memtest to specify which tests to run. */
+/** Bit-flags which refer to memtestG80 test types and are used in do_memtest
+ * to specify which tests to run. */
 enum memtest_G80_test_types {
-    MOVING_INVERSIONS_10 =      0x1,
-    MOVING_INVERSIONS_RAND =    0x2,
-    WALKING_8BIT_M86 =          0x4,
-    WALKING_0_8BIT =            0x8,
-    WALKING_1_8BIT =            0x10,
-    WALKING_0_32BIT =           0x20,
-    WALKING_1_32BIT =           0x40,
-    RANDOM_BLOCKS =             0x80,
-    MOD_20_32BIT =              0x100,
-    LOGIC_1_ITER =              0x200,
-    LOGIC_4_ITER =              0x400,
-    LOGIC_1_ITER_SHMEM =        0x800,
-    LOGIC_4_ITER_SHMEM =        0x1000
+    MOVING_INVERSIONS_10   = 0x1,
+    MOVING_INVERSIONS_RAND = 0x2,
+    WALKING_8BIT_M86       = 0x4,
+    WALKING_0_8BIT         = 0x8,
+    WALKING_1_8BIT         = 0x10,
+    WALKING_0_32BIT        = 0x20,
+    WALKING_1_32BIT        = 0x40,
+    RANDOM_BLOCKS          = 0x80,
+    MOD_20_32BIT           = 0x100,
+    LOGIC_1_ITER           = 0x200,
+    LOGIC_4_ITER           = 0x400,
+    LOGIC_1_ITER_SHMEM     = 0x800,
+    LOGIC_4_ITER_SHMEM     = 0x1000
 };
 
 
-/*! 
-  * \brief Runs GPU sanity checks.
-  *
-  * Runs a series of checks to determine that the given GPU and underlying CUDA
-  * driver/runtime functions properly.
-  * Returns properties of a device with given ID or the one that has
-  * already been initialized earlier in the case if of \dev_id == -1.
-  *
-  * \param[in]  dev_id      the device ID of the GPU or -1 if the device has already been initialized
-  * \param[out] dev_prop    pointer to the structure in which the device properties will be returned
-  * \returns                0 if the device looks OK
-  *
-  * TODO: introduce errors codes and handle errors more smoothly.
-  */
+/*!
+ * \brief Runs GPU sanity checks.
+ *
+ * Runs a series of checks to determine that the given GPU and underlying CUDA
+ * driver/runtime functions properly.
+ * Returns properties of a device with given ID or the one that has
+ * already been initialized earlier in the case if of \dev_id == -1.
+ *
+ * \param[in]  dev_id      the device ID of the GPU or -1 if the device has already been initialized
+ * \param[out] dev_prop    pointer to the structure in which the device properties will be returned
+ * \returns                0 if the device looks OK
+ *
+ * TODO: introduce errors codes and handle errors more smoothly.
+ */
 static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
 {
     cudaError_t cu_err;
@@ -104,18 +114,22 @@ static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
     cu_err = cudaGetDeviceCount(&dev_count);
     if (cu_err != cudaSuccess)
     {
-       fprintf(stderr, "Error %d while querying device count: %s\n", cu_err,
-               cudaGetErrorString(cu_err));
+        fprintf(stderr, "Error %d while querying device count: %s\n", cu_err,
+                cudaGetErrorString(cu_err));
         return -1;
     }
 
     /* no CUDA compatible device at all */
     if (dev_count == 0)
+    {
         return -1;
+    }
 
     /* things might go horribly wrong if cudart is not compatible with the driver */
     if (dev_count < 0 || dev_count > cuda_max_device_count)
+    {
         return -1;
+    }
 
     if (dev_id == -1) /* device already selected let's not destroy the context */
     {
@@ -149,10 +163,14 @@ static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
 
     /* both major & minor is 9999 if no CUDA capable devices are present */
     if (dev_prop->major == 9999 && dev_prop->minor == 9999)
+    {
         return -1;
+    }
     /* we don't care about emulation mode */
     if (dev_prop->major == 0)
+    {
         return -1;
+    }
 
     if (id != -1)
     {
@@ -166,7 +184,7 @@ static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
     }
 
     /* try to execute a dummy kernel */
-    k_dummy_test<<<1, 512>>>();
+    k_dummy_test<<< 1, 512>>> ();
     if (cudaThreadSynchronize() != cudaSuccess)
     {
         return -1;
@@ -212,7 +230,9 @@ static int do_memtest(unsigned int which_tests, int megs, int iter)
 
     // let's try to allocate the mem
     while (!tester.allocate(megs) && (megs - 10 > 0))
-        { megs -= 10; tester.deallocate(); }
+    {
+        megs -= 10; tester.deallocate();
+    }
 
     if (megs <= 10)
     {
@@ -229,25 +249,31 @@ static int do_memtest(unsigned int which_tests, int megs, int iter)
         {
             tester.gpuMovingInversionsOnesZeros(err_count);
             if (err_count > 0)
+            {
                 return MOVING_INVERSIONS_10;
+            }
         }
         // Moving Inversions (random)
         if ((MOVING_INVERSIONS_RAND & which_tests) == MOVING_INVERSIONS_RAND)
         {
             tester.gpuMovingInversionsRandom(err_count);
             if (err_count > 0)
+            {
                 return MOVING_INVERSIONS_RAND;
+            }
         }
-       // Memtest86 Walking 8-bit
+        // Memtest86 Walking 8-bit
         if ((WALKING_8BIT_M86 & which_tests) == WALKING_8BIT_M86)
         {
             for (uint shift = 0; shift < 8; shift++)
             {
                 tester.gpuWalking8BitM86(err_count, shift);
                 if (err_count > 0)
+                {
                     return WALKING_8BIT_M86;
+                }
             }
-      }
+        }
         // True Walking zeros (8-bit)
         if ((WALKING_0_8BIT & which_tests) == WALKING_0_8BIT)
         {
@@ -255,7 +281,9 @@ static int do_memtest(unsigned int which_tests, int megs, int iter)
             {
                 tester.gpuWalking8Bit(err_count, false, shift);
                 if (err_count > 0)
+                {
                     return WALKING_0_8BIT;
+                }
             }
         }
         // True Walking ones (8-bit)
@@ -265,7 +293,9 @@ static int do_memtest(unsigned int which_tests, int megs, int iter)
             {
                 tester.gpuWalking8Bit(err_count, true, shift);
                 if (err_count > 0)
+                {
                     return WALKING_1_8BIT;
+                }
             }
         }
         // Memtest86 Walking zeros (32-bit)
@@ -275,25 +305,31 @@ static int do_memtest(unsigned int which_tests, int megs, int iter)
             {
                 tester.gpuWalking32Bit(err_count, false, shift);
                 if (err_count > 0)
+                {
                     return WALKING_0_32BIT;
+                }
             }
         }
-       // Memtest86 Walking ones (32-bit)
+        // Memtest86 Walking ones (32-bit)
         if ((WALKING_1_32BIT & which_tests) == WALKING_1_32BIT)
         {
             for (uint shift = 0; shift < 32; shift++)
             {
                 tester.gpuWalking32Bit(err_count, true, shift);
                 if (err_count > 0)
+                {
                     return WALKING_1_32BIT;
+                }
             }
-       }
+        }
         // Random blocks
         if ((RANDOM_BLOCKS & which_tests) == RANDOM_BLOCKS)
         {
-            tester.gpuRandomBlocks(err_count,rand());
+            tester.gpuRandomBlocks(err_count, rand());
             if (err_count > 0)
+            {
                 return RANDOM_BLOCKS;
+            }
 
         }
 
@@ -304,37 +340,47 @@ static int do_memtest(unsigned int which_tests, int megs, int iter)
             {
                 tester.gpuModuloX(err_count, shift, rand(), 20, 2);
                 if (err_count > 0)
+                {
                     return MOD_20_32BIT;
+                }
             }
         }
         // Logic (one iteration)
         if ((LOGIC_1_ITER & which_tests) == LOGIC_1_ITER)
         {
-            tester.gpuShortLCG0(err_count,1);
+            tester.gpuShortLCG0(err_count, 1);
             if (err_count > 0)
+            {
                 return LOGIC_1_ITER;
+            }
         }
         // Logic (4 iterations)
         if ((LOGIC_4_ITER & which_tests) == LOGIC_4_ITER)
         {
-            tester.gpuShortLCG0(err_count,4);
+            tester.gpuShortLCG0(err_count, 4);
             if (err_count > 0)
+            {
                 return LOGIC_4_ITER;
+            }
 
         }
         // Logic (shared memory, one iteration)
         if ((LOGIC_1_ITER_SHMEM & which_tests) == LOGIC_1_ITER_SHMEM)
         {
-            tester.gpuShortLCG0Shmem(err_count,1);
+            tester.gpuShortLCG0Shmem(err_count, 1);
             if (err_count > 0)
+            {
                 return LOGIC_1_ITER_SHMEM;
+            }
         }
         // Logic (shared-memory, 4 iterations)
         if ((LOGIC_4_ITER_SHMEM & which_tests) == LOGIC_4_ITER_SHMEM)
         {
-            tester.gpuShortLCG0Shmem(err_count,4);
+            tester.gpuShortLCG0Shmem(err_count, 4);
             if (err_count > 0)
+            {
                 return LOGIC_4_ITER_SHMEM;
+            }
         }
     }
 
@@ -353,9 +399,12 @@ static int do_memtest(unsigned int which_tests, int megs, int iter)
 int do_quick_memtest(int dev_id)
 {
     cudaDeviceProp  dev_prop;
-    int             devmem, res, time=0;
+    int             devmem, res, time = 0;
 
-    if (debug) { time = getTimeMilliseconds(); }
+    if (debug)
+    {
+        time = getTimeMilliseconds();
+    }
 
     if (do_sanity_checks(dev_id, &dev_prop) != 0)
     {
@@ -367,7 +416,7 @@ int do_quick_memtest(int dev_id)
     {
         devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
         fprintf(debug, ">> Running QUICK memtests on %d MiB (out of total %d MiB), %d iterations\n",
-            QUICK_MEM, devmem, QUICK_ITER);
+                QUICK_MEM, devmem, QUICK_ITER);
     }
 
     res = do_memtest(QUICK_TESTS, QUICK_MEM, QUICK_ITER);
@@ -379,7 +428,10 @@ int do_quick_memtest(int dev_id)
     }
 
     /* destroy context only if we created it */
-    if (dev_id !=-1) cudaThreadExit();
+    if (dev_id != -1)
+    {
+        cudaThreadExit();
+    }
     return res;
 }
 
@@ -395,9 +447,12 @@ int do_quick_memtest(int dev_id)
 int do_full_memtest(int dev_id)
 {
     cudaDeviceProp  dev_prop;
-    int             devmem, res, time=0;
+    int             devmem, res, time = 0;
 
-    if (debug) { time = getTimeMilliseconds(); }
+    if (debug)
+    {
+        time = getTimeMilliseconds();
+    }
 
     if (do_sanity_checks(dev_id, &dev_prop) != 0)
     {
@@ -407,10 +462,10 @@ int do_full_memtest(int dev_id)
 
     devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
 
-    if (debug) 
-    { 
+    if (debug)
+    {
         fprintf(debug, ">> Running FULL memtests on %d MiB (out of total %d MiB), %d iterations\n",
-            devmem, devmem, FULL_ITER); 
+                devmem, devmem, FULL_ITER);
     }
 
     /* do all test on the entire memory */
@@ -423,7 +478,10 @@ int do_full_memtest(int dev_id)
     }
 
     /* destroy context only if we created it */
-    if (dev_id != -1) cudaThreadExit();
+    if (dev_id != -1)
+    {
+        cudaThreadExit();
+    }
     return res;
 }
 
@@ -440,12 +498,15 @@ int do_full_memtest(int dev_id)
 int do_timed_memtest(int dev_id, int time_constr)
 {
     cudaDeviceProp  dev_prop;
-    int             devmem, res=0, time=0, startt;
+    int             devmem, res = 0, time = 0, startt;
 
-    if (debug) { time = getTimeMilliseconds(); }
+    if (debug)
+    {
+        time = getTimeMilliseconds();
+    }
 
     time_constr *= 1000;  /* convert to ms for convenience */
-    startt = getTimeMilliseconds();
+    startt       = getTimeMilliseconds();
 
     if (do_sanity_checks(dev_id, &dev_prop) != 0)
     {
@@ -455,18 +516,21 @@ int do_timed_memtest(int dev_id, int time_constr)
 
     devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
 
-    if (debug) 
-    { 
+    if (debug)
+    {
         fprintf(debug, ">> Running time constrained memtests on %d MiB (out of total %d MiB), time limit of %d s \n",
-        devmem, devmem, time_constr); 
+                devmem, devmem, time_constr);
     }
 
-    /* do the TIMED_TESTS set, one step at a time on the entire memory 
+    /* do the TIMED_TESTS set, one step at a time on the entire memory
        that can be allocated, and stop when the given time is exceeded */
     while ( ((int)getTimeMilliseconds() - startt) < time_constr)
-    {        
+    {
         res = do_memtest(TIMED_TESTS, devmem, 1);
-        if (res != 0) break;
+        if (res != 0)
+        {
+            break;
+        }
     }
 
     if (debug)
@@ -476,7 +540,10 @@ int do_timed_memtest(int dev_id, int time_constr)
     }
 
     /* destroy context only if we created it */
-    if (dev_id != -1) cudaThreadExit();
+    if (dev_id != -1)
+    {
+        cudaThreadExit();
+    }
     return res;
 }
 
@@ -497,8 +564,8 @@ gmx_bool init_gpu(int mygpu, char *result_str,
                   const gmx_gpu_opt_t *gpu_opt)
 {
     cudaError_t stat;
-    char sbuf[STRLEN];
-    int gpuid;
+    char        sbuf[STRLEN];
+    int         gpuid;
 
     assert(gpu_info);
     assert(result_str);
@@ -507,7 +574,7 @@ gmx_bool init_gpu(int mygpu, char *result_str,
     {
         sprintf(sbuf, "Trying to initialize an inexistent GPU: "
                 "there are %d %s-selected GPU(s), but #%d was requested.",
-                 gpu_opt->ncuda_dev_use, gpu_opt->bUserSet ? "user" : "auto", mygpu);
+                gpu_opt->ncuda_dev_use, gpu_opt->bUserSet ? "user" : "auto", mygpu);
         gmx_incons(sbuf);
     }
 
@@ -643,9 +710,9 @@ static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
  */
 int detect_cuda_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
 {
-    int             i, ndev, checkres, retval;
-    cudaError_t     stat;
-    cudaDeviceProp  prop;
+    int              i, ndev, checkres, retval;
+    cudaError_t      stat;
+    cudaDeviceProp   prop;
     cuda_dev_info_t *devs;
 
     assert(gpu_info);
@@ -666,7 +733,7 @@ int detect_cuda_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
          * busy in exclusive mode, or some other condition which should
          * result in us issuing a warning a falling back to CPUs. */
         retval = -1;
-        s = cudaGetErrorString(stat);
+        s      = cudaGetErrorString(stat);
         strncpy(err_str, s, STRLEN*sizeof(err_str[0]));
     }
     else
@@ -708,9 +775,9 @@ int detect_cuda_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
  * \param[in,out] gpu_opt     pointer to structure holding GPU options
  */
 void pick_compatible_gpus(const gmx_gpu_info_t *gpu_info,
-                          gmx_gpu_opt_t *gpu_opt)
+                          gmx_gpu_opt_t        *gpu_opt)
 {
-    int i, ncompat;
+    int  i, ncompat;
     int *compat;
 
     assert(gpu_info);
@@ -746,11 +813,11 @@ void pick_compatible_gpus(const gmx_gpu_info_t *gpu_info,
  * \param[out]  gpu_opt     pointer to structure holding GPU options
  * \returns                 TRUE if every the requested GPUs are compatible
  */
-gmx_bool check_selected_cuda_gpus(int *checkres,
+gmx_bool check_selected_cuda_gpus(int                  *checkres,
                                   const gmx_gpu_info_t *gpu_info,
-                                  gmx_gpu_opt_t *gpu_opt)
+                                  gmx_gpu_opt_t        *gpu_opt)
 {
-    int i, id;
+    int  i, id;
     bool bAllOk;
 
     assert(checkres);
@@ -820,7 +887,7 @@ void get_gpu_device_info_string(char *s, const gmx_gpu_info_t *gpu_info, int ind
 
     cuda_dev_info_t *dinfo = &gpu_info->cuda_dev[index];
 
-    bool bGpuExists =
+    bool             bGpuExists =
         dinfo->stat == egpuCompatible ||
         dinfo->stat == egpuIncompatible;
 
@@ -852,8 +919,8 @@ void get_gpu_device_info_string(char *s, const gmx_gpu_info_t *gpu_info, int ind
  * \returns                 device ID of the requested GPU
  */
 int get_gpu_device_id(const gmx_gpu_info_t *gpu_info,
-                      const gmx_gpu_opt_t *gpu_opt,
-                      int idx)
+                      const gmx_gpu_opt_t  *gpu_opt,
+                      int                   idx)
 {
     assert(gpu_info);
     assert(gpu_opt);
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/kernelutil_sparc64_hpc_ace_double.h b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/kernelutil_sparc64_hpc_ace_double.h
index 0be0994510..6f01d2dfef 100644
--- a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/kernelutil_sparc64_hpc_ace_double.h
+++ b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/kernelutil_sparc64_hpc_ace_double.h
@@ -38,30 +38,30 @@
 /* Fujitsu header borrows the name from SSE2, since some instructions have aliases */
 #include <emmintrin.h>
 
-#define GMX_FJSP_SHUFFLE2(x,y) (((x)<<1) | (y))
+#define GMX_FJSP_SHUFFLE2(x, y) (((x)<<1) | (y))
 
 #define GMX_FJSP_TRANSPOSE2_V2R8(row0, row1) {           \
-    _fjsp_v2r8 __gmx_t1 = row0;                          \
-    row0           = _fjsp_unpacklo_v2r8(row0,row1);     \
-    row1           = _fjsp_unpackhi_v2r8(__gmx_t1,row1); \
+        _fjsp_v2r8 __gmx_t1 = row0;                          \
+        row0           = _fjsp_unpacklo_v2r8(row0, row1);     \
+        row1           = _fjsp_unpackhi_v2r8(__gmx_t1, row1); \
 }
 
 
 static void
 gmx_fjsp_print_v2r8(const char *s, _fjsp_v2r8 a)
 {
-  double lo,hi;
+    double lo, hi;
 
-  _fjsp_storel_v2r8(&lo,a);
-  _fjsp_storeh_v2r8(&hi,a);
-  printf("%s: %g %g\n",s,lo,hi);
+    _fjsp_storel_v2r8(&lo, a);
+    _fjsp_storeh_v2r8(&hi, a);
+    printf("%s: %g %g\n", s, lo, hi);
 }
 
 
 static _fjsp_v2r8
 gmx_fjsp_set1_v2r8(double d)
 {
-    return _fjsp_set_v2r8(d,d);
+    return _fjsp_set_v2r8(d, d);
 }
 
 static _fjsp_v2r8
@@ -80,10 +80,10 @@ gmx_fjsp_any_lt_v2r8(_fjsp_v2r8 a, _fjsp_v2r8 b)
         long long int    i;
     }
     conv;
-    
-    a = _fjsp_cmplt_v2r8(a,b);
-    a = _fjsp_or_v2r8(a, _fjsp_unpackhi_v2r8(a,a));
-    _fjsp_storel_v2r8(&(conv.d),a);
+
+    a = _fjsp_cmplt_v2r8(a, b);
+    a = _fjsp_or_v2r8(a, _fjsp_unpackhi_v2r8(a, a));
+    _fjsp_storel_v2r8(&(conv.d), a);
     return (conv.i != 0);
 }
 
@@ -93,19 +93,19 @@ gmx_fjsp_invsqrt_v2r8(_fjsp_v2r8 x)
 {
     const _fjsp_v2r8 half  = gmx_fjsp_set1_v2r8(0.5);
     const _fjsp_v2r8 three = gmx_fjsp_set1_v2r8(3.0);
-    _fjsp_v2r8 lu = _fjsp_rsqrta_v2r8(x);
-    
-    lu = _fjsp_mul_v2r8(_fjsp_mul_v2r8(half,lu),_fjsp_nmsub_v2r8(_fjsp_mul_v2r8(lu,lu),x,three));
+    _fjsp_v2r8       lu    = _fjsp_rsqrta_v2r8(x);
+
+    lu = _fjsp_mul_v2r8(_fjsp_mul_v2r8(half, lu), _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(lu, lu), x, three));
     /* The HPC-ACE instruction set is only available in double precision, while
-     * single precision is typically sufficient for Gromacs. If you define 
-     * "GMX_RELAXED_DOUBLE_PRECISION" during compile, we stick to two Newton-Raphson 
-     * iterations and accept 32bits of accuracy in 1.0/sqrt(x) and 1.0/x, rather than full 
+     * single precision is typically sufficient for Gromacs. If you define
+     * "GMX_RELAXED_DOUBLE_PRECISION" during compile, we stick to two Newton-Raphson
+     * iterations and accept 32bits of accuracy in 1.0/sqrt(x) and 1.0/x, rather than full
      * double precision (53 bits). This is still clearly higher than single precision (24 bits).
      */
 #ifndef GMX_RELAXED_DOUBLE_PRECISION
-    lu = _fjsp_mul_v2r8(_fjsp_mul_v2r8(half,lu),_fjsp_nmsub_v2r8(_fjsp_mul_v2r8(lu,lu),x,three));
+    lu = _fjsp_mul_v2r8(_fjsp_mul_v2r8(half, lu), _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(lu, lu), x, three));
 #endif
-    return _fjsp_mul_v2r8(_fjsp_mul_v2r8(half,lu),_fjsp_nmsub_v2r8(_fjsp_mul_v2r8(lu,lu),x,three));
+    return _fjsp_mul_v2r8(_fjsp_mul_v2r8(half, lu), _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(lu, lu), x, three));
 }
 
 
@@ -113,11 +113,11 @@ gmx_fjsp_invsqrt_v2r8(_fjsp_v2r8 x)
 static gmx_inline _fjsp_v2r8
 gmx_fjsp_inv_v2r8(_fjsp_v2r8 x)
 {
-    const _fjsp_v2r8 two  = gmx_fjsp_set1_v2r8(2.0);    
-    __m128d lu = _fjsp_rcpa_v2r8(x);
-    
+    const _fjsp_v2r8 two  = gmx_fjsp_set1_v2r8(2.0);
+    __m128d          lu   = _fjsp_rcpa_v2r8(x);
+
     /* Perform three N-R steps for double precision */
-    lu         = _fjsp_mul_v2r8(lu,_fjsp_nmsub_v2r8(lu,x,two));
+    lu         = _fjsp_mul_v2r8(lu, _fjsp_nmsub_v2r8(lu, x, two));
     /* The HPC-ACE instruction set is only available in double precision, while
      * single precision is typically sufficient for Gromacs. If you define
      * "GMX_RELAXED_DOUBLE_PRECISION" during compile, we stick to two Newton-Raphson
@@ -125,20 +125,20 @@ gmx_fjsp_inv_v2r8(_fjsp_v2r8 x)
      * double precision (53 bits). This is still clearly higher than single precision (24 bits).
      */
 #ifndef GMX_RELAXED_DOUBLE_PRECISION
-    lu         = _fjsp_mul_v2r8(lu,_fjsp_nmsub_v2r8(lu,x,two));
+    lu         = _fjsp_mul_v2r8(lu, _fjsp_nmsub_v2r8(lu, x, two));
 #endif
-    return _fjsp_mul_v2r8(lu,_fjsp_nmsub_v2r8(lu,x,two));
+    return _fjsp_mul_v2r8(lu, _fjsp_nmsub_v2r8(lu, x, two));
 }
 
 
 static gmx_inline _fjsp_v2r8
 gmx_fjsp_calc_rsq_v2r8(_fjsp_v2r8 dx, _fjsp_v2r8 dy, _fjsp_v2r8 dz)
 {
-    return _fjsp_madd_v2r8(dx,dx,_fjsp_madd_v2r8(dy,dy,_fjsp_mul_v2r8(dz,dz)));
+    return _fjsp_madd_v2r8(dx, dx, _fjsp_madd_v2r8(dy, dy, _fjsp_mul_v2r8(dz, dz)));
 }
 
 /* Normal sum of four ymm registers */
-#define gmx_fjsp_sum4_v2r8(t0,t1,t2,t3)  _fjsp_add_v2r8(_fjsp_add_v2r8(t0,t1),_fjsp_add_v2r8(t2,t3))
+#define gmx_fjsp_sum4_v2r8(t0, t1, t2, t3)  _fjsp_add_v2r8(_fjsp_add_v2r8(t0, t1), _fjsp_add_v2r8(t2, t3))
 
 
 
@@ -148,263 +148,263 @@ static _fjsp_v2r8
 gmx_fjsp_load_2real_swizzle_v2r8(const double * gmx_restrict ptrA,
                                  const double * gmx_restrict ptrB)
 {
-    return _fjsp_unpacklo_v2r8(_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA),_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB));
+    return _fjsp_unpacklo_v2r8(_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA), _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB));
 }
 
 static _fjsp_v2r8
 gmx_fjsp_load_1real_v2r8(const double * gmx_restrict ptrA)
 {
-    return _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA);
+    return _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA);
 }
 
 
 static void
 gmx_fjsp_store_2real_swizzle_v2r8(double * gmx_restrict ptrA,
-                                double * gmx_restrict ptrB,
-                                _fjsp_v2r8 xmm1)
+                                  double * gmx_restrict ptrB,
+                                  _fjsp_v2r8            xmm1)
 {
     _fjsp_v2r8 t2;
-    
-    t2       = _fjsp_unpackhi_v2r8(xmm1,xmm1);
-    _fjsp_storel_v2r8(ptrA,xmm1);                                           
-    _fjsp_storel_v2r8(ptrB,t2);                                         
+
+    t2       = _fjsp_unpackhi_v2r8(xmm1, xmm1);
+    _fjsp_storel_v2r8(ptrA, xmm1);
+    _fjsp_storel_v2r8(ptrB, t2);
 }
 
 static void
 gmx_fjsp_store_1real_v2r8(double * gmx_restrict ptrA, _fjsp_v2r8 xmm1)
 {
-    _fjsp_storel_v2r8(ptrA,xmm1);
+    _fjsp_storel_v2r8(ptrA, xmm1);
 }
 
 
 /* Similar to store, but increments value in memory */
 static void
 gmx_fjsp_increment_2real_swizzle_v2r8(double * gmx_restrict ptrA,
-                                    double * gmx_restrict ptrB, _fjsp_v2r8 xmm1)
+                                      double * gmx_restrict ptrB, _fjsp_v2r8 xmm1)
 {
     _fjsp_v2r8 t1;
-    
-    t1   = _fjsp_unpackhi_v2r8(xmm1,xmm1);
-    xmm1 = _fjsp_add_v2r8(xmm1,_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA));
-    t1   = _fjsp_add_v2r8(t1,_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB));
-    _fjsp_storel_v2r8(ptrA,xmm1);
-    _fjsp_storel_v2r8(ptrB,t1);
+
+    t1   = _fjsp_unpackhi_v2r8(xmm1, xmm1);
+    xmm1 = _fjsp_add_v2r8(xmm1, _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA));
+    t1   = _fjsp_add_v2r8(t1, _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB));
+    _fjsp_storel_v2r8(ptrA, xmm1);
+    _fjsp_storel_v2r8(ptrB, t1);
 }
 
 static void
 gmx_fjsp_increment_1real_v2r8(double * gmx_restrict ptrA, _fjsp_v2r8 xmm1)
 {
     _fjsp_v2r8 tmp;
-    
-    tmp = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA);
-    tmp = _fjsp_add_v2r8(tmp,xmm1);
-    _fjsp_storel_v2r8(ptrA,tmp);
+
+    tmp = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA);
+    tmp = _fjsp_add_v2r8(tmp, xmm1);
+    _fjsp_storel_v2r8(ptrA, tmp);
 }
 
 
 
 static gmx_inline void
 gmx_fjsp_load_2pair_swizzle_v2r8(const double * gmx_restrict p1,
-                             const double * gmx_restrict p2,
-                             _fjsp_v2r8 * gmx_restrict c6,
-                             _fjsp_v2r8 * gmx_restrict c12)
+                                 const double * gmx_restrict p2,
+                                 _fjsp_v2r8 * gmx_restrict   c6,
+                                 _fjsp_v2r8 * gmx_restrict   c12)
 {
-    _fjsp_v2r8 t1,t2,t3;
-    
+    _fjsp_v2r8 t1, t2, t3;
+
     /* The c6/c12 array should be aligned */
     t1   = _fjsp_load_v2r8(p1);
     t2   = _fjsp_load_v2r8(p2);
-    *c6  = _fjsp_unpacklo_v2r8(t1,t2);  
-    *c12 = _fjsp_unpackhi_v2r8(t1,t2);                    
+    *c6  = _fjsp_unpacklo_v2r8(t1, t2);
+    *c12 = _fjsp_unpackhi_v2r8(t1, t2);
 }
 
 static gmx_inline void
 gmx_fjsp_load_1pair_swizzle_v2r8(const double * gmx_restrict p1,
-                             _fjsp_v2r8 * gmx_restrict c6,
-                             _fjsp_v2r8 * gmx_restrict c12)
+                                 _fjsp_v2r8 * gmx_restrict   c6,
+                                 _fjsp_v2r8 * gmx_restrict   c12)
 {
-    *c6     = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1);
-    *c12    = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+1);
+    *c6     = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1);
+    *c12    = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+1);
 }
 
 
 static gmx_inline void
 gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(const double * gmx_restrict xyz_shift,
-                                         const double * gmx_restrict xyz,
-                                         _fjsp_v2r8 * gmx_restrict x1,
-                                         _fjsp_v2r8 * gmx_restrict y1,
-                                         _fjsp_v2r8 * gmx_restrict z1)
+                                             const double * gmx_restrict xyz,
+                                             _fjsp_v2r8 * gmx_restrict   x1,
+                                             _fjsp_v2r8 * gmx_restrict   y1,
+                                             _fjsp_v2r8 * gmx_restrict   z1)
 {
-    _fjsp_v2r8 mem_xy,mem_z,mem_sxy,mem_sz;
-    
+    _fjsp_v2r8 mem_xy, mem_z, mem_sxy, mem_sz;
+
     mem_xy  = _fjsp_load_v2r8(xyz);
-    mem_z   = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),xyz+2);
+    mem_z   = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), xyz+2);
     mem_sxy = _fjsp_load_v2r8(xyz_shift);
-    mem_sz  = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),xyz_shift+2);
-    
-    mem_xy  = _fjsp_add_v2r8(mem_xy,mem_sxy);
-    mem_z   = _fjsp_add_v2r8(mem_z,mem_sz);
-    
-    *x1  = _fjsp_shuffle_v2r8(mem_xy,mem_xy,GMX_FJSP_SHUFFLE2(0,0));
-    *y1  = _fjsp_shuffle_v2r8(mem_xy,mem_xy,GMX_FJSP_SHUFFLE2(1,1));
-    *z1  = _fjsp_shuffle_v2r8(mem_z,mem_z,GMX_FJSP_SHUFFLE2(0,0));
+    mem_sz  = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), xyz_shift+2);
+
+    mem_xy  = _fjsp_add_v2r8(mem_xy, mem_sxy);
+    mem_z   = _fjsp_add_v2r8(mem_z, mem_sz);
+
+    *x1  = _fjsp_shuffle_v2r8(mem_xy, mem_xy, GMX_FJSP_SHUFFLE2(0, 0));
+    *y1  = _fjsp_shuffle_v2r8(mem_xy, mem_xy, GMX_FJSP_SHUFFLE2(1, 1));
+    *z1  = _fjsp_shuffle_v2r8(mem_z, mem_z, GMX_FJSP_SHUFFLE2(0, 0));
 }
 
 
 static gmx_inline void
 gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(const double * gmx_restrict xyz_shift,
-                                         const double * gmx_restrict xyz,
-                                         _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
-                                         _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
-                                         _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3)
+                                             const double * gmx_restrict xyz,
+                                             _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
+                                             _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
+                                             _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3)
 {
-    _fjsp_v2r8 t1,t2,t3,t4,t5,sxy,sz,szx,syz;
-    
+    _fjsp_v2r8 t1, t2, t3, t4, t5, sxy, sz, szx, syz;
+
     t1  = _fjsp_load_v2r8(xyz);
     t2  = _fjsp_load_v2r8(xyz+2);
     t3  = _fjsp_load_v2r8(xyz+4);
     t4  = _fjsp_load_v2r8(xyz+6);
-    t5  = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),xyz+8);
-    
+    t5  = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), xyz+8);
+
     sxy = _fjsp_load_v2r8(xyz_shift);
-    sz  = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),xyz_shift+2);
-    szx = _fjsp_shuffle_v2r8(sz,sxy,GMX_FJSP_SHUFFLE2(0,0));
-    syz = _fjsp_shuffle_v2r8(sxy,sz,GMX_FJSP_SHUFFLE2(0,1));
-    
-    t1  = _fjsp_add_v2r8(t1,sxy);
-    t2  = _fjsp_add_v2r8(t2,szx);
-    t3  = _fjsp_add_v2r8(t3,syz);
-    t4  = _fjsp_add_v2r8(t4,sxy);
-    t5  = _fjsp_add_v2r8(t5,sz);
-    
-    *x1  = _fjsp_shuffle_v2r8(t1,t1,GMX_FJSP_SHUFFLE2(0,0));
-    *y1  = _fjsp_shuffle_v2r8(t1,t1,GMX_FJSP_SHUFFLE2(1,1));
-    *z1  = _fjsp_shuffle_v2r8(t2,t2,GMX_FJSP_SHUFFLE2(0,0));
-    *x2  = _fjsp_shuffle_v2r8(t2,t2,GMX_FJSP_SHUFFLE2(1,1));
-    *y2  = _fjsp_shuffle_v2r8(t3,t3,GMX_FJSP_SHUFFLE2(0,0));
-    *z2  = _fjsp_shuffle_v2r8(t3,t3,GMX_FJSP_SHUFFLE2(1,1));
-    *x3  = _fjsp_shuffle_v2r8(t4,t4,GMX_FJSP_SHUFFLE2(0,0));
-    *y3  = _fjsp_shuffle_v2r8(t4,t4,GMX_FJSP_SHUFFLE2(1,1));
-    *z3  = _fjsp_shuffle_v2r8(t5,t5,GMX_FJSP_SHUFFLE2(0,0));
+    sz  = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), xyz_shift+2);
+    szx = _fjsp_shuffle_v2r8(sz, sxy, GMX_FJSP_SHUFFLE2(0, 0));
+    syz = _fjsp_shuffle_v2r8(sxy, sz, GMX_FJSP_SHUFFLE2(0, 1));
+
+    t1  = _fjsp_add_v2r8(t1, sxy);
+    t2  = _fjsp_add_v2r8(t2, szx);
+    t3  = _fjsp_add_v2r8(t3, syz);
+    t4  = _fjsp_add_v2r8(t4, sxy);
+    t5  = _fjsp_add_v2r8(t5, sz);
+
+    *x1  = _fjsp_shuffle_v2r8(t1, t1, GMX_FJSP_SHUFFLE2(0, 0));
+    *y1  = _fjsp_shuffle_v2r8(t1, t1, GMX_FJSP_SHUFFLE2(1, 1));
+    *z1  = _fjsp_shuffle_v2r8(t2, t2, GMX_FJSP_SHUFFLE2(0, 0));
+    *x2  = _fjsp_shuffle_v2r8(t2, t2, GMX_FJSP_SHUFFLE2(1, 1));
+    *y2  = _fjsp_shuffle_v2r8(t3, t3, GMX_FJSP_SHUFFLE2(0, 0));
+    *z2  = _fjsp_shuffle_v2r8(t3, t3, GMX_FJSP_SHUFFLE2(1, 1));
+    *x3  = _fjsp_shuffle_v2r8(t4, t4, GMX_FJSP_SHUFFLE2(0, 0));
+    *y3  = _fjsp_shuffle_v2r8(t4, t4, GMX_FJSP_SHUFFLE2(1, 1));
+    *z3  = _fjsp_shuffle_v2r8(t5, t5, GMX_FJSP_SHUFFLE2(0, 0));
 }
 
 
 static gmx_inline void
 gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(const double * gmx_restrict xyz_shift,
-                                         const double * gmx_restrict xyz,
-                                         _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
-                                         _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
-                                         _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3,
-                                         _fjsp_v2r8 * gmx_restrict x4, _fjsp_v2r8 * gmx_restrict y4, _fjsp_v2r8 * gmx_restrict z4)
-{
-    _fjsp_v2r8 t1,t2,t3,t4,t5,t6,sxy,sz,szx,syz;
-    
+                                             const double * gmx_restrict xyz,
+                                             _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
+                                             _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
+                                             _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3,
+                                             _fjsp_v2r8 * gmx_restrict x4, _fjsp_v2r8 * gmx_restrict y4, _fjsp_v2r8 * gmx_restrict z4)
+{
+    _fjsp_v2r8 t1, t2, t3, t4, t5, t6, sxy, sz, szx, syz;
+
     t1  = _fjsp_load_v2r8(xyz);
     t2  = _fjsp_load_v2r8(xyz+2);
     t3  = _fjsp_load_v2r8(xyz+4);
     t4  = _fjsp_load_v2r8(xyz+6);
     t5  = _fjsp_load_v2r8(xyz+8);
     t6  = _fjsp_load_v2r8(xyz+10);
-    
+
     sxy = _fjsp_load_v2r8(xyz_shift);
-    sz  = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),xyz_shift+2);
-    szx = _fjsp_shuffle_v2r8(sz,sxy,GMX_FJSP_SHUFFLE2(0,0));
-    syz = _fjsp_shuffle_v2r8(sxy,sz,GMX_FJSP_SHUFFLE2(0,1));
-    
-    t1  = _fjsp_add_v2r8(t1,sxy);
-    t2  = _fjsp_add_v2r8(t2,szx);
-    t3  = _fjsp_add_v2r8(t3,syz);
-    t4  = _fjsp_add_v2r8(t4,sxy);
-    t5  = _fjsp_add_v2r8(t5,szx);
-    t6  = _fjsp_add_v2r8(t6,syz);
-    
-    *x1  = _fjsp_shuffle_v2r8(t1,t1,GMX_FJSP_SHUFFLE2(0,0));
-    *y1  = _fjsp_shuffle_v2r8(t1,t1,GMX_FJSP_SHUFFLE2(1,1));
-    *z1  = _fjsp_shuffle_v2r8(t2,t2,GMX_FJSP_SHUFFLE2(0,0));
-    *x2  = _fjsp_shuffle_v2r8(t2,t2,GMX_FJSP_SHUFFLE2(1,1));
-    *y2  = _fjsp_shuffle_v2r8(t3,t3,GMX_FJSP_SHUFFLE2(0,0));
-    *z2  = _fjsp_shuffle_v2r8(t3,t3,GMX_FJSP_SHUFFLE2(1,1));
-    *x3  = _fjsp_shuffle_v2r8(t4,t4,GMX_FJSP_SHUFFLE2(0,0));
-    *y3  = _fjsp_shuffle_v2r8(t4,t4,GMX_FJSP_SHUFFLE2(1,1));
-    *z3  = _fjsp_shuffle_v2r8(t5,t5,GMX_FJSP_SHUFFLE2(0,0));
-    *x4  = _fjsp_shuffle_v2r8(t5,t5,GMX_FJSP_SHUFFLE2(1,1));
-    *y4  = _fjsp_shuffle_v2r8(t6,t6,GMX_FJSP_SHUFFLE2(0,0));
-    *z4  = _fjsp_shuffle_v2r8(t6,t6,GMX_FJSP_SHUFFLE2(1,1));
+    sz  = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), xyz_shift+2);
+    szx = _fjsp_shuffle_v2r8(sz, sxy, GMX_FJSP_SHUFFLE2(0, 0));
+    syz = _fjsp_shuffle_v2r8(sxy, sz, GMX_FJSP_SHUFFLE2(0, 1));
+
+    t1  = _fjsp_add_v2r8(t1, sxy);
+    t2  = _fjsp_add_v2r8(t2, szx);
+    t3  = _fjsp_add_v2r8(t3, syz);
+    t4  = _fjsp_add_v2r8(t4, sxy);
+    t5  = _fjsp_add_v2r8(t5, szx);
+    t6  = _fjsp_add_v2r8(t6, syz);
+
+    *x1  = _fjsp_shuffle_v2r8(t1, t1, GMX_FJSP_SHUFFLE2(0, 0));
+    *y1  = _fjsp_shuffle_v2r8(t1, t1, GMX_FJSP_SHUFFLE2(1, 1));
+    *z1  = _fjsp_shuffle_v2r8(t2, t2, GMX_FJSP_SHUFFLE2(0, 0));
+    *x2  = _fjsp_shuffle_v2r8(t2, t2, GMX_FJSP_SHUFFLE2(1, 1));
+    *y2  = _fjsp_shuffle_v2r8(t3, t3, GMX_FJSP_SHUFFLE2(0, 0));
+    *z2  = _fjsp_shuffle_v2r8(t3, t3, GMX_FJSP_SHUFFLE2(1, 1));
+    *x3  = _fjsp_shuffle_v2r8(t4, t4, GMX_FJSP_SHUFFLE2(0, 0));
+    *y3  = _fjsp_shuffle_v2r8(t4, t4, GMX_FJSP_SHUFFLE2(1, 1));
+    *z3  = _fjsp_shuffle_v2r8(t5, t5, GMX_FJSP_SHUFFLE2(0, 0));
+    *x4  = _fjsp_shuffle_v2r8(t5, t5, GMX_FJSP_SHUFFLE2(1, 1));
+    *y4  = _fjsp_shuffle_v2r8(t6, t6, GMX_FJSP_SHUFFLE2(0, 0));
+    *z4  = _fjsp_shuffle_v2r8(t6, t6, GMX_FJSP_SHUFFLE2(1, 1));
 }
 
 
 
 static gmx_inline void
 gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(const double * gmx_restrict p1,
-                                  _fjsp_v2r8 * gmx_restrict x, _fjsp_v2r8 * gmx_restrict y, _fjsp_v2r8 * gmx_restrict z)
+                                      _fjsp_v2r8 * gmx_restrict x, _fjsp_v2r8 * gmx_restrict y, _fjsp_v2r8 * gmx_restrict z)
 {
-	 *x            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1);
-     *y            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+1);
-     *z            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+2);
+    *x            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1);
+    *y            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+1);
+    *z            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+2);
 }
 
 static gmx_inline void
 gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(const double * gmx_restrict p1,
-                                  _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
-                                  _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
-                                  _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3)
+                                      _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
+                                      _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
+                                      _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3)
 {
-	 *x1            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1);
-     *y1            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+1);
-     *z1            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+2);
-	 *x2            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+3);
-     *y2            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+4);
-     *z2            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+5);
-	 *x3            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+6);
-     *y3            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+7);
-     *z3            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+8);
+    *x1            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1);
+    *y1            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+1);
+    *z1            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+2);
+    *x2            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+3);
+    *y2            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+4);
+    *z2            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+5);
+    *x3            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+6);
+    *y3            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+7);
+    *z3            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+8);
 }
 
 static gmx_inline void
 gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(const double * gmx_restrict p1,
-                                  _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
-                                  _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
-                                  _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3,
-                                  _fjsp_v2r8 * gmx_restrict x4, _fjsp_v2r8 * gmx_restrict y4, _fjsp_v2r8 * gmx_restrict z4)
-{
-    *x1            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1);
-    *y1            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+1);
-    *z1            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+2);
-    *x2            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+3);
-    *y2            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+4);
-    *z2            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+5);
-    *x3            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+6);
-    *y3            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+7);
-    *z3            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+8);
-    *x4            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+9);
-    *y4            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+10);
-    *z4            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+11);
+                                      _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
+                                      _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
+                                      _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3,
+                                      _fjsp_v2r8 * gmx_restrict x4, _fjsp_v2r8 * gmx_restrict y4, _fjsp_v2r8 * gmx_restrict z4)
+{
+    *x1            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1);
+    *y1            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+1);
+    *z1            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+2);
+    *x2            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+3);
+    *y2            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+4);
+    *z2            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+5);
+    *x3            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+6);
+    *y3            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+7);
+    *z3            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+8);
+    *x4            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+9);
+    *y4            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+10);
+    *z4            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+11);
 }
 
 
 static gmx_inline void
 gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(const double * gmx_restrict ptrA,
-                                  const double * gmx_restrict ptrB,
-                                  _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1)
+                                      const double * gmx_restrict ptrB,
+                                      _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1)
 {
-    _fjsp_v2r8 t1,t2,t3,t4;
+    _fjsp_v2r8 t1, t2, t3, t4;
     t1           = _fjsp_load_v2r8(ptrA);
     t2           = _fjsp_load_v2r8(ptrB);
-    t3           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+2);
-    t4           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB+2);
-    GMX_FJSP_TRANSPOSE2_V2R8(t1,t2);
+    t3           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+2);
+    t4           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB+2);
+    GMX_FJSP_TRANSPOSE2_V2R8(t1, t2);
     *x1          = t1;
     *y1          = t2;
-    *z1          = _fjsp_unpacklo_v2r8(t3,t4);
+    *z1          = _fjsp_unpacklo_v2r8(t3, t4);
 }
 
 static gmx_inline void
 gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
-                                  _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
-                                  _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
-                                  _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3)
+                                      _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
+                                      _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
+                                      _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3)
 {
-_fjsp_v2r8 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
+    _fjsp_v2r8 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
     t1           = _fjsp_load_v2r8(ptrA);
     t2           = _fjsp_load_v2r8(ptrB);
     t3           = _fjsp_load_v2r8(ptrA+2);
@@ -413,12 +413,12 @@ _fjsp_v2r8 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
     t6           = _fjsp_load_v2r8(ptrB+4);
     t7           = _fjsp_load_v2r8(ptrA+6);
     t8           = _fjsp_load_v2r8(ptrB+6);
-    t9           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+8);
-    t10          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB+8);
-    GMX_FJSP_TRANSPOSE2_V2R8(t1,t2);
-    GMX_FJSP_TRANSPOSE2_V2R8(t3,t4);
-    GMX_FJSP_TRANSPOSE2_V2R8(t5,t6);
-    GMX_FJSP_TRANSPOSE2_V2R8(t7,t8);
+    t9           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+8);
+    t10          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB+8);
+    GMX_FJSP_TRANSPOSE2_V2R8(t1, t2);
+    GMX_FJSP_TRANSPOSE2_V2R8(t3, t4);
+    GMX_FJSP_TRANSPOSE2_V2R8(t5, t6);
+    GMX_FJSP_TRANSPOSE2_V2R8(t7, t8);
     *x1          = t1;
     *y1          = t2;
     *z1          = t3;
@@ -427,27 +427,27 @@ _fjsp_v2r8 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
     *z2          = t6;
     *x3          = t7;
     *y3          = t8;
-    *z3          = _fjsp_unpacklo_v2r8(t9,t10);
+    *z3          = _fjsp_unpacklo_v2r8(t9, t10);
 }
 
 
 static gmx_inline void
 gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
-                                  _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
-                                  _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
-                                  _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3,
-                                  _fjsp_v2r8 * gmx_restrict x4, _fjsp_v2r8 * gmx_restrict y4, _fjsp_v2r8 * gmx_restrict z4)
+                                      _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
+                                      _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
+                                      _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3,
+                                      _fjsp_v2r8 * gmx_restrict x4, _fjsp_v2r8 * gmx_restrict y4, _fjsp_v2r8 * gmx_restrict z4)
 {
-    _fjsp_v2r8 t1,t2,t3,t4,t5,t6;
+    _fjsp_v2r8 t1, t2, t3, t4, t5, t6;
     t1           = _fjsp_load_v2r8(ptrA);
     t2           = _fjsp_load_v2r8(ptrB);
     t3           = _fjsp_load_v2r8(ptrA+2);
     t4           = _fjsp_load_v2r8(ptrB+2);
     t5           = _fjsp_load_v2r8(ptrA+4);
     t6           = _fjsp_load_v2r8(ptrB+4);
-    GMX_FJSP_TRANSPOSE2_V2R8(t1,t2);
-    GMX_FJSP_TRANSPOSE2_V2R8(t3,t4);
-    GMX_FJSP_TRANSPOSE2_V2R8(t5,t6);
+    GMX_FJSP_TRANSPOSE2_V2R8(t1, t2);
+    GMX_FJSP_TRANSPOSE2_V2R8(t3, t4);
+    GMX_FJSP_TRANSPOSE2_V2R8(t5, t6);
     *x1          = t1;
     *y1          = t2;
     *z1          = t3;
@@ -460,9 +460,9 @@ gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(const double * gmx_restrict ptrA, const do
     t4           = _fjsp_load_v2r8(ptrB+8);
     t5           = _fjsp_load_v2r8(ptrA+10);
     t6           = _fjsp_load_v2r8(ptrB+10);
-    GMX_FJSP_TRANSPOSE2_V2R8(t1,t2);
-    GMX_FJSP_TRANSPOSE2_V2R8(t3,t4);
-    GMX_FJSP_TRANSPOSE2_V2R8(t5,t6);
+    GMX_FJSP_TRANSPOSE2_V2R8(t1, t2);
+    GMX_FJSP_TRANSPOSE2_V2R8(t3, t4);
+    GMX_FJSP_TRANSPOSE2_V2R8(t5, t6);
     *x3          = t1;
     *y3          = t2;
     *z3          = t3;
@@ -474,250 +474,250 @@ gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(const double * gmx_restrict ptrA, const do
 
 static void
 gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA,
-                                       _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1)
-{
-    _fjsp_v2r8 t1,t2,t3;
-    
-    t1           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA);
-    t2           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+1);
-    t3           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+2);
-    
-    t1           = _fjsp_sub_v2r8(t1,x1);
-    t2           = _fjsp_sub_v2r8(t2,y1);
-    t3           = _fjsp_sub_v2r8(t3,z1);
-    _fjsp_storel_v2r8(ptrA,t1);
-    _fjsp_storel_v2r8(ptrA+1,t2);
-    _fjsp_storel_v2r8(ptrA+2,t3);
+                                           _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1)
+{
+    _fjsp_v2r8 t1, t2, t3;
+
+    t1           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA);
+    t2           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+1);
+    t3           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+2);
+
+    t1           = _fjsp_sub_v2r8(t1, x1);
+    t2           = _fjsp_sub_v2r8(t2, y1);
+    t3           = _fjsp_sub_v2r8(t3, z1);
+    _fjsp_storel_v2r8(ptrA, t1);
+    _fjsp_storel_v2r8(ptrA+1, t2);
+    _fjsp_storel_v2r8(ptrA+2, t3);
 }
 
 static void
 gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA, _fjsp_v2r8 fscal,
-					   _fjsp_v2r8 dx1, _fjsp_v2r8 dy1, _fjsp_v2r8 dz1)
+                                               _fjsp_v2r8 dx1, _fjsp_v2r8 dy1, _fjsp_v2r8 dz1)
 {
-  _fjsp_v2r8 t1,t2,t3;
+    _fjsp_v2r8 t1, t2, t3;
 
-  t1           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA);
-  t2           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+1);
-  t3           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+2);
+    t1           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA);
+    t2           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+1);
+    t3           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+2);
 
-  t1           = _fjsp_nmsub_v2r8(fscal,dx1,t1);
-  t2           = _fjsp_nmsub_v2r8(fscal,dy1,t2);
-  t3           = _fjsp_nmsub_v2r8(fscal,dz1,t3);
-  _fjsp_storel_v2r8(ptrA,t1);
-  _fjsp_storel_v2r8(ptrA+1,t2);
-  _fjsp_storel_v2r8(ptrA+2,t3);
+    t1           = _fjsp_nmsub_v2r8(fscal, dx1, t1);
+    t2           = _fjsp_nmsub_v2r8(fscal, dy1, t2);
+    t3           = _fjsp_nmsub_v2r8(fscal, dz1, t3);
+    _fjsp_storel_v2r8(ptrA, t1);
+    _fjsp_storel_v2r8(ptrA+1, t2);
+    _fjsp_storel_v2r8(ptrA+2, t3);
 }
 
 
 static void
 gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA,
-                                       _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
-                                       _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
-                                       _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3) 
+                                           _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
+                                           _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
+                                           _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3)
 {
-    _fjsp_v2r8 t1,t2,t3,t4,t5;
-    
+    _fjsp_v2r8 t1, t2, t3, t4, t5;
+
     t1          = _fjsp_load_v2r8(ptrA);
     t2          = _fjsp_load_v2r8(ptrA+2);
     t3          = _fjsp_load_v2r8(ptrA+4);
     t4          = _fjsp_load_v2r8(ptrA+6);
-    t5          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+8);
-    
-    x1          = _fjsp_unpacklo_v2r8(x1,y1);
-    z1          = _fjsp_unpacklo_v2r8(z1,x2);
-    y2          = _fjsp_unpacklo_v2r8(y2,z2);
-    x3          = _fjsp_unpacklo_v2r8(x3,y3);
+    t5          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+8);
+
+    x1          = _fjsp_unpacklo_v2r8(x1, y1);
+    z1          = _fjsp_unpacklo_v2r8(z1, x2);
+    y2          = _fjsp_unpacklo_v2r8(y2, z2);
+    x3          = _fjsp_unpacklo_v2r8(x3, y3);
     /* nothing to be done for z3 */
-    
-    t1          = _fjsp_sub_v2r8(t1,x1);
-    t2          = _fjsp_sub_v2r8(t2,z1);
-    t3          = _fjsp_sub_v2r8(t3,y2);
-    t4          = _fjsp_sub_v2r8(t4,x3);
-    t5          = _fjsp_sub_v2r8(t5,z3);
-    _fjsp_storel_v2r8(ptrA,t1);
-    _fjsp_storeh_v2r8(ptrA+1,t1);
-    _fjsp_storel_v2r8(ptrA+2,t2);
-    _fjsp_storeh_v2r8(ptrA+3,t2);
-    _fjsp_storel_v2r8(ptrA+4,t3);
-    _fjsp_storeh_v2r8(ptrA+5,t3);
-    _fjsp_storel_v2r8(ptrA+6,t4);
-    _fjsp_storeh_v2r8(ptrA+7,t4);
-    _fjsp_storel_v2r8(ptrA+8,t5);
+
+    t1          = _fjsp_sub_v2r8(t1, x1);
+    t2          = _fjsp_sub_v2r8(t2, z1);
+    t3          = _fjsp_sub_v2r8(t3, y2);
+    t4          = _fjsp_sub_v2r8(t4, x3);
+    t5          = _fjsp_sub_v2r8(t5, z3);
+    _fjsp_storel_v2r8(ptrA, t1);
+    _fjsp_storeh_v2r8(ptrA+1, t1);
+    _fjsp_storel_v2r8(ptrA+2, t2);
+    _fjsp_storeh_v2r8(ptrA+3, t2);
+    _fjsp_storel_v2r8(ptrA+4, t3);
+    _fjsp_storeh_v2r8(ptrA+5, t3);
+    _fjsp_storel_v2r8(ptrA+6, t4);
+    _fjsp_storeh_v2r8(ptrA+7, t4);
+    _fjsp_storel_v2r8(ptrA+8, t5);
 }
 
 
 static void
 gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA,
-                                       _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
-                                       _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
-                                       _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3,
-                                       _fjsp_v2r8 x4, _fjsp_v2r8 y4, _fjsp_v2r8 z4) 
+                                           _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
+                                           _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
+                                           _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3,
+                                           _fjsp_v2r8 x4, _fjsp_v2r8 y4, _fjsp_v2r8 z4)
 {
-    _fjsp_v2r8 t1,t2,t3,t4,t5,t6;
-    
+    _fjsp_v2r8 t1, t2, t3, t4, t5, t6;
+
     t1          = _fjsp_load_v2r8(ptrA);
     t2          = _fjsp_load_v2r8(ptrA+2);
     t3          = _fjsp_load_v2r8(ptrA+4);
     t4          = _fjsp_load_v2r8(ptrA+6);
     t5          = _fjsp_load_v2r8(ptrA+8);
     t6          = _fjsp_load_v2r8(ptrA+10);
-    
-    x1          = _fjsp_unpacklo_v2r8(x1,y1);
-    z1          = _fjsp_unpacklo_v2r8(z1,x2);
-    y2          = _fjsp_unpacklo_v2r8(y2,z2);
-    x3          = _fjsp_unpacklo_v2r8(x3,y3);
-    z3          = _fjsp_unpacklo_v2r8(z3,x4);
-    y4          = _fjsp_unpacklo_v2r8(y4,z4);
-    
-    _fjsp_storel_v2r8(ptrA,    _fjsp_sub_v2r8( t1,x1 ));
-    _fjsp_storeh_v2r8(ptrA+1,  _fjsp_sub_v2r8( t1,x1 ));
-    _fjsp_storel_v2r8(ptrA+2,  _fjsp_sub_v2r8( t2,z1 ));
-    _fjsp_storeh_v2r8(ptrA+3,  _fjsp_sub_v2r8( t2,z1 ));
-    _fjsp_storel_v2r8(ptrA+4,  _fjsp_sub_v2r8( t3,y2 ));
-    _fjsp_storeh_v2r8(ptrA+5,  _fjsp_sub_v2r8( t3,y2 ));
-    _fjsp_storel_v2r8(ptrA+6,  _fjsp_sub_v2r8( t4,x3 ));
-    _fjsp_storeh_v2r8(ptrA+7,  _fjsp_sub_v2r8( t4,x3 ));
-    _fjsp_storel_v2r8(ptrA+8,  _fjsp_sub_v2r8( t5,z3 ));
-    _fjsp_storeh_v2r8(ptrA+9,  _fjsp_sub_v2r8( t5,z3 ));
-    _fjsp_storel_v2r8(ptrA+10, _fjsp_sub_v2r8( t6,y4 ));
-    _fjsp_storeh_v2r8(ptrA+11, _fjsp_sub_v2r8( t6,y4 ));
+
+    x1          = _fjsp_unpacklo_v2r8(x1, y1);
+    z1          = _fjsp_unpacklo_v2r8(z1, x2);
+    y2          = _fjsp_unpacklo_v2r8(y2, z2);
+    x3          = _fjsp_unpacklo_v2r8(x3, y3);
+    z3          = _fjsp_unpacklo_v2r8(z3, x4);
+    y4          = _fjsp_unpacklo_v2r8(y4, z4);
+
+    _fjsp_storel_v2r8(ptrA,    _fjsp_sub_v2r8( t1, x1 ));
+    _fjsp_storeh_v2r8(ptrA+1,  _fjsp_sub_v2r8( t1, x1 ));
+    _fjsp_storel_v2r8(ptrA+2,  _fjsp_sub_v2r8( t2, z1 ));
+    _fjsp_storeh_v2r8(ptrA+3,  _fjsp_sub_v2r8( t2, z1 ));
+    _fjsp_storel_v2r8(ptrA+4,  _fjsp_sub_v2r8( t3, y2 ));
+    _fjsp_storeh_v2r8(ptrA+5,  _fjsp_sub_v2r8( t3, y2 ));
+    _fjsp_storel_v2r8(ptrA+6,  _fjsp_sub_v2r8( t4, x3 ));
+    _fjsp_storeh_v2r8(ptrA+7,  _fjsp_sub_v2r8( t4, x3 ));
+    _fjsp_storel_v2r8(ptrA+8,  _fjsp_sub_v2r8( t5, z3 ));
+    _fjsp_storeh_v2r8(ptrA+9,  _fjsp_sub_v2r8( t5, z3 ));
+    _fjsp_storel_v2r8(ptrA+10, _fjsp_sub_v2r8( t6, y4 ));
+    _fjsp_storeh_v2r8(ptrA+11, _fjsp_sub_v2r8( t6, y4 ));
 }
 
 static void
 gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
-					   _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1)
-{
-  _fjsp_v2r8 t1,t2,t3,t4,t5,t6,t7;
-    
-  t1          = _fjsp_load_v2r8(ptrA);
-  t2          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+2);
-  t3          = _fjsp_load_v2r8(ptrB);
-  t4          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB+2);
-    
-  t5          = _fjsp_unpacklo_v2r8(x1,y1);
-  t6          = _fjsp_unpackhi_v2r8(x1,y1);
-  t7          = _fjsp_unpackhi_v2r8(z1,z1);
-    
-  t1          = _fjsp_sub_v2r8(t1,t5);
-  t2          = _fjsp_sub_v2r8(t2,z1);
-    
-  t3          = _fjsp_sub_v2r8(t3,t6);
-  t4          = _fjsp_sub_v2r8(t4,t7);
-    
-  _fjsp_storel_v2r8(ptrA,t1);
-  _fjsp_storeh_v2r8(ptrA+1,t1);
-  _fjsp_storel_v2r8(ptrA+2,t2);
-  _fjsp_storel_v2r8(ptrB,t3);
-  _fjsp_storeh_v2r8(ptrB+1,t3);
-  _fjsp_storel_v2r8(ptrB+2,t4);
+                                           _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1)
+{
+    _fjsp_v2r8 t1, t2, t3, t4, t5, t6, t7;
+
+    t1          = _fjsp_load_v2r8(ptrA);
+    t2          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+2);
+    t3          = _fjsp_load_v2r8(ptrB);
+    t4          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB+2);
+
+    t5          = _fjsp_unpacklo_v2r8(x1, y1);
+    t6          = _fjsp_unpackhi_v2r8(x1, y1);
+    t7          = _fjsp_unpackhi_v2r8(z1, z1);
+
+    t1          = _fjsp_sub_v2r8(t1, t5);
+    t2          = _fjsp_sub_v2r8(t2, z1);
+
+    t3          = _fjsp_sub_v2r8(t3, t6);
+    t4          = _fjsp_sub_v2r8(t4, t7);
+
+    _fjsp_storel_v2r8(ptrA, t1);
+    _fjsp_storeh_v2r8(ptrA+1, t1);
+    _fjsp_storel_v2r8(ptrA+2, t2);
+    _fjsp_storel_v2r8(ptrB, t3);
+    _fjsp_storeh_v2r8(ptrB+1, t3);
+    _fjsp_storel_v2r8(ptrB+2, t4);
 }
 
 
 static void
 gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
-					       _fjsp_v2r8 fscal, _fjsp_v2r8 dx1, _fjsp_v2r8 dy1, _fjsp_v2r8 dz1)
+                                               _fjsp_v2r8 fscal, _fjsp_v2r8 dx1, _fjsp_v2r8 dy1, _fjsp_v2r8 dz1)
 {
-  _fjsp_v2r8 t1,t2,t3,t4,t5,t6,t7,fscalA,fscalB;
-    
+    _fjsp_v2r8 t1, t2, t3, t4, t5, t6, t7, fscalA, fscalB;
+
     t1          = _fjsp_load_v2r8(ptrA);
-    t2          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+2);
+    t2          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+2);
     t3          = _fjsp_load_v2r8(ptrB);
-    t4          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB+2);
-    fscalA      = _fjsp_unpacklo_v2r8(fscal,fscal);
-    fscalB      = _fjsp_unpackhi_v2r8(fscal,fscal);
-    
-    t5          = _fjsp_unpacklo_v2r8(dx1,dy1);
-    t6          = _fjsp_unpackhi_v2r8(dx1,dy1);
-    t7          = _fjsp_unpackhi_v2r8(dz1,dz1);
-    
-    t1          = _fjsp_nmsub_v2r8(fscalA,t5,t1);
-    t2          = _fjsp_nmsub_v2r8(fscalA,dz1,t2);
-    
-    t3          = _fjsp_nmsub_v2r8(fscalB,t6,t3);
-    t4          = _fjsp_nmsub_v2r8(fscalB,t7,t4);
-    
-    _fjsp_storel_v2r8(ptrA,t1);
-    _fjsp_storeh_v2r8(ptrA+1,t1);
-    _fjsp_storel_v2r8(ptrA+2,t2);
-    _fjsp_storel_v2r8(ptrB,t3);
-    _fjsp_storeh_v2r8(ptrB+1,t3);
-    _fjsp_storel_v2r8(ptrB+2,t4);
+    t4          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB+2);
+    fscalA      = _fjsp_unpacklo_v2r8(fscal, fscal);
+    fscalB      = _fjsp_unpackhi_v2r8(fscal, fscal);
+
+    t5          = _fjsp_unpacklo_v2r8(dx1, dy1);
+    t6          = _fjsp_unpackhi_v2r8(dx1, dy1);
+    t7          = _fjsp_unpackhi_v2r8(dz1, dz1);
+
+    t1          = _fjsp_nmsub_v2r8(fscalA, t5, t1);
+    t2          = _fjsp_nmsub_v2r8(fscalA, dz1, t2);
+
+    t3          = _fjsp_nmsub_v2r8(fscalB, t6, t3);
+    t4          = _fjsp_nmsub_v2r8(fscalB, t7, t4);
+
+    _fjsp_storel_v2r8(ptrA, t1);
+    _fjsp_storeh_v2r8(ptrA+1, t1);
+    _fjsp_storel_v2r8(ptrA+2, t2);
+    _fjsp_storel_v2r8(ptrB, t3);
+    _fjsp_storeh_v2r8(ptrB+1, t3);
+    _fjsp_storel_v2r8(ptrB+2, t4);
 }
 
 
 static void
 gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
-                                       _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
-                                       _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
-                                       _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3) 
+                                           _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
+                                           _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
+                                           _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3)
 {
-    _fjsp_v2r8 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
-    _fjsp_v2r8 tA,tB,tC,tD,tE,tF,tG,tH,tI;
-    
+    _fjsp_v2r8 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
+    _fjsp_v2r8 tA, tB, tC, tD, tE, tF, tG, tH, tI;
+
     t1          = _fjsp_load_v2r8(ptrA);
     t2          = _fjsp_load_v2r8(ptrA+2);
     t3          = _fjsp_load_v2r8(ptrA+4);
     t4          = _fjsp_load_v2r8(ptrA+6);
-    t5          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+8);
+    t5          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+8);
     t6          = _fjsp_load_v2r8(ptrB);
     t7          = _fjsp_load_v2r8(ptrB+2);
     t8          = _fjsp_load_v2r8(ptrB+4);
     t9          = _fjsp_load_v2r8(ptrB+6);
-    t10         = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB+8);
-    
-    tA          = _fjsp_unpacklo_v2r8(x1,y1);
-    tB          = _fjsp_unpackhi_v2r8(x1,y1);
-    tC          = _fjsp_unpacklo_v2r8(z1,x2);
-    tD          = _fjsp_unpackhi_v2r8(z1,x2);
-    tE          = _fjsp_unpacklo_v2r8(y2,z2);
-    tF          = _fjsp_unpackhi_v2r8(y2,z2);
-    tG          = _fjsp_unpacklo_v2r8(x3,y3);
-    tH          = _fjsp_unpackhi_v2r8(x3,y3);
-    tI          = _fjsp_unpackhi_v2r8(z3,z3);
-    
-    t1          = _fjsp_sub_v2r8(t1,tA);
-    t2          = _fjsp_sub_v2r8(t2,tC);
-    t3          = _fjsp_sub_v2r8(t3,tE);
-    t4          = _fjsp_sub_v2r8(t4,tG);
-    t5          = _fjsp_sub_v2r8(t5,z3);
-    
-    t6          = _fjsp_sub_v2r8(t6,tB);
-    t7          = _fjsp_sub_v2r8(t7,tD);
-    t8          = _fjsp_sub_v2r8(t8,tF);
-    t9          = _fjsp_sub_v2r8(t9,tH);
-    t10         = _fjsp_sub_v2r8(t10,tI);
-    
-    _fjsp_storel_v2r8(ptrA,t1);
-    _fjsp_storeh_v2r8(ptrA+1,t1);
-    _fjsp_storel_v2r8(ptrA+2,t2);
-    _fjsp_storeh_v2r8(ptrA+3,t2);
-    _fjsp_storel_v2r8(ptrA+4,t3);
-    _fjsp_storeh_v2r8(ptrA+5,t3);
-    _fjsp_storel_v2r8(ptrA+6,t4);
-    _fjsp_storeh_v2r8(ptrA+7,t4);
-    _fjsp_storel_v2r8(ptrA+8,t5);
-    _fjsp_storel_v2r8(ptrB,t6);
-    _fjsp_storeh_v2r8(ptrB+1,t6);
-    _fjsp_storel_v2r8(ptrB+2,t7);
-    _fjsp_storeh_v2r8(ptrB+3,t7);
-    _fjsp_storel_v2r8(ptrB+4,t8);
-    _fjsp_storeh_v2r8(ptrB+5,t8);
-    _fjsp_storel_v2r8(ptrB+6,t9);
-    _fjsp_storeh_v2r8(ptrB+7,t9);
-    _fjsp_storel_v2r8(ptrB+8,t10);
+    t10         = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB+8);
+
+    tA          = _fjsp_unpacklo_v2r8(x1, y1);
+    tB          = _fjsp_unpackhi_v2r8(x1, y1);
+    tC          = _fjsp_unpacklo_v2r8(z1, x2);
+    tD          = _fjsp_unpackhi_v2r8(z1, x2);
+    tE          = _fjsp_unpacklo_v2r8(y2, z2);
+    tF          = _fjsp_unpackhi_v2r8(y2, z2);
+    tG          = _fjsp_unpacklo_v2r8(x3, y3);
+    tH          = _fjsp_unpackhi_v2r8(x3, y3);
+    tI          = _fjsp_unpackhi_v2r8(z3, z3);
+
+    t1          = _fjsp_sub_v2r8(t1, tA);
+    t2          = _fjsp_sub_v2r8(t2, tC);
+    t3          = _fjsp_sub_v2r8(t3, tE);
+    t4          = _fjsp_sub_v2r8(t4, tG);
+    t5          = _fjsp_sub_v2r8(t5, z3);
+
+    t6          = _fjsp_sub_v2r8(t6, tB);
+    t7          = _fjsp_sub_v2r8(t7, tD);
+    t8          = _fjsp_sub_v2r8(t8, tF);
+    t9          = _fjsp_sub_v2r8(t9, tH);
+    t10         = _fjsp_sub_v2r8(t10, tI);
+
+    _fjsp_storel_v2r8(ptrA, t1);
+    _fjsp_storeh_v2r8(ptrA+1, t1);
+    _fjsp_storel_v2r8(ptrA+2, t2);
+    _fjsp_storeh_v2r8(ptrA+3, t2);
+    _fjsp_storel_v2r8(ptrA+4, t3);
+    _fjsp_storeh_v2r8(ptrA+5, t3);
+    _fjsp_storel_v2r8(ptrA+6, t4);
+    _fjsp_storeh_v2r8(ptrA+7, t4);
+    _fjsp_storel_v2r8(ptrA+8, t5);
+    _fjsp_storel_v2r8(ptrB, t6);
+    _fjsp_storeh_v2r8(ptrB+1, t6);
+    _fjsp_storel_v2r8(ptrB+2, t7);
+    _fjsp_storeh_v2r8(ptrB+3, t7);
+    _fjsp_storel_v2r8(ptrB+4, t8);
+    _fjsp_storeh_v2r8(ptrB+5, t8);
+    _fjsp_storel_v2r8(ptrB+6, t9);
+    _fjsp_storeh_v2r8(ptrB+7, t9);
+    _fjsp_storel_v2r8(ptrB+8, t10);
 }
 
 
 static void
 gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
-                                       _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
-                                       _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
-                                       _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3,
-                                       _fjsp_v2r8 x4, _fjsp_v2r8 y4, _fjsp_v2r8 z4) 
-{
-    _fjsp_v2r8 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
-    _fjsp_v2r8 tA,tB,tC,tD,tE,tF,tG,tH,tI,tJ,tK,tL;
-    
+                                           _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
+                                           _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
+                                           _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3,
+                                           _fjsp_v2r8 x4, _fjsp_v2r8 y4, _fjsp_v2r8 z4)
+{
+    _fjsp_v2r8 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;
+    _fjsp_v2r8 tA, tB, tC, tD, tE, tF, tG, tH, tI, tJ, tK, tL;
+
     t1          = _fjsp_load_v2r8(ptrA);
     t2          = _fjsp_load_v2r8(ptrA+2);
     t3          = _fjsp_load_v2r8(ptrA+4);
@@ -730,112 +730,112 @@ gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double *
     t10         = _fjsp_load_v2r8(ptrB+6);
     t11         = _fjsp_load_v2r8(ptrB+8);
     t12         = _fjsp_load_v2r8(ptrB+10);
-    
-    tA          = _fjsp_unpacklo_v2r8(x1,y1);
-    tB          = _fjsp_unpackhi_v2r8(x1,y1);
-    tC          = _fjsp_unpacklo_v2r8(z1,x2);
-    tD          = _fjsp_unpackhi_v2r8(z1,x2);
-    tE          = _fjsp_unpacklo_v2r8(y2,z2);
-    tF          = _fjsp_unpackhi_v2r8(y2,z2);
-    tG          = _fjsp_unpacklo_v2r8(x3,y3);
-    tH          = _fjsp_unpackhi_v2r8(x3,y3);
-    tI          = _fjsp_unpacklo_v2r8(z3,x4);
-    tJ          = _fjsp_unpackhi_v2r8(z3,x4);
-    tK          = _fjsp_unpacklo_v2r8(y4,z4);
-    tL          = _fjsp_unpackhi_v2r8(y4,z4);
-    
-    t1          = _fjsp_sub_v2r8(t1,tA);
-    t2          = _fjsp_sub_v2r8(t2,tC);
-    t3          = _fjsp_sub_v2r8(t3,tE);
-    t4          = _fjsp_sub_v2r8(t4,tG);
-    t5          = _fjsp_sub_v2r8(t5,tI);
-    t6          = _fjsp_sub_v2r8(t6,tK);
-    
-    t7          = _fjsp_sub_v2r8(t7,tB);
-    t8          = _fjsp_sub_v2r8(t8,tD);
-    t9          = _fjsp_sub_v2r8(t9,tF);
-    t10         = _fjsp_sub_v2r8(t10,tH);
-    t11         = _fjsp_sub_v2r8(t11,tJ);
-    t12         = _fjsp_sub_v2r8(t12,tL);
-    
+
+    tA          = _fjsp_unpacklo_v2r8(x1, y1);
+    tB          = _fjsp_unpackhi_v2r8(x1, y1);
+    tC          = _fjsp_unpacklo_v2r8(z1, x2);
+    tD          = _fjsp_unpackhi_v2r8(z1, x2);
+    tE          = _fjsp_unpacklo_v2r8(y2, z2);
+    tF          = _fjsp_unpackhi_v2r8(y2, z2);
+    tG          = _fjsp_unpacklo_v2r8(x3, y3);
+    tH          = _fjsp_unpackhi_v2r8(x3, y3);
+    tI          = _fjsp_unpacklo_v2r8(z3, x4);
+    tJ          = _fjsp_unpackhi_v2r8(z3, x4);
+    tK          = _fjsp_unpacklo_v2r8(y4, z4);
+    tL          = _fjsp_unpackhi_v2r8(y4, z4);
+
+    t1          = _fjsp_sub_v2r8(t1, tA);
+    t2          = _fjsp_sub_v2r8(t2, tC);
+    t3          = _fjsp_sub_v2r8(t3, tE);
+    t4          = _fjsp_sub_v2r8(t4, tG);
+    t5          = _fjsp_sub_v2r8(t5, tI);
+    t6          = _fjsp_sub_v2r8(t6, tK);
+
+    t7          = _fjsp_sub_v2r8(t7, tB);
+    t8          = _fjsp_sub_v2r8(t8, tD);
+    t9          = _fjsp_sub_v2r8(t9, tF);
+    t10         = _fjsp_sub_v2r8(t10, tH);
+    t11         = _fjsp_sub_v2r8(t11, tJ);
+    t12         = _fjsp_sub_v2r8(t12, tL);
+
     _fjsp_storel_v2r8(ptrA,  t1);
-    _fjsp_storeh_v2r8(ptrA+1,t1);
-    _fjsp_storel_v2r8(ptrA+2,t2);
-    _fjsp_storeh_v2r8(ptrA+3,t2);
-    _fjsp_storel_v2r8(ptrA+4,t3);
-    _fjsp_storeh_v2r8(ptrA+5,t3);
-    _fjsp_storel_v2r8(ptrA+6,t4);
-    _fjsp_storeh_v2r8(ptrA+7,t4);
-    _fjsp_storel_v2r8(ptrA+8,t5);
-    _fjsp_storeh_v2r8(ptrA+9,t5);
-    _fjsp_storel_v2r8(ptrA+10,t6);
-    _fjsp_storeh_v2r8(ptrA+11,t6);
+    _fjsp_storeh_v2r8(ptrA+1, t1);
+    _fjsp_storel_v2r8(ptrA+2, t2);
+    _fjsp_storeh_v2r8(ptrA+3, t2);
+    _fjsp_storel_v2r8(ptrA+4, t3);
+    _fjsp_storeh_v2r8(ptrA+5, t3);
+    _fjsp_storel_v2r8(ptrA+6, t4);
+    _fjsp_storeh_v2r8(ptrA+7, t4);
+    _fjsp_storel_v2r8(ptrA+8, t5);
+    _fjsp_storeh_v2r8(ptrA+9, t5);
+    _fjsp_storel_v2r8(ptrA+10, t6);
+    _fjsp_storeh_v2r8(ptrA+11, t6);
     _fjsp_storel_v2r8(ptrB,  t7);
-    _fjsp_storeh_v2r8(ptrB+1,t7);
-    _fjsp_storel_v2r8(ptrB+2,t8);
-    _fjsp_storeh_v2r8(ptrB+3,t8);
-    _fjsp_storel_v2r8(ptrB+4,t9);
-    _fjsp_storeh_v2r8(ptrB+5,t9);
-    _fjsp_storel_v2r8(ptrB+6,t10);
-    _fjsp_storeh_v2r8(ptrB+7,t10);
-    _fjsp_storel_v2r8(ptrB+8,t11);
-    _fjsp_storeh_v2r8(ptrB+9,t11);
-    _fjsp_storel_v2r8(ptrB+10,t12);
-    _fjsp_storeh_v2r8(ptrB+11,t12);
+    _fjsp_storeh_v2r8(ptrB+1, t7);
+    _fjsp_storel_v2r8(ptrB+2, t8);
+    _fjsp_storeh_v2r8(ptrB+3, t8);
+    _fjsp_storel_v2r8(ptrB+4, t9);
+    _fjsp_storeh_v2r8(ptrB+5, t9);
+    _fjsp_storel_v2r8(ptrB+6, t10);
+    _fjsp_storeh_v2r8(ptrB+7, t10);
+    _fjsp_storel_v2r8(ptrB+8, t11);
+    _fjsp_storeh_v2r8(ptrB+9, t11);
+    _fjsp_storel_v2r8(ptrB+10, t12);
+    _fjsp_storeh_v2r8(ptrB+11, t12);
 }
 
 
 
 static gmx_inline void
 gmx_fjsp_update_iforce_1atom_swizzle_v2r8(_fjsp_v2r8 fix1, _fjsp_v2r8 fiy1, _fjsp_v2r8 fiz1,
-                                      double * gmx_restrict fptr,
-                                      double * gmx_restrict fshiftptr)
+                                          double * gmx_restrict fptr,
+                                          double * gmx_restrict fshiftptr)
 {
-    __m128d t1,t2,t3,t4;
-    
+    __m128d t1, t2, t3, t4;
+
     /* transpose data */
-    t1 = fix1;
-    fix1 = _fjsp_unpacklo_v2r8(fix1,fiy1); /* y0 x0 */
-    fiy1 = _fjsp_unpackhi_v2r8(t1,fiy1);   /* y1 x1 */
-    
-    fix1 = _fjsp_add_v2r8(fix1,fiy1);
-    fiz1 = _fjsp_add_v2r8( fiz1, _fjsp_unpackhi_v2r8(fiz1,fiz1 ));
-    
+    t1   = fix1;
+    fix1 = _fjsp_unpacklo_v2r8(fix1, fiy1); /* y0 x0 */
+    fiy1 = _fjsp_unpackhi_v2r8(t1, fiy1);   /* y1 x1 */
+
+    fix1 = _fjsp_add_v2r8(fix1, fiy1);
+    fiz1 = _fjsp_add_v2r8( fiz1, _fjsp_unpackhi_v2r8(fiz1, fiz1 ));
+
     t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr), fix1 );
     _fjsp_storel_v2r8( fptr, t4 );
     _fjsp_storeh_v2r8( fptr+1, t4 );
-    _fjsp_storel_v2r8( fptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),fptr+2), fiz1 ));
-    
+    _fjsp_storel_v2r8( fptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), fptr+2), fiz1 ));
+
     t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fshiftptr), fix1 );
     _fjsp_storel_v2r8( fshiftptr, t4 );
     _fjsp_storeh_v2r8( fshiftptr+1, t4 );
-    _fjsp_storel_v2r8( fshiftptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),fshiftptr+2), fiz1 ));
+    _fjsp_storel_v2r8( fshiftptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), fshiftptr+2), fiz1 ));
 }
 
 static gmx_inline void
 gmx_fjsp_update_iforce_3atom_swizzle_v2r8(_fjsp_v2r8 fix1, _fjsp_v2r8 fiy1, _fjsp_v2r8 fiz1,
-                                      _fjsp_v2r8 fix2, _fjsp_v2r8 fiy2, _fjsp_v2r8 fiz2,
-                                      _fjsp_v2r8 fix3, _fjsp_v2r8 fiy3, _fjsp_v2r8 fiz3,
-                                      double * gmx_restrict fptr,
-                                      double * gmx_restrict fshiftptr)
+                                          _fjsp_v2r8 fix2, _fjsp_v2r8 fiy2, _fjsp_v2r8 fiz2,
+                                          _fjsp_v2r8 fix3, _fjsp_v2r8 fiy3, _fjsp_v2r8 fiz3,
+                                          double * gmx_restrict fptr,
+                                          double * gmx_restrict fshiftptr)
 {
-    __m128d t1,t2,t3,t4,t5,t6;
-    
+    __m128d t1, t2, t3, t4, t5, t6;
+
     /* transpose data */
-    GMX_FJSP_TRANSPOSE2_V2R8(fix1,fiy1);
-    GMX_FJSP_TRANSPOSE2_V2R8(fiz1,fix2);
-    GMX_FJSP_TRANSPOSE2_V2R8(fiy2,fiz2);
-    t1 = fix3;
-    fix3 = _fjsp_unpacklo_v2r8(fix3,fiy3); /* y0 x0 */
-    fiy3 = _fjsp_unpackhi_v2r8(t1,fiy3);   /* y1 x1 */
-    
-    fix1 = _fjsp_add_v2r8(fix1,fiy1);
-    fiz1 = _fjsp_add_v2r8(fiz1,fix2);
-    fiy2 = _fjsp_add_v2r8(fiy2,fiz2);
-    
-    fix3 = _fjsp_add_v2r8(fix3,fiy3);
-    fiz3 = _fjsp_add_v2r8( fiz3, _fjsp_unpackhi_v2r8(fiz3,fiz3));
-    
+    GMX_FJSP_TRANSPOSE2_V2R8(fix1, fiy1);
+    GMX_FJSP_TRANSPOSE2_V2R8(fiz1, fix2);
+    GMX_FJSP_TRANSPOSE2_V2R8(fiy2, fiz2);
+    t1   = fix3;
+    fix3 = _fjsp_unpacklo_v2r8(fix3, fiy3); /* y0 x0 */
+    fiy3 = _fjsp_unpackhi_v2r8(t1, fiy3);   /* y1 x1 */
+
+    fix1 = _fjsp_add_v2r8(fix1, fiy1);
+    fiz1 = _fjsp_add_v2r8(fiz1, fix2);
+    fiy2 = _fjsp_add_v2r8(fiy2, fiz2);
+
+    fix3 = _fjsp_add_v2r8(fix3, fiy3);
+    fiz3 = _fjsp_add_v2r8( fiz3, _fjsp_unpackhi_v2r8(fiz3, fiz3));
+
     t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr), fix1 );
     t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+2), fiz1 );
     t5 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+4), fiy2 );
@@ -849,48 +849,48 @@ gmx_fjsp_update_iforce_3atom_swizzle_v2r8(_fjsp_v2r8 fix1, _fjsp_v2r8 fiy1, _fjs
     _fjsp_storeh_v2r8( fptr+5, t5 );
     _fjsp_storel_v2r8( fptr+6, t6 );
     _fjsp_storeh_v2r8( fptr+7, t6 );
-    _fjsp_storel_v2r8( fptr+8, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),fptr+8), fiz3 ));
-    
-    fix1 = _fjsp_add_v2r8(fix1,fix3);
-    t1   = _fjsp_shuffle_v2r8(fiz1,fiy2,GMX_FJSP_SHUFFLE2(0,1));
-    fix1 = _fjsp_add_v2r8(fix1,t1); /* x and y sums */
-    
-    t2   = _fjsp_shuffle_v2r8(fiy2,fiy2,GMX_FJSP_SHUFFLE2(1,1));
-    fiz1 = _fjsp_add_v2r8(fiz1,fiz3);
-    fiz1 = _fjsp_add_v2r8(fiz1,t2); /* z sum */
-    
+    _fjsp_storel_v2r8( fptr+8, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), fptr+8), fiz3 ));
+
+    fix1 = _fjsp_add_v2r8(fix1, fix3);
+    t1   = _fjsp_shuffle_v2r8(fiz1, fiy2, GMX_FJSP_SHUFFLE2(0, 1));
+    fix1 = _fjsp_add_v2r8(fix1, t1); /* x and y sums */
+
+    t2   = _fjsp_shuffle_v2r8(fiy2, fiy2, GMX_FJSP_SHUFFLE2(1, 1));
+    fiz1 = _fjsp_add_v2r8(fiz1, fiz3);
+    fiz1 = _fjsp_add_v2r8(fiz1, t2); /* z sum */
+
     t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fshiftptr), fix1 );
     _fjsp_storel_v2r8( fshiftptr, t3 );
     _fjsp_storeh_v2r8( fshiftptr+1, t3 );
-    _fjsp_storel_v2r8( fshiftptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),fshiftptr+2), fiz1 ));
+    _fjsp_storel_v2r8( fshiftptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), fshiftptr+2), fiz1 ));
 }
 
 
 static gmx_inline void
 gmx_fjsp_update_iforce_4atom_swizzle_v2r8(_fjsp_v2r8 fix1, _fjsp_v2r8 fiy1, _fjsp_v2r8 fiz1,
-                                      _fjsp_v2r8 fix2, _fjsp_v2r8 fiy2, _fjsp_v2r8 fiz2,
-                                      _fjsp_v2r8 fix3, _fjsp_v2r8 fiy3, _fjsp_v2r8 fiz3,
-                                      _fjsp_v2r8 fix4, _fjsp_v2r8 fiy4, _fjsp_v2r8 fiz4,
-                                      double * gmx_restrict fptr,
-                                      double * gmx_restrict fshiftptr)
-{
-    __m128d t1,t2,t3,t4,t5,t6,t7,t8;
-    
+                                          _fjsp_v2r8 fix2, _fjsp_v2r8 fiy2, _fjsp_v2r8 fiz2,
+                                          _fjsp_v2r8 fix3, _fjsp_v2r8 fiy3, _fjsp_v2r8 fiz3,
+                                          _fjsp_v2r8 fix4, _fjsp_v2r8 fiy4, _fjsp_v2r8 fiz4,
+                                          double * gmx_restrict fptr,
+                                          double * gmx_restrict fshiftptr)
+{
+    __m128d t1, t2, t3, t4, t5, t6, t7, t8;
+
     /* transpose data */
-    GMX_FJSP_TRANSPOSE2_V2R8(fix1,fiy1);
-    GMX_FJSP_TRANSPOSE2_V2R8(fiz1,fix2);
-    GMX_FJSP_TRANSPOSE2_V2R8(fiy2,fiz2);
-    GMX_FJSP_TRANSPOSE2_V2R8(fix3,fiy3);
-    GMX_FJSP_TRANSPOSE2_V2R8(fiz3,fix4);
-    GMX_FJSP_TRANSPOSE2_V2R8(fiy4,fiz4);
-    
-    fix1 = _fjsp_add_v2r8(fix1,fiy1);
-    fiz1 = _fjsp_add_v2r8(fiz1,fix2);
-    fiy2 = _fjsp_add_v2r8(fiy2,fiz2);
-    fix3 = _fjsp_add_v2r8(fix3,fiy3);
-    fiz3 = _fjsp_add_v2r8(fiz3,fix4);
-    fiy4 = _fjsp_add_v2r8(fiy4,fiz4);
-    
+    GMX_FJSP_TRANSPOSE2_V2R8(fix1, fiy1);
+    GMX_FJSP_TRANSPOSE2_V2R8(fiz1, fix2);
+    GMX_FJSP_TRANSPOSE2_V2R8(fiy2, fiz2);
+    GMX_FJSP_TRANSPOSE2_V2R8(fix3, fiy3);
+    GMX_FJSP_TRANSPOSE2_V2R8(fiz3, fix4);
+    GMX_FJSP_TRANSPOSE2_V2R8(fiy4, fiz4);
+
+    fix1 = _fjsp_add_v2r8(fix1, fiy1);
+    fiz1 = _fjsp_add_v2r8(fiz1, fix2);
+    fiy2 = _fjsp_add_v2r8(fiy2, fiz2);
+    fix3 = _fjsp_add_v2r8(fix3, fiy3);
+    fiz3 = _fjsp_add_v2r8(fiz3, fix4);
+    fiy4 = _fjsp_add_v2r8(fiy4, fiz4);
+
     t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr),    fix1 );
     t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+2),  fiz1 );
     t5 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+4),  fiy2 );
@@ -910,20 +910,20 @@ gmx_fjsp_update_iforce_4atom_swizzle_v2r8(_fjsp_v2r8 fix1, _fjsp_v2r8 fiy1, _fjs
     _fjsp_storel_v2r8( fptr+10, t8 );
     _fjsp_storeh_v2r8( fptr+11, t8 );
 
-    t1 = _fjsp_shuffle_v2r8(fiz1,fiy2,GMX_FJSP_SHUFFLE2(0,1));
-    fix1 = _fjsp_add_v2r8(fix1,t1);
-    t2 = _fjsp_shuffle_v2r8(fiz3,fiy4,GMX_FJSP_SHUFFLE2(0,1));
-    fix3 = _fjsp_add_v2r8(fix3,t2);
-    fix1 = _fjsp_add_v2r8(fix1,fix3); /* x and y sums */
-    
-    fiz1 = _fjsp_add_v2r8(fiz1, _fjsp_unpackhi_v2r8(fiy2,fiy2));
-    fiz3 = _fjsp_add_v2r8(fiz3, _fjsp_unpackhi_v2r8(fiy4,fiy4));
-    fiz1 = _fjsp_add_v2r8(fiz1,fiz3); /* z sum */
-    
+    t1   = _fjsp_shuffle_v2r8(fiz1, fiy2, GMX_FJSP_SHUFFLE2(0, 1));
+    fix1 = _fjsp_add_v2r8(fix1, t1);
+    t2   = _fjsp_shuffle_v2r8(fiz3, fiy4, GMX_FJSP_SHUFFLE2(0, 1));
+    fix3 = _fjsp_add_v2r8(fix3, t2);
+    fix1 = _fjsp_add_v2r8(fix1, fix3); /* x and y sums */
+
+    fiz1 = _fjsp_add_v2r8(fiz1, _fjsp_unpackhi_v2r8(fiy2, fiy2));
+    fiz3 = _fjsp_add_v2r8(fiz3, _fjsp_unpackhi_v2r8(fiy4, fiy4));
+    fiz1 = _fjsp_add_v2r8(fiz1, fiz3); /* z sum */
+
     t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fshiftptr), fix1 );
     _fjsp_storel_v2r8( fshiftptr, t3 );
     _fjsp_storeh_v2r8( fshiftptr+1, t3 );
-    _fjsp_storel_v2r8( fshiftptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),fshiftptr+2), fiz1 ));
+    _fjsp_storel_v2r8( fshiftptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), fshiftptr+2), fiz1 ));
 }
 
 
@@ -931,20 +931,20 @@ gmx_fjsp_update_iforce_4atom_swizzle_v2r8(_fjsp_v2r8 fix1, _fjsp_v2r8 fiy1, _fjs
 static gmx_inline void
 gmx_fjsp_update_1pot_v2r8(_fjsp_v2r8 pot1, double * gmx_restrict ptrA)
 {
-    pot1 = _fjsp_add_v2r8(pot1, _fjsp_unpackhi_v2r8(pot1,pot1));
-    _fjsp_storel_v2r8(ptrA,_fjsp_add_v2r8(pot1,_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA)));
+    pot1 = _fjsp_add_v2r8(pot1, _fjsp_unpackhi_v2r8(pot1, pot1));
+    _fjsp_storel_v2r8(ptrA, _fjsp_add_v2r8(pot1, _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA)));
 }
 
 static gmx_inline void
 gmx_fjsp_update_2pot_v2r8(_fjsp_v2r8 pot1, double * gmx_restrict ptrA,
-                      _fjsp_v2r8 pot2, double * gmx_restrict ptrB)
-{
-    GMX_FJSP_TRANSPOSE2_V2R8(pot1,pot2);
-    pot1 = _fjsp_add_v2r8(pot1,pot2);
-    pot2 = _fjsp_unpackhi_v2r8(pot1,pot1);
-    
-    _fjsp_storel_v2r8(ptrA,_fjsp_add_v2r8(pot1,_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA)));
-    _fjsp_storel_v2r8(ptrB,_fjsp_add_v2r8(pot2,_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB)));
+                          _fjsp_v2r8 pot2, double * gmx_restrict ptrB)
+{
+    GMX_FJSP_TRANSPOSE2_V2R8(pot1, pot2);
+    pot1 = _fjsp_add_v2r8(pot1, pot2);
+    pot2 = _fjsp_unpackhi_v2r8(pot1, pot1);
+
+    _fjsp_storel_v2r8(ptrA, _fjsp_add_v2r8(pot1, _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA)));
+    _fjsp_storel_v2r8(ptrB, _fjsp_add_v2r8(pot2, _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB)));
 }
 
 
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/make_nb_kernel_sparc64_hpc_ace_double.py b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/make_nb_kernel_sparc64_hpc_ace_double.py
index e8dbdebe76..72deaf013d 100755
--- a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/make_nb_kernel_sparc64_hpc_ace_double.py
+++ b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/make_nb_kernel_sparc64_hpc_ace_double.py
@@ -495,13 +495,13 @@ pass
 # Write out the list of settings and corresponding kernels to the declaration file
 fpdecl.write( '\n\n' )
 fpdecl.write( 'nb_kernel_info_t\n' )
-fpdecl.write( 'kernellist_'+Arch+'[] =\n' )
+fpdecl.write( '    kernellist_'+Arch+'[] =\n' )
 fpdecl.write( '{\n' )
 for decl in kerneldecl[0:-1]:
     fpdecl.write( decl + ',\n' )
 fpdecl.write( kerneldecl[-1] + '\n' )
 fpdecl.write( '};\n\n' )
 fpdecl.write( 'int\n' )
-fpdecl.write( 'kernellist_'+Arch+'_size = sizeof(kernellist_'+Arch+')/sizeof(kernellist_'+Arch+'[0]);\n\n')
+fpdecl.write( '    kernellist_'+Arch+'_size = sizeof(kernellist_'+Arch+')/sizeof(kernellist_'+Arch+'[0]);\n\n')
 fpdecl.write( '#endif\n')
 fpdecl.close()
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.c
index 4471c61789..d603feb1b8 100644
--- a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.c
+++ b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.c
@@ -257,7 +257,7 @@ nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double;
 
 
 nb_kernel_info_t
-kernellist_sparc64_hpc_ace_double[] =
+    kernellist_sparc64_hpc_ace_double[] =
 {
     { nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
@@ -476,6 +476,6 @@ kernellist_sparc64_hpc_ace_double[] =
 };
 
 int
-kernellist_sparc64_hpc_ace_double_size = sizeof(kernellist_sparc64_hpc_ace_double)/sizeof(kernellist_sparc64_hpc_ace_double[0]);
+    kernellist_sparc64_hpc_ace_double_size = sizeof(kernellist_sparc64_hpc_ace_double)/sizeof(kernellist_sparc64_hpc_ace_double[0]);
 
 #endif
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.h b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.h
index 3260a35c6b..67ace2f0e7 100644
--- a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.h
+++ b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.h
@@ -40,10 +40,10 @@
 
 /* List of kernels for this architecture with metadata about them */
 extern nb_kernel_info_t
-kernellist_sparc64_hpc_ace_double[];
+    kernellist_sparc64_hpc_ace_double[];
 
 /* Length of kernellist_c */
 extern int
-kernellist_sparc64_hpc_ace_double_size;
+    kernellist_sparc64_hpc_ace_double_size;
 
 #endif
diff --git a/src/gromacs/gmxpreprocess/gen_vsite.c b/src/gromacs/gmxpreprocess/gen_vsite.c
index 106f23877e..f3ad7fa4e4 100644
--- a/src/gromacs/gmxpreprocess/gen_vsite.c
+++ b/src/gromacs/gmxpreprocess/gen_vsite.c
@@ -1076,16 +1076,16 @@ static int gen_vsites_trp(gpp_atomtype_t atype, rvec *newx[],
     for (j = 0; j < NMASS; j++)
     {
         sprintf(name, "MW%d", j+1);
-        (*newatomname)  [atM[j]]        = put_symtab(symtab, name);
-        (*newatom)      [atM[j]].m      = (*newatom)[atM[j]].mB    = mM[j];
-        (*newatom)      [atM[j]].q      = (*newatom)[atM[j]].qB    = 0.0;
-        (*newatom)      [atM[j]].type   = (*newatom)[atM[j]].typeB = tpM;
-        (*newatom)      [atM[j]].ptype  = eptAtom;
-        (*newatom)      [atM[j]].resind = at->atom[i0].resind;
+        (*newatomname)  [atM[j]]         = put_symtab(symtab, name);
+        (*newatom)      [atM[j]].m       = (*newatom)[atM[j]].mB    = mM[j];
+        (*newatom)      [atM[j]].q       = (*newatom)[atM[j]].qB    = 0.0;
+        (*newatom)      [atM[j]].type    = (*newatom)[atM[j]].typeB = tpM;
+        (*newatom)      [atM[j]].ptype   = eptAtom;
+        (*newatom)      [atM[j]].resind  = at->atom[i0].resind;
         (*newatom)      [atM[j]].elem[0] = 'M';
         (*newatom)      [atM[j]].elem[1] = '\0';
-        (*newvsite_type)[atM[j]]        = NOTSET;
-        (*newcgnr)      [atM[j]]        = (*cgnr)[i0];
+        (*newvsite_type)[atM[j]]         = NOTSET;
+        (*newcgnr)      [atM[j]]         = (*cgnr)[i0];
     }
     /* renumber cgnr: */
     for (i = i0; i < at->nr; i++)
@@ -1260,16 +1260,16 @@ static int gen_vsites_tyr(gpp_atomtype_t atype, rvec *newx[],
     rvec_add(r1, x[ats[atHH]], (*newx)[atM]);
 
     strcpy(name, "MW1");
-    (*newatomname)  [atM]        = put_symtab(symtab, name);
-    (*newatom)      [atM].m      = (*newatom)[atM].mB    = mM;
-    (*newatom)      [atM].q      = (*newatom)[atM].qB    = 0.0;
-    (*newatom)      [atM].type   = (*newatom)[atM].typeB = tpM;
-    (*newatom)      [atM].ptype  = eptAtom;
-    (*newatom)      [atM].resind = at->atom[i0].resind;
+    (*newatomname)  [atM]         = put_symtab(symtab, name);
+    (*newatom)      [atM].m       = (*newatom)[atM].mB    = mM;
+    (*newatom)      [atM].q       = (*newatom)[atM].qB    = 0.0;
+    (*newatom)      [atM].type    = (*newatom)[atM].typeB = tpM;
+    (*newatom)      [atM].ptype   = eptAtom;
+    (*newatom)      [atM].resind  = at->atom[i0].resind;
     (*newatom)      [atM].elem[0] = 'M';
     (*newatom)      [atM].elem[1] = '\0';
-    (*newvsite_type)[atM]        = NOTSET;
-    (*newcgnr)      [atM]        = (*cgnr)[i0];
+    (*newvsite_type)[atM]         = NOTSET;
+    (*newcgnr)      [atM]         = (*cgnr)[i0];
     /* renumber cgnr: */
     for (i = i0; i < at->nr; i++)
     {
@@ -1988,18 +1988,18 @@ void do_vsites(int nrtp, t_restp rtp[], gpp_atomtype_t atype,
                         {
                             name[k+1] = (*at->atomname[Heavy])[k];
                         }
-                        name[k+1]             = atomnamesuffix[j];
-                        name[k+2]             = '\0';
-                        newatomname[ni0+j]    = put_symtab(symtab, name);
-                        newatom[ni0+j].m      = newatom[ni0+j].mB    = mtot/NMASS;
-                        newatom[ni0+j].q      = newatom[ni0+j].qB    = 0.0;
-                        newatom[ni0+j].type   = newatom[ni0+j].typeB = tpM;
-                        newatom[ni0+j].ptype  = eptAtom;
-                        newatom[ni0+j].resind = at->atom[i0].resind;
+                        name[k+1]              = atomnamesuffix[j];
+                        name[k+2]              = '\0';
+                        newatomname[ni0+j]     = put_symtab(symtab, name);
+                        newatom[ni0+j].m       = newatom[ni0+j].mB    = mtot/NMASS;
+                        newatom[ni0+j].q       = newatom[ni0+j].qB    = 0.0;
+                        newatom[ni0+j].type    = newatom[ni0+j].typeB = tpM;
+                        newatom[ni0+j].ptype   = eptAtom;
+                        newatom[ni0+j].resind  = at->atom[i0].resind;
                         newatom[ni0+j].elem[0] = 'M';
                         newatom[ni0+j].elem[1] = '\0';
-                        newvsite_type[ni0+j]  = NOTSET;
-                        newcgnr[ni0+j]        = (*cgnr)[i0];
+                        newvsite_type[ni0+j]   = NOTSET;
+                        newcgnr[ni0+j]         = (*cgnr)[i0];
                     }
                     /* add constraints between dummy masses and to heavies[0] */
                     /* 'add_shift' says which atoms won't be renumbered afterwards */
diff --git a/src/gromacs/gmxpreprocess/readpull.c b/src/gromacs/gmxpreprocess/readpull.c
index b4ba145312..ce9fb64b2f 100644
--- a/src/gromacs/gmxpreprocess/readpull.c
+++ b/src/gromacs/gmxpreprocess/readpull.c
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013,2014 by the GROMACS development team, led by
+ * Copyright (c) 2013,2014, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -72,7 +72,7 @@ static void string2dvec(const char buf[], dvec nums)
 }
 
 static void init_pull_group(t_pull_group *pg,
-                            const char *wbuf)
+                            const char   *wbuf)
 {
     double d;
     int    n, m;
@@ -98,7 +98,7 @@ static void init_pull_coord(t_pull_coord *pcrd, int eGeom,
     string2dvec(origin_buf, origin);
     if (pcrd->group[0] != 0 && dnorm(origin) > 0)
     {
-        gmx_fatal(FARGS,"The pull origin can only be set with an absolute reference");
+        gmx_fatal(FARGS, "The pull origin can only be set with an absolute reference");
     }
 
     if (eGeom == epullgDIST)
@@ -235,7 +235,7 @@ char **read_pullparams(int *ninp_p, t_inpfile **inp_p,
     return grpbuf;
 }
 
-void make_pull_groups(t_pull *pull, 
+void make_pull_groups(t_pull *pull,
                       char **pgnames,
                       const t_blocka *grps, char **gnames)
 {
@@ -413,14 +413,14 @@ void set_pull_init(t_inputrec *ir, gmx_mtop_t *mtop, rvec *x, matrix box, real l
     {
         pcrd  = &pull->coord[c];
 
-        pgrp0 = &pull->group[pcrd->group[0]]; 
-        pgrp1 = &pull->group[pcrd->group[1]]; 
+        pgrp0 = &pull->group[pcrd->group[0]];
+        pgrp1 = &pull->group[pcrd->group[1]];
         fprintf(stderr, "%8d  %8d  %8d\n",
                 pcrd->group[0], pgrp0->nat, pgrp0->pbcatom+1);
         fprintf(stderr, "%8d  %8d  %8d ",
                 pcrd->group[1], pgrp1->nat, pgrp1->pbcatom+1);
 
-        init = pcrd->init;
+        init       = pcrd->init;
         pcrd->init = 0;
 
         if (pcrd->rate == 0)
diff --git a/src/gromacs/gmxpreprocess/toppush.c b/src/gromacs/gmxpreprocess/toppush.c
index 88a16f6d26..6f2badc2e5 100644
--- a/src/gromacs/gmxpreprocess/toppush.c
+++ b/src/gromacs/gmxpreprocess/toppush.c
@@ -2514,14 +2514,14 @@ int add_atomtype_decoupled(t_symtab *symtab, gpp_atomtype_t at,
 static void convert_pairs_to_pairsQ(t_params *plist,
                                     real fudgeQQ, t_atoms *atoms)
 {
-    t_param *paramp1,*paramp2,*paramnew;
-    int      i,j,p1nr,p2nr,p2newnr;
+    t_param *paramp1, *paramp2, *paramnew;
+    int      i, j, p1nr, p2nr, p2newnr;
 
     /* Add the pair list to the pairQ list */
-    p1nr = plist[F_LJ14].nr;
-    p2nr = plist[F_LJC14_Q].nr;
+    p1nr    = plist[F_LJ14].nr;
+    p2nr    = plist[F_LJC14_Q].nr;
     p2newnr = p1nr + p2nr;
-    snew(paramnew,p2newnr);
+    snew(paramnew, p2newnr);
 
     paramp1             = plist[F_LJ14].param;
     paramp2             = plist[F_LJC14_Q].param;
@@ -2530,18 +2530,18 @@ static void convert_pairs_to_pairsQ(t_params *plist,
        it may be possible to just ADD the converted F_LJ14 array
        to the old F_LJC14_Q array, but since we have to create
        a new sized memory structure, better just to deep copy it all.
-    */
+     */
 
     for (i = 0; i < p2nr; i++)
     {
         /* Copy over parameters */
-        for (j=0;j<5;j++) /* entries are 0-4 for F_LJC14_Q */
+        for (j = 0; j < 5; j++) /* entries are 0-4 for F_LJC14_Q */
         {
             paramnew[i].c[j] = paramp2[i].c[j];
         }
 
         /* copy over atoms */
-        for (j=0;j<2;j++)
+        for (j = 0; j < 2; j++)
         {
             paramnew[i].a[j] = paramp2[i].a[j];
         }
diff --git a/src/gromacs/legacyheaders/thread_mpi/atomic/gcc_x86.h b/src/gromacs/legacyheaders/thread_mpi/atomic/gcc_x86.h
index 403634cd5b..122dff5327 100644
--- a/src/gromacs/legacyheaders/thread_mpi/atomic/gcc_x86.h
+++ b/src/gromacs/legacyheaders/thread_mpi/atomic/gcc_x86.h
@@ -113,7 +113,7 @@ typedef struct tMPI_Spinlock
 #define tMPI_Atomic_memory_barrier() __asm__ __volatile__("sfence;" : : : "memory")
 #else
 /* MIC is in-order and does not need nor support sfense */
-#define tMPI_Atomic_memory_barrier() __asm__ __volatile__("":::"memory")
+#define tMPI_Atomic_memory_barrier() __asm__ __volatile__("" ::: "memory")
 #endif
 
 #define TMPI_ATOMIC_HAVE_NATIVE_FETCH_ADD
diff --git a/src/gromacs/legacyheaders/thread_mpi/atomic/xlc_ppc.h b/src/gromacs/legacyheaders/thread_mpi/atomic/xlc_ppc.h
index 3b4f985db3..19f7bb6a49 100644
--- a/src/gromacs/legacyheaders/thread_mpi/atomic/xlc_ppc.h
+++ b/src/gromacs/legacyheaders/thread_mpi/atomic/xlc_ppc.h
@@ -141,7 +141,7 @@ static inline int tMPI_Atomic_cas(tMPI_Atomic_t *a, int oldval, int newval)
 static inline int tMPI_Atomic_ptr_cas(tMPI_Atomic_ptr_t *a, void* oldval,
                                       void* newval)
 {
-    int ret;
+    int                     ret;
     volatile char* volatile oldv = (char*)oldval;
     volatile char* volatile newv = (char*)newval;
 
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
index 5a3a8fd755..d8be3de2e2 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
@@ -42,7 +42,7 @@
 
 #include <cuda.h>
 
-#include "types/simple.h" 
+#include "types/simple.h"
 #include "types/nbnxn_pairlist.h"
 #include "types/nb_verlet.h"
 #include "types/ishift.h"
@@ -153,7 +153,7 @@ static const int nPruneKernelTypes  = 2; /* 0 - no prune, 1 - prune */
  *  order of corresponding enumerated types defined in nbnxn_cuda_types.h.
  */
 static const nbnxn_cu_kfunc_ptr_t
-nb_default_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
+    nb_default_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
 {
     { { k_nbnxn_cutoff,                     k_nbnxn_cutoff_prune },
       { k_nbnxn_cutoff_ener,                k_nbnxn_cutoff_ener_prune } },
@@ -202,7 +202,7 @@ static inline int calc_shmem_required()
     return shmem;
 }
 
-/*! As we execute nonbonded workload in separate streams, before launching 
+/*! As we execute nonbonded workload in separate streams, before launching
    the kernel we need to make sure that he following operations have completed:
    - atomdata allocation and related H2D transfers (every nstlist step);
    - pair list H2D transfer (every nstlist step);
@@ -217,28 +217,28 @@ static inline int calc_shmem_required()
    However, for the sake of having a future-proof implementation, we use the
    misc_ops_done event to record the point in time when the above  operations
    are finished and synchronize with this event in the non-local stream.
-*/
-void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
+ */
+void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t        cu_nb,
                               const nbnxn_atomdata_t *nbatom,
-                              int flags,
-                              int iloc)
+                              int                     flags,
+                              int                     iloc)
 {
-    cudaError_t stat;
-    int adat_begin, adat_len;  /* local/nonlocal offset and length used for xq and f */
+    cudaError_t          stat;
+    int                  adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
     /* CUDA kernel launch-related stuff */
-    int  shmem, nblock;
-    dim3 dim_block, dim_grid;
+    int                  shmem, nblock;
+    dim3                 dim_block, dim_grid;
     nbnxn_cu_kfunc_ptr_t nb_kernel = NULL; /* fn pointer to the nonbonded kernel */
 
-    cu_atomdata_t   *adat   = cu_nb->atdat;
-    cu_nbparam_t    *nbp    = cu_nb->nbparam;
-    cu_plist_t      *plist  = cu_nb->plist[iloc];
-    cu_timers_t     *t      = cu_nb->timers;
-    cudaStream_t    stream  = cu_nb->stream[iloc];
+    cu_atomdata_t       *adat    = cu_nb->atdat;
+    cu_nbparam_t        *nbp     = cu_nb->nbparam;
+    cu_plist_t          *plist   = cu_nb->plist[iloc];
+    cu_timers_t         *t       = cu_nb->timers;
+    cudaStream_t         stream  = cu_nb->stream[iloc];
 
-    bool bCalcEner   = flags & GMX_FORCE_VIRIAL;
-    bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
-    bool bDoTime     = cu_nb->bDoTime;
+    bool                 bCalcEner   = flags & GMX_FORCE_VIRIAL;
+    bool                 bCalcFshift = flags & GMX_FORCE_VIRIAL;
+    bool                 bDoTime     = cu_nb->bDoTime;
 
     /* turn energy calculation always on/off (for debugging/testing only) */
     bCalcEner = (bCalcEner || always_ener) && !never_ener;
@@ -286,7 +286,7 @@ void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
 
     /* HtoD x, q */
     cu_copy_H2D_async(adat->xq + adat_begin, nbatom->x + adat_begin * 4,
-                      adat_len * sizeof(*adat->xq), stream); 
+                      adat_len * sizeof(*adat->xq), stream);
 
     if (bDoTime)
     {
@@ -320,7 +320,7 @@ void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
                 NCL_PER_SUPERCL, plist->na_c);
     }
 
-    nb_kernel<<<dim_grid, dim_block, shmem, stream>>>(*adat, *nbp, *plist, bCalcFshift);
+    nb_kernel<<< dim_grid, dim_block, shmem, stream>>> (*adat, *nbp, *plist, bCalcFshift);
     CU_LAUNCH_ERR("k_calc_nb");
 
     if (bDoTime)
@@ -330,14 +330,14 @@ void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
     }
 }
 
-void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
+void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t        cu_nb,
                                const nbnxn_atomdata_t *nbatom,
-                               int flags,
-                               int aloc)
+                               int                     flags,
+                               int                     aloc)
 {
     cudaError_t stat;
-    int adat_begin, adat_len, adat_end;  /* local/nonlocal offset and length used for xq and f */
-    int iloc = -1;
+    int         adat_begin, adat_len, adat_end; /* local/nonlocal offset and length used for xq and f */
+    int         iloc = -1;
 
     /* determine interaction locality from atom locality */
     if (LOCAL_A(aloc))
@@ -356,13 +356,13 @@ void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
         gmx_incons(stmp);
     }
 
-    cu_atomdata_t   *adat   = cu_nb->atdat;
-    cu_timers_t     *t      = cu_nb->timers;
-    bool            bDoTime = cu_nb->bDoTime;
-    cudaStream_t    stream  = cu_nb->stream[iloc];
+    cu_atomdata_t   *adat    = cu_nb->atdat;
+    cu_timers_t     *t       = cu_nb->timers;
+    bool             bDoTime = cu_nb->bDoTime;
+    cudaStream_t     stream  = cu_nb->stream[iloc];
 
-    bool bCalcEner   = flags & GMX_FORCE_VIRIAL;
-    bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
+    bool             bCalcEner   = flags & GMX_FORCE_VIRIAL;
+    bool             bCalcFshift = flags & GMX_FORCE_VIRIAL;
 
     /* don't launch copy-back if there was no work to do */
     if (cu_nb->plist[iloc]->nsci == 0)
@@ -419,7 +419,7 @@ void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
         *(unsigned int*)&nbatom->out[0].f[adat_end*3 - 1] = poll_wait_pattern;
     }
 
-    /* With DD the local D2H transfer can only start after the non-local 
+    /* With DD the local D2H transfer can only start after the non-local
        has been launched. */
     if (iloc == eintLocal && cu_nb->bUseTwoStreams)
     {
@@ -428,7 +428,7 @@ void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
     }
 
     /* DtoH f */
-    cu_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f + adat_begin, 
+    cu_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f + adat_begin,
                       (adat_len)*sizeof(*adat->f), stream);
 
     /* After the non-local D2H is launched the nonlocal_done event can be
@@ -472,8 +472,8 @@ void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
  * polling wait for the GPU.
  */
 static inline bool atomic_cas(volatile unsigned int *ptr,
-                              unsigned int oldval,
-                              unsigned int newval)
+                              unsigned int           oldval,
+                              unsigned int           newval)
 {
     assert(ptr);
 
@@ -491,8 +491,8 @@ void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
                          real *e_lj, real *e_el, rvec *fshift)
 {
     /* NOTE:  only implemented for single-precision at this time */
-    cudaError_t stat;
-    int i, adat_end, iloc = -1;
+    cudaError_t            stat;
+    int                    i, adat_end, iloc = -1;
     volatile unsigned int *poll_word;
 
     /* determine interaction locality from atom locality */
@@ -512,16 +512,16 @@ void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
         gmx_incons(stmp);
     }
 
-    cu_plist_t      *plist   = cu_nb->plist[iloc];
-    cu_timers_t     *timers  = cu_nb->timers;
-    wallclock_gpu_t *timings = cu_nb->timings;
-    nb_staging      nbst     = cu_nb->nbst;
+    cu_plist_t      *plist    = cu_nb->plist[iloc];
+    cu_timers_t     *timers   = cu_nb->timers;
+    wallclock_gpu_t *timings  = cu_nb->timings;
+    nb_staging       nbst     = cu_nb->nbst;
 
-    bool    bCalcEner   = flags & GMX_FORCE_VIRIAL;
-    bool    bCalcFshift = flags & GMX_FORCE_VIRIAL;
+    bool             bCalcEner   = flags & GMX_FORCE_VIRIAL;
+    bool             bCalcFshift = flags & GMX_FORCE_VIRIAL;
 
     /* turn energy calculation always on/off (for debugging/testing only) */
-    bCalcEner = (bCalcEner || always_ener) && !never_ener; 
+    bCalcEner = (bCalcEner || always_ener) && !never_ener;
 
     /* don't launch wait/update timers & counters if there was no work to do
 
@@ -548,7 +548,7 @@ void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
         stat = cudaStreamSynchronize(cu_nb->stream[iloc]);
         CU_RET_ERR(stat, "cudaStreamSynchronize failed in cu_blockwait_nb");
     }
-    else 
+    else
     {
         /* Busy-wait until we get the signal pattern set in last byte
          * of the l/nl float vector. This pattern corresponds to a floating
@@ -557,7 +557,9 @@ void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
          * The polling uses atomic compare-exchange.
          */
         poll_word = (volatile unsigned int*)&nbatom->out[0].f[adat_end*3 - 1];
-        while (atomic_cas(poll_word, poll_wait_pattern, poll_wait_pattern)) {}
+        while (atomic_cas(poll_word, poll_wait_pattern, poll_wait_pattern))
+        {
+        }
     }
 
     /* timing data accumulation */
@@ -576,9 +578,9 @@ void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
 
         /* X/q H2D and F D2H timings */
         timings->nb_h2d_t += cu_event_elapsed(timers->start_nb_h2d[iloc],
-                                                 timers->stop_nb_h2d[iloc]);
+                                              timers->stop_nb_h2d[iloc]);
         timings->nb_d2h_t += cu_event_elapsed(timers->start_nb_d2h[iloc],
-                                                 timers->stop_nb_d2h[iloc]);
+                                              timers->stop_nb_d2h[iloc]);
 
         /* only count atdat and pair-list H2D at pair-search step */
         if (plist->bDoPrune)
@@ -588,11 +590,11 @@ void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
             {
                 timings->pl_h2d_c++;
                 timings->pl_h2d_t += cu_event_elapsed(timers->start_atdat,
-                                                         timers->stop_atdat);
+                                                      timers->stop_atdat);
             }
 
             timings->pl_h2d_t += cu_event_elapsed(timers->start_pl_h2d[iloc],
-                                                     timers->stop_pl_h2d[iloc]);
+                                                  timers->stop_pl_h2d[iloc]);
         }
     }
 
@@ -621,13 +623,13 @@ void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
 }
 
 /*! Return the reference to the nbfp texture. */
-const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_nbfp_texref()
+const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_texref()
 {
     return nbfp_texref;
 }
 
 /*! Return the reference to the coulomb_tab. */
-const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_coulomb_tab_texref()
+const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_coulomb_tab_texref()
 {
     return coulomb_tab_texref;
 }
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
index 5de2e80988..fd75a3d1f0 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
@@ -70,8 +70,8 @@ static unsigned int gpu_min_ci_balanced_factor = 40;
 
 /* Functions from nbnxn_cuda.cu */
 extern void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo);
-extern const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_nbfp_texref();
-extern const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_coulomb_tab_texref();
+extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_texref();
+extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_coulomb_tab_texref();
 
 /* We should actually be using md_print_warn in md_logging.c,
  * but we can't include mpi.h in CUDA code.
@@ -113,9 +113,9 @@ static void init_ewald_coulomb_force_table(cu_nbparam_t          *nbp,
                                            const cuda_dev_info_t *dev_info)
 {
     float       *ftmp, *coul_tab;
-    int         tabsize;
-    double      tabscale;
-    cudaError_t stat;
+    int          tabsize;
+    double       tabscale;
+    cudaError_t  stat;
 
     tabsize     = GPU_EWALD_COULOMB_FORCE_TABLE_SIZE;
     /* Subtract 2 iso 1 to avoid access out of range due to rounding */
@@ -182,7 +182,7 @@ static void init_atomdata_first(cu_atomdata_t *ad, int ntypes)
     cudaError_t stat;
 
     ad->ntypes  = ntypes;
-    stat = cudaMalloc((void**)&ad->shift_vec, SHIFTS*sizeof(*ad->shift_vec));
+    stat        = cudaMalloc((void**)&ad->shift_vec, SHIFTS*sizeof(*ad->shift_vec));
     CU_RET_ERR(stat, "cudaMalloc failed on ad->shift_vec");
     ad->bShiftVecUploaded = false;
 
@@ -259,25 +259,25 @@ static int pick_ewald_kernel_type(bool                   bTwinCut,
 
 
 /*! Initializes the nonbonded parameter data structure. */
-static void init_nbparam(cu_nbparam_t *nbp,
+static void init_nbparam(cu_nbparam_t              *nbp,
                          const interaction_const_t *ic,
-                         const nbnxn_atomdata_t *nbat,
-                         const cuda_dev_info_t *dev_info)
+                         const nbnxn_atomdata_t    *nbat,
+                         const cuda_dev_info_t     *dev_info)
 {
     cudaError_t stat;
     int         ntypes, nnbfp;
 
     ntypes  = nbat->ntype;
 
-    nbp->ewald_beta = ic->ewaldcoeff_q;
-    nbp->sh_ewald   = ic->sh_ewald;
-    nbp->epsfac     = ic->epsfac;
-    nbp->two_k_rf   = 2.0 * ic->k_rf;
-    nbp->c_rf       = ic->c_rf;
-    nbp->rvdw_sq    = ic->rvdw * ic->rvdw;
-    nbp->rcoulomb_sq= ic->rcoulomb * ic->rcoulomb;
-    nbp->rlist_sq   = ic->rlist * ic->rlist;
-    nbp->sh_invrc6  = ic->sh_invrc6;
+    nbp->ewald_beta  = ic->ewaldcoeff_q;
+    nbp->sh_ewald    = ic->sh_ewald;
+    nbp->epsfac      = ic->epsfac;
+    nbp->two_k_rf    = 2.0 * ic->k_rf;
+    nbp->c_rf        = ic->c_rf;
+    nbp->rvdw_sq     = ic->rvdw * ic->rvdw;
+    nbp->rcoulomb_sq = ic->rcoulomb * ic->rcoulomb;
+    nbp->rlist_sq    = ic->rlist * ic->rlist;
+    nbp->sh_invrc6   = ic->sh_invrc6;
 
     if (ic->eeltype == eelCUT)
     {
@@ -287,7 +287,7 @@ static void init_nbparam(cu_nbparam_t *nbp,
     {
         nbp->eeltype = eelCuRF;
     }
-    else if ((EEL_PME(ic->eeltype) || ic->eeltype==eelEWALD))
+    else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD))
     {
         /* Initially rcoulomb == rvdw, so it's surely not twin cut-off. */
         nbp->eeltype = pick_ewald_kernel_type(false, dev_info);
@@ -306,41 +306,41 @@ static void init_nbparam(cu_nbparam_t *nbp,
     }
 
     nnbfp = 2*ntypes*ntypes;
-    stat = cudaMalloc((void **)&nbp->nbfp, nnbfp*sizeof(*nbp->nbfp));
+    stat  = cudaMalloc((void **)&nbp->nbfp, nnbfp*sizeof(*nbp->nbfp));
     CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp");
     cu_copy_H2D(nbp->nbfp, nbat->nbfp, nnbfp*sizeof(*nbp->nbfp));
 
 #ifdef TEXOBJ_SUPPORTED
-        /* Only device CC >= 3.0 (Kepler and later) support texture objects */
-        if (dev_info->prop.major >= 3)
-        {
-            cudaResourceDesc rd;
-            memset(&rd, 0, sizeof(rd));
-            rd.resType                  = cudaResourceTypeLinear;
-            rd.res.linear.devPtr        = nbp->nbfp;
-            rd.res.linear.desc.f        = cudaChannelFormatKindFloat;
-            rd.res.linear.desc.x        = 32;
-            rd.res.linear.sizeInBytes   = nnbfp*sizeof(*nbp->nbfp);
-
-            cudaTextureDesc td;
-            memset(&td, 0, sizeof(td));
-            td.readMode                 = cudaReadModeElementType;
-            stat = cudaCreateTextureObject(&nbp->nbfp_texobj, &rd, &td, NULL);
-            CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_texobj failed");
-        }
-        else
+    /* Only device CC >= 3.0 (Kepler and later) support texture objects */
+    if (dev_info->prop.major >= 3)
+    {
+        cudaResourceDesc rd;
+        memset(&rd, 0, sizeof(rd));
+        rd.resType                  = cudaResourceTypeLinear;
+        rd.res.linear.devPtr        = nbp->nbfp;
+        rd.res.linear.desc.f        = cudaChannelFormatKindFloat;
+        rd.res.linear.desc.x        = 32;
+        rd.res.linear.sizeInBytes   = nnbfp*sizeof(*nbp->nbfp);
+
+        cudaTextureDesc td;
+        memset(&td, 0, sizeof(td));
+        td.readMode                 = cudaReadModeElementType;
+        stat = cudaCreateTextureObject(&nbp->nbfp_texobj, &rd, &td, NULL);
+        CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_texobj failed");
+    }
+    else
 #endif
-        {
-            cudaChannelFormatDesc cd = cudaCreateChannelDesc<float>();
-            stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_texref(),
-                                   nbp->nbfp, &cd, nnbfp*sizeof(*nbp->nbfp));
-            CU_RET_ERR(stat, "cudaBindTexture on nbfp_texref failed");
-        }
+    {
+        cudaChannelFormatDesc cd = cudaCreateChannelDesc<float>();
+        stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_texref(),
+                               nbp->nbfp, &cd, nnbfp*sizeof(*nbp->nbfp));
+        CU_RET_ERR(stat, "cudaBindTexture on nbfp_texref failed");
+    }
 }
 
 /*! Re-generate the GPU Ewald force table, resets rlist, and update the
  *  electrostatic type switching to twin cut-off (or back) if needed. */
-void nbnxn_cuda_pme_loadbal_update_param(nbnxn_cuda_ptr_t cu_nb,
+void nbnxn_cuda_pme_loadbal_update_param(nbnxn_cuda_ptr_t           cu_nb,
                                          const interaction_const_t *ic)
 {
     cu_nbparam_t *nbp = cu_nb->nbparam;
@@ -379,7 +379,7 @@ static void init_plist(cu_plist_t *pl)
 static void init_timers(cu_timers_t *t, bool bUseTwoStreams)
 {
     cudaError_t stat;
-    int eventflags = ( bUseCudaEventBlockingSync ? cudaEventBlockingSync: cudaEventDefault );
+    int         eventflags = ( bUseCudaEventBlockingSync ? cudaEventBlockingSync : cudaEventDefault );
 
     stat = cudaEventCreateWithFlags(&(t->start_atdat), eventflags);
     CU_RET_ERR(stat, "cudaEventCreate on start_atdat failed");
@@ -419,12 +419,12 @@ static void init_timings(wallclock_gpu_t *t)
 
     t->nb_h2d_t = 0.0;
     t->nb_d2h_t = 0.0;
-    t->nb_c    = 0;
+    t->nb_c     = 0;
     t->pl_h2d_t = 0.0;
     t->pl_h2d_c = 0;
     for (i = 0; i < 2; i++)
     {
-        for(j = 0; j < 2; j++)
+        for (j = 0; j < 2; j++)
         {
             t->ktime[i][j].t = 0.0;
             t->ktime[i][j].c = 0;
@@ -432,22 +432,25 @@ static void init_timings(wallclock_gpu_t *t)
     }
 }
 
-void nbnxn_cuda_init(FILE *fplog,
-                     nbnxn_cuda_ptr_t *p_cu_nb,
+void nbnxn_cuda_init(FILE                 *fplog,
+                     nbnxn_cuda_ptr_t     *p_cu_nb,
                      const gmx_gpu_info_t *gpu_info,
-                     const gmx_gpu_opt_t *gpu_opt,
-                     int my_gpu_index,
-                     gmx_bool bLocalAndNonlocal)
+                     const gmx_gpu_opt_t  *gpu_opt,
+                     int                   my_gpu_index,
+                     gmx_bool              bLocalAndNonlocal)
 {
-    cudaError_t stat;
+    cudaError_t       stat;
     nbnxn_cuda_ptr_t  nb;
-    char sbuf[STRLEN];
-    bool bStreamSync, bNoStreamSync, bTMPIAtomics, bX86, bOldDriver;
-    int cuda_drv_ver;
+    char              sbuf[STRLEN];
+    bool              bStreamSync, bNoStreamSync, bTMPIAtomics, bX86, bOldDriver;
+    int               cuda_drv_ver;
 
     assert(gpu_info);
 
-    if (p_cu_nb == NULL) return;
+    if (p_cu_nb == NULL)
+    {
+        return;
+    }
 
     snew(nb, 1);
     snew(nb->atdat, 1);
@@ -516,7 +519,7 @@ void nbnxn_cuda_init(FILE *fplog,
      * With polling wait event-timing also needs to be disabled.
      *
      * The overhead is greatly reduced in API v5.0 drivers and the improvement
-     $ is independent of runtime version. Hence, with API v5.0 drivers and later
+     * is independent of runtime version. Hence, with API v5.0 drivers and later
      * we won't switch to polling.
      *
      * NOTE: Unfortunately, this is known to fail when GPUs are shared by (t)MPI,
@@ -592,8 +595,8 @@ void nbnxn_cuda_init(FILE *fplog,
                         "      However, the polling wait workaround can not be used because\n%s\n"
                         "      Consider updating the driver or turning ECC off.",
                         (bX86 && bTMPIAtomics) ?
-                            "      GPU(s) are being oversubscribed." :
-                            "      atomic operations are not supported by the platform/CPU+compiler.");
+                        "      GPU(s) are being oversubscribed." :
+                        "      atomic operations are not supported by the platform/CPU+compiler.");
                 md_print_warn(fplog, sbuf);
             }
         }
@@ -651,15 +654,15 @@ void nbnxn_cuda_init_const(nbnxn_cuda_ptr_t                cu_nb,
     nbnxn_cuda_clear_e_fshift(cu_nb);
 }
 
-void nbnxn_cuda_init_pairlist(nbnxn_cuda_ptr_t cu_nb,
+void nbnxn_cuda_init_pairlist(nbnxn_cuda_ptr_t        cu_nb,
                               const nbnxn_pairlist_t *h_plist,
-                              int iloc)
+                              int                     iloc)
 {
-    char         sbuf[STRLEN];
-    cudaError_t  stat;
-    bool         bDoTime    = cu_nb->bDoTime;
-    cudaStream_t stream     = cu_nb->stream[iloc];
-    cu_plist_t   *d_plist   = cu_nb->plist[iloc];
+    char          sbuf[STRLEN];
+    cudaError_t   stat;
+    bool          bDoTime    = cu_nb->bDoTime;
+    cudaStream_t  stream     = cu_nb->stream[iloc];
+    cu_plist_t   *d_plist    = cu_nb->plist[iloc];
 
     if (d_plist->na_c < 0)
     {
@@ -682,19 +685,19 @@ void nbnxn_cuda_init_pairlist(nbnxn_cuda_ptr_t cu_nb,
     }
 
     cu_realloc_buffered((void **)&d_plist->sci, h_plist->sci, sizeof(*d_plist->sci),
-                         &d_plist->nsci, &d_plist->sci_nalloc,
-                         h_plist->nsci,
-                         stream, true);
+                        &d_plist->nsci, &d_plist->sci_nalloc,
+                        h_plist->nsci,
+                        stream, true);
 
     cu_realloc_buffered((void **)&d_plist->cj4, h_plist->cj4, sizeof(*d_plist->cj4),
-                         &d_plist->ncj4, &d_plist->cj4_nalloc,
-                         h_plist->ncj4,
-                         stream, true);
+                        &d_plist->ncj4, &d_plist->cj4_nalloc,
+                        h_plist->ncj4,
+                        stream, true);
 
     cu_realloc_buffered((void **)&d_plist->excl, h_plist->excl, sizeof(*d_plist->excl),
-                         &d_plist->nexcl, &d_plist->excl_nalloc,
-                         h_plist->nexcl,
-                         stream, true);
+                        &d_plist->nexcl, &d_plist->excl_nalloc,
+                        h_plist->nexcl,
+                        stream, true);
 
     if (bDoTime)
     {
@@ -706,16 +709,16 @@ void nbnxn_cuda_init_pairlist(nbnxn_cuda_ptr_t cu_nb,
     d_plist->bDoPrune = true;
 }
 
-void nbnxn_cuda_upload_shiftvec(nbnxn_cuda_ptr_t cu_nb,
+void nbnxn_cuda_upload_shiftvec(nbnxn_cuda_ptr_t        cu_nb,
                                 const nbnxn_atomdata_t *nbatom)
 {
-    cu_atomdata_t *adat = cu_nb->atdat;
-    cudaStream_t  ls    = cu_nb->stream[eintLocal];
+    cu_atomdata_t *adat  = cu_nb->atdat;
+    cudaStream_t   ls    = cu_nb->stream[eintLocal];
 
     /* only if we have a dynamic box */
     if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
     {
-        cu_copy_H2D_async(adat->shift_vec, nbatom->shift_vec, 
+        cu_copy_H2D_async(adat->shift_vec, nbatom->shift_vec,
                           SHIFTS * sizeof(*adat->shift_vec), ls);
         adat->bShiftVecUploaded = true;
     }
@@ -724,9 +727,9 @@ void nbnxn_cuda_upload_shiftvec(nbnxn_cuda_ptr_t cu_nb,
 /*! Clears the first natoms_clear elements of the GPU nonbonded force output array. */
 static void nbnxn_cuda_clear_f(nbnxn_cuda_ptr_t cu_nb, int natoms_clear)
 {
-    cudaError_t   stat;
-    cu_atomdata_t *adat = cu_nb->atdat;
-    cudaStream_t  ls    = cu_nb->stream[eintLocal];
+    cudaError_t    stat;
+    cu_atomdata_t *adat  = cu_nb->atdat;
+    cudaStream_t   ls    = cu_nb->stream[eintLocal];
 
     stat = cudaMemsetAsync(adat->f, 0, natoms_clear * sizeof(*adat->f), ls);
     CU_RET_ERR(stat, "cudaMemsetAsync on f falied");
@@ -735,9 +738,9 @@ static void nbnxn_cuda_clear_f(nbnxn_cuda_ptr_t cu_nb, int natoms_clear)
 /*! Clears nonbonded shift force output array and energy outputs on the GPU. */
 static void nbnxn_cuda_clear_e_fshift(nbnxn_cuda_ptr_t cu_nb)
 {
-    cudaError_t   stat;
-    cu_atomdata_t *adat = cu_nb->atdat;
-    cudaStream_t  ls    = cu_nb->stream[eintLocal];
+    cudaError_t    stat;
+    cu_atomdata_t *adat  = cu_nb->atdat;
+    cudaStream_t   ls    = cu_nb->stream[eintLocal];
 
     stat = cudaMemsetAsync(adat->fshift, 0, SHIFTS * sizeof(*adat->fshift), ls);
     CU_RET_ERR(stat, "cudaMemsetAsync on fshift falied");
@@ -750,7 +753,7 @@ static void nbnxn_cuda_clear_e_fshift(nbnxn_cuda_ptr_t cu_nb)
 void nbnxn_cuda_clear_outputs(nbnxn_cuda_ptr_t cu_nb, int flags)
 {
     nbnxn_cuda_clear_f(cu_nb, cu_nb->atdat->natoms);
-    /* clear shift force array and energies if the outputs were 
+    /* clear shift force array and energies if the outputs were
        used in the current step */
     if (flags & GMX_FORCE_VIRIAL)
     {
@@ -758,18 +761,18 @@ void nbnxn_cuda_clear_outputs(nbnxn_cuda_ptr_t cu_nb, int flags)
     }
 }
 
-void nbnxn_cuda_init_atomdata(nbnxn_cuda_ptr_t cu_nb,
+void nbnxn_cuda_init_atomdata(nbnxn_cuda_ptr_t        cu_nb,
                               const nbnxn_atomdata_t *nbat)
 {
-    cudaError_t   stat;
-    int           nalloc, natoms;
-    bool          realloced;
-    bool          bDoTime   = cu_nb->bDoTime;
-    cu_timers_t   *timers   = cu_nb->timers;
-    cu_atomdata_t *d_atdat  = cu_nb->atdat;
-    cudaStream_t  ls        = cu_nb->stream[eintLocal];
-
-    natoms = nbat->natoms;
+    cudaError_t    stat;
+    int            nalloc, natoms;
+    bool           realloced;
+    bool           bDoTime   = cu_nb->bDoTime;
+    cu_timers_t   *timers    = cu_nb->timers;
+    cu_atomdata_t *d_atdat   = cu_nb->atdat;
+    cudaStream_t   ls        = cu_nb->stream[eintLocal];
+
+    natoms    = nbat->natoms;
     realloced = false;
 
     if (bDoTime)
@@ -802,10 +805,10 @@ void nbnxn_cuda_init_atomdata(nbnxn_cuda_ptr_t cu_nb,
         CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->atom_types");
 
         d_atdat->nalloc = nalloc;
-        realloced = true;
+        realloced       = true;
     }
 
-    d_atdat->natoms = natoms;
+    d_atdat->natoms       = natoms;
     d_atdat->natoms_local = nbat->natoms_local;
 
     /* need to clear GPU f output if realloc happened */
@@ -826,13 +829,16 @@ void nbnxn_cuda_init_atomdata(nbnxn_cuda_ptr_t cu_nb,
 
 void nbnxn_cuda_free(nbnxn_cuda_ptr_t cu_nb)
 {
-    cudaError_t     stat;
+    cudaError_t      stat;
     cu_atomdata_t   *atdat;
     cu_nbparam_t    *nbparam;
     cu_plist_t      *plist, *plist_nl;
     cu_timers_t     *timers;
 
-    if (cu_nb == NULL) return;
+    if (cu_nb == NULL)
+    {
+        return;
+    }
 
     atdat       = cu_nb->atdat;
     nbparam     = cu_nb->nbparam;
@@ -957,7 +963,7 @@ void nbnxn_cuda_free(nbnxn_cuda_ptr_t cu_nb)
 
 void cu_synchstream_atdat(nbnxn_cuda_ptr_t cu_nb, int iloc)
 {
-    cudaError_t stat;
+    cudaError_t  stat;
     cudaStream_t stream = cu_nb->stream[iloc];
 
     stat = cudaStreamWaitEvent(stream, cu_nb->timers->stop_atdat, 0);
@@ -980,7 +986,7 @@ void nbnxn_cuda_reset_timings(nbnxn_cuda_ptr_t cu_nb)
 int nbnxn_cuda_min_ci_balanced(nbnxn_cuda_ptr_t cu_nb)
 {
     return cu_nb != NULL ?
-        gpu_min_ci_balanced_factor*cu_nb->dev_info->prop.multiProcessorCount : 0;
+           gpu_min_ci_balanced_factor*cu_nb->dev_info->prop.multiProcessorCount : 0;
 
 }
 
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
index 366ae8c188..33214acd45 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
@@ -60,7 +60,7 @@
     Each thread calculates an i force-component taking one pair of i-j atoms.
  */
 #if __CUDA_ARCH__ >= 350
-__launch_bounds__(64,16)
+__launch_bounds__(64, 16)
 #endif
 #ifdef PRUNE_NBL
 #ifdef CALC_ENERGIES
@@ -75,27 +75,27 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _ener)
 __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
 #endif
 #endif
-            (const cu_atomdata_t atdat,
-             const cu_nbparam_t nbparam,
-             const cu_plist_t plist,
-             bool bCalcFshift)
+(const cu_atomdata_t atdat,
+ const cu_nbparam_t nbparam,
+ const cu_plist_t plist,
+ bool bCalcFshift)
 {
     /* convenience variables */
-    const nbnxn_sci_t *pl_sci   = plist.sci;
+    const nbnxn_sci_t *pl_sci       = plist.sci;
 #ifndef PRUNE_NBL
     const
 #endif
-    nbnxn_cj4_t *pl_cj4         = plist.cj4;
-    const nbnxn_excl_t *excl    = plist.excl;
-    const int *atom_types       = atdat.atom_types;
-    int ntypes                  = atdat.ntypes;
-    const float4 *xq            = atdat.xq;
-    float3 *f                   = atdat.f;
-    const float3 *shift_vec     = atdat.shift_vec;
-    float rcoulomb_sq           = nbparam.rcoulomb_sq;
+    nbnxn_cj4_t        *pl_cj4      = plist.cj4;
+    const nbnxn_excl_t *excl        = plist.excl;
+    const int          *atom_types  = atdat.atom_types;
+    int                 ntypes      = atdat.ntypes;
+    const float4       *xq          = atdat.xq;
+    float3             *f           = atdat.f;
+    const float3       *shift_vec   = atdat.shift_vec;
+    float               rcoulomb_sq = nbparam.rcoulomb_sq;
 #ifdef VDW_CUTOFF_CHECK
-    float rvdw_sq               = nbparam.rvdw_sq;
-    float vdw_in_range;
+    float               rvdw_sq     = nbparam.rvdw_sq;
+    float               vdw_in_range;
 #endif
 #ifdef EL_RF
     float two_k_rf              = nbparam.two_k_rf;
@@ -112,12 +112,12 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
 #endif
 
 #ifdef CALC_ENERGIES
-    float lj_shift    = nbparam.sh_invrc6;
+    float  lj_shift    = nbparam.sh_invrc6;
 #ifdef EL_EWALD_ANY
-    float beta        = nbparam.ewald_beta;
-    float ewald_shift = nbparam.sh_ewald;
+    float  beta        = nbparam.ewald_beta;
+    float  ewald_shift = nbparam.sh_ewald;
 #else
-    float c_rf        = nbparam.c_rf;
+    float  c_rf        = nbparam.c_rf;
 #endif
     float *e_lj       = atdat.e_lj;
     float *e_el       = atdat.e_el;
@@ -130,24 +130,24 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
     unsigned int bidx   = blockIdx.x;
     unsigned int widx   = tidx / WARP_SIZE; /* warp index */
 
-    int sci, ci, cj, ci_offset,
-        ai, aj,
-        cij4_start, cij4_end,
-        typei, typej,
-        i, jm, j4, wexcl_idx;
-    float qi, qj_f,
-          r2, inv_r, inv_r2, inv_r6,
-          c6, c12,
-          int_bit,
+    int          sci, ci, cj, ci_offset,
+                 ai, aj,
+                 cij4_start, cij4_end,
+                 typei, typej,
+                 i, jm, j4, wexcl_idx;
+    float        qi, qj_f,
+                 r2, inv_r, inv_r2, inv_r6,
+                 c6, c12,
+                 int_bit,
+                 F_invr;
 #ifdef CALC_ENERGIES
-          E_lj, E_el, E_lj_p,
+    float        E_lj, E_el, E_lj_p;
 #endif
-          F_invr;
     unsigned int wexcl, imask, mask_ji;
-    float4 xqbuf;
-    float3 xi, xj, rv, f_ij, fcj_buf, fshift_buf;
-    float3 fci_buf[NCL_PER_SUPERCL];    /* i force buffer */
-    nbnxn_sci_t nb_sci;
+    float4       xqbuf;
+    float3       xi, xj, rv, f_ij, fcj_buf, fshift_buf;
+    float3       fci_buf[NCL_PER_SUPERCL]; /* i force buffer */
+    nbnxn_sci_t  nb_sci;
 
     /* shmem buffer for i x+q pre-loading */
     extern __shared__  float4 xqib[];
@@ -185,7 +185,7 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
 #endif
     __syncthreads();
 
-    for(ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
+    for (ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
     {
         fci_buf[ci_offset] = make_float3(0.0f);
     }
@@ -268,11 +268,11 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
 #if !defined PRUNE_NBL && !(CUDA_VERSION < 4010 && (defined EL_EWALD_ANY || defined EL_RF))
 #pragma unroll 8
 #endif
-                    for(i = 0; i < NCL_PER_SUPERCL; i++)
+                    for (i = 0; i < NCL_PER_SUPERCL; i++)
                     {
                         if (imask & mask_ji)
                         {
-                            ci_offset   = i;    /* i force buffer offset */
+                            ci_offset   = i;                     /* i force buffer offset */
 
                             ci      = sci * NCL_PER_SUPERCL + i; /* i cluster index */
                             ai      = ci * CL_SIZE + tidxi;      /* i atom index */
@@ -320,7 +320,7 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
 #else
                                 c6      = tex1Dfetch(nbfp_texref, 2 * (ntypes * typei + typej));
                                 c12     = tex1Dfetch(nbfp_texref, 2 * (ntypes * typei + typej) + 1);
-#endif /* USE_TEXOBJ */
+#endif                          /* USE_TEXOBJ */
 
 
                                 /* avoid NaN for excluded pairs at r=0 */
@@ -344,7 +344,7 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
 #ifdef VDW_CUTOFF_CHECK
                                 /* this enables twin-range cut-offs (rvdw < rcoulomb <= rlist) */
                                 vdw_in_range = (r2 < rvdw_sq) ? 1.0f : 0.0f;
-                                F_invr  *= vdw_in_range;
+                                F_invr      *= vdw_in_range;
 #ifdef CALC_ENERGIES
                                 E_lj_p  *= vdw_in_range;
 #endif
@@ -382,7 +382,7 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
 #ifdef EL_EWALD_ANY
                                 /* 1.0f - erff is faster than erfcf */
                                 E_el    += qi * qj_f * (inv_r * (int_bit - erff(r2 * inv_r * beta)) - int_bit * ewald_shift);
-#endif /* EL_EWALD_ANY */
+#endif                          /* EL_EWALD_ANY */
 #endif
                                 f_ij    = rv * F_invr;
 
@@ -420,7 +420,7 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
     }
 
     /* reduce i forces */
-    for(ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
+    for (ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
     {
         ai  = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
 #ifdef REDUCE_SHUFFLE
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
index 66f16c430e..bf7c8cfacf 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
@@ -58,12 +58,12 @@ static inline __device__
 float interpolate_coulomb_force_r(float r, float scale)
 {
     float   normalized = scale * r;
-    int     index = (int) normalized;
-    float   fract2 = normalized - index;
-    float   fract1 = 1.0f - fract2;
+    int     index      = (int) normalized;
+    float   fract2     = normalized - index;
+    float   fract1     = 1.0f - fract2;
 
-    return  fract1 * tex1Dfetch(coulomb_tab_texref, index)
-            + fract2 * tex1Dfetch(coulomb_tab_texref, index + 1);
+    return fract1 * tex1Dfetch(coulomb_tab_texref, index)
+           + fract2 * tex1Dfetch(coulomb_tab_texref, index + 1);
 }
 
 #ifdef TEXOBJ_SUPPORTED
@@ -72,12 +72,12 @@ float interpolate_coulomb_force_r(cudaTextureObject_t texobj_coulomb_tab,
                                   float r, float scale)
 {
     float   normalized = scale * r;
-    int     index = (int) normalized;
-    float   fract2 = normalized - index;
-    float   fract1 = 1.0f - fract2;
+    int     index      = (int) normalized;
+    float   fract2     = normalized - index;
+    float   fract1     = 1.0f - fract2;
 
-    return  fract1 * tex1Dfetch<float>(texobj_coulomb_tab, index) +
-            fract2 * tex1Dfetch<float>(texobj_coulomb_tab, index + 1);
+    return fract1 * tex1Dfetch<float>(texobj_coulomb_tab, index) +
+           fract2 * tex1Dfetch<float>(texobj_coulomb_tab, index + 1);
 }
 #endif
 
@@ -101,7 +101,7 @@ float pmecorrF(float z2)
     const float FD0 = 1.0f;
 
     float       z4;
-    float       polyFN0,polyFN1,polyFD0,polyFD1;
+    float       polyFN0, polyFN1, polyFD0, polyFD1;
 
     z4          = z2*z2;
 
@@ -344,16 +344,16 @@ void reduce_energy_warp_shfl(float E_lj, float E_el,
 #pragma unroll 5
     for (i = 0; i < 5; i++)
     {
-        E_lj += __shfl_down(E_lj,sh);
-        E_el += __shfl_down(E_el,sh);
-        sh += sh;
+        E_lj += __shfl_down(E_lj, sh);
+        E_el += __shfl_down(E_el, sh);
+        sh   += sh;
     }
 
     /* The first thread in the warp writes the reduced energies */
     if (tidx == 0 || tidx == WARP_SIZE)
     {
-        atomicAdd(e_lj,E_lj);
-        atomicAdd(e_el,E_el);
+        atomicAdd(e_lj, E_lj);
+        atomicAdd(e_el, E_el);
     }
 }
 #endif /* __CUDA_ARCH__ */
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh
index 608f826b77..45640b17c1 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh
@@ -46,14 +46,14 @@
 
 /* Analytical plain cut-off kernels */
 #define EL_CUTOFF
-#define NB_KERNEL_FUNC_NAME(x,...) x##_cutoff##__VA_ARGS__
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _cutoff ## __VA_ARGS__
 #include "nbnxn_cuda_kernel.cuh"
 #undef EL_CUTOFF
 #undef NB_KERNEL_FUNC_NAME
 
 /* Analytical reaction-field kernels */
 #define EL_RF
-#define NB_KERNEL_FUNC_NAME(x,...) x##_rf##__VA_ARGS__
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _rf ## __VA_ARGS__
 #include "nbnxn_cuda_kernel.cuh"
 #undef EL_RF
 #undef NB_KERNEL_FUNC_NAME
@@ -61,7 +61,7 @@
 /* Analytical Ewald interaction kernels
  */
 #define EL_EWALD_ANA
-#define NB_KERNEL_FUNC_NAME(x,...) x##_ewald##__VA_ARGS__
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ewald ## __VA_ARGS__
 #include "nbnxn_cuda_kernel.cuh"
 #undef EL_EWALD_ANA
 #undef NB_KERNEL_FUNC_NAME
@@ -70,7 +70,7 @@
  */
 #define EL_EWALD_ANA
 #define VDW_CUTOFF_CHECK
-#define NB_KERNEL_FUNC_NAME(x,...) x##_ewald_twin##__VA_ARGS__
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ewald_twin ## __VA_ARGS__
 #include "nbnxn_cuda_kernel.cuh"
 #undef EL_EWALD_ANA
 #undef VDW_CUTOFF_CHECK
@@ -78,7 +78,7 @@
 
 /* Tabulated Ewald interaction kernels */
 #define EL_EWALD_TAB
-#define NB_KERNEL_FUNC_NAME(x,...) x##_ewald_tab##__VA_ARGS__
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ewald_tab ## __VA_ARGS__
 #include "nbnxn_cuda_kernel.cuh"
 #undef EL_EWALD_TAB
 #undef NB_KERNEL_FUNC_NAME
@@ -86,7 +86,7 @@
 /* Tabulated Ewald interaction kernels with twin-range cut-off */
 #define EL_EWALD_TAB
 #define VDW_CUTOFF_CHECK
-#define NB_KERNEL_FUNC_NAME(x,...) x##_ewald_tab_twin##__VA_ARGS__
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ewald_tab_twin ## __VA_ARGS__
 #include "nbnxn_cuda_kernel.cuh"
 #undef EL_EWALD_TAB
 #undef VDW_CUTOFF_CHECK
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
index ac9bea9dd2..f8d7a8d076 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
@@ -126,13 +126,13 @@ struct cu_nbparam
     float    sh_invrc6;      /**< LJ potential correction term                      */
 
     /* Non-bonded parameters - accessed through texture memory */
-    float            *nbfp;          /**< nonbonded parameter table with C6/C12 pairs  */
+    float              *nbfp;        /**< nonbonded parameter table with C6/C12 pairs  */
     cudaTextureObject_t nbfp_texobj; /**< texture object bound to nbfp                 */
 
     /* Ewald Coulomb force table data - accessed through texture memory */
-    int               coulomb_tab_size;      /**< table size (s.t. it fits in texture cache) */
-    float             coulomb_tab_scale;     /**< table scale/spacing                        */
-    float            *coulomb_tab;           /**< pointer to the table in the device memory  */
+    int                  coulomb_tab_size;   /**< table size (s.t. it fits in texture cache) */
+    float                coulomb_tab_scale;  /**< table scale/spacing                        */
+    float               *coulomb_tab;        /**< pointer to the table in the device memory  */
     cudaTextureObject_t  coulomb_tab_texobj; /**< texture object bound to coulomb_tab        */
 };
 
diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h
index 1740ccac8b..8ba1726f2e 100644
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h
@@ -159,7 +159,7 @@ gmx_sum_simd2(gmx_mm_pr x, real* b)
     return b[0]+b[1];
 }
 
-#if GMX_SIMD_WIDTH_HERE>=4
+#if GMX_SIMD_WIDTH_HERE >= 4
 static gmx_inline real
 gmx_sum_simd4(gmx_mm_pr4 x, real* b)
 {
diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_mic.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_mic.h
index 0c02fe72a8..c54c6ae79a 100644
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_mic.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_mic.h
@@ -176,7 +176,7 @@ prepare_table_load_buffer(const int *array)
    and low/high half after each other, then simply doing a gather for tab_coul_F and tab_coul_F+1.
    The ording of the 16 elements doesn't matter, so it doesn't help to get FD sorted as odd/even
    instead of low/high.
-*/
+ */
 static gmx_inline void
 load_table_f(const real *tab_coul_F, gmx_epi32 ti_S, int *ti,
              gmx_mm_ps *ctab0_S, gmx_mm_ps *ctab1_S)
diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_inner.h b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_inner.h
index 1422295d5e..bbff9ed368 100644
--- a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_inner.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_inner.h
@@ -701,7 +701,7 @@
 #else
     fscal_S0    = gmx_mul_pr(rinvsq_S0,
                              (
-                                        gmx_sub_pr(FrLJ12_S0, FrLJ6_S0)));
+                                 gmx_sub_pr(FrLJ12_S0, FrLJ6_S0)));
 #endif
 #else
     fscal_S0    = gmx_mul_pr(rinvsq_S0, frcoul_S0);
@@ -714,7 +714,7 @@
 #else
     fscal_S2    = gmx_mul_pr(rinvsq_S2,
                              (
-                                        gmx_sub_pr(FrLJ12_S2, FrLJ6_S2)));
+                                 gmx_sub_pr(FrLJ12_S2, FrLJ6_S2)));
 #endif
 #else
     /* Atom 2 and 3 don't have LJ, so only add Coulomb forces */
diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h
index 6da784bc8c..0817d76793 100644
--- a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h
@@ -374,7 +374,7 @@
 #endif
 #endif
 #else /* EXCL_FORCES */
-    /* No exclusion forces: remove all excluded atom pairs from the list */
+      /* No exclusion forces: remove all excluded atom pairs from the list */
     wco_S0      = gmx_and_pb(wco_S0, interact_S0);
     wco_S1      = gmx_and_pb(wco_S1, interact_S1);
     wco_S2      = gmx_and_pb(wco_S2, interact_S2);
@@ -906,7 +906,7 @@
 #else
     fscal_S0    = gmx_mul_pr(rinvsq_S0,
                              (
-                                        gmx_sub_pr(FrLJ12_S0, FrLJ6_S0)));
+                                 gmx_sub_pr(FrLJ12_S0, FrLJ6_S0)));
 #endif
 #ifdef CALC_COULOMB
     fscal_S1    = gmx_mul_pr(rinvsq_S1,
@@ -915,7 +915,7 @@
 #else
     fscal_S1    = gmx_mul_pr(rinvsq_S1,
                              (
-                                        gmx_sub_pr(FrLJ12_S1, FrLJ6_S1)));
+                                 gmx_sub_pr(FrLJ12_S1, FrLJ6_S1)));
 #endif
 #else
     fscal_S0    = gmx_mul_pr(rinvsq_S0, frcoul_S0);
@@ -929,7 +929,7 @@
 #else
     fscal_S2    = gmx_mul_pr(rinvsq_S2,
                              (
-                                        gmx_sub_pr(FrLJ12_S2, FrLJ6_S2)));
+                                 gmx_sub_pr(FrLJ12_S2, FrLJ6_S2)));
 #endif
 #ifdef CALC_COULOMB
     fscal_S3    = gmx_mul_pr(rinvsq_S3,
@@ -938,7 +938,7 @@
 #else
     fscal_S3    = gmx_mul_pr(rinvsq_S3,
                              (
-                                        gmx_sub_pr(FrLJ12_S3, FrLJ6_S3)));
+                                 gmx_sub_pr(FrLJ12_S3, FrLJ6_S3)));
 #endif
 #else
     /* Atom 2 and 3 don't have LJ, so only add Coulomb forces */
diff --git a/src/gromacs/mdlib/ns.c b/src/gromacs/mdlib/ns.c
index 5d0e997afe..f776001169 100644
--- a/src/gromacs/mdlib/ns.c
+++ b/src/gromacs/mdlib/ns.c
@@ -405,9 +405,9 @@ static inline void new_i_nblist(t_nblist *nlist, atom_id i_atom, int shift, int
     else
     {
         /* Adding to previous list. First remove possible previous padding */
-        if(nlist->simd_padding_width>1)
+        if (nlist->simd_padding_width > 1)
         {
-            while(nlist->nrj>0 && nlist->jjnr[nlist->nrj-1]<0)
+            while (nlist->nrj > 0 && nlist->jjnr[nlist->nrj-1] < 0)
             {
                 nlist->nrj--;
             }
diff --git a/src/gromacs/mdlib/pme_simd4.h b/src/gromacs/mdlib/pme_simd4.h
index d432a82a5b..1b6f0b0314 100644
--- a/src/gromacs/mdlib/pme_simd4.h
+++ b/src/gromacs/mdlib/pme_simd4.h
@@ -117,14 +117,14 @@
     for (ithx = 0; (ithx < 4); ithx++)
     {
         index_x  = (i0+ithx)*pny*pnz;
-        tx_S   = gmx_simd4_set1_pr(thx[ithx]);
-        dx_S   = gmx_simd4_set1_pr(dthx[ithx]);
+        tx_S     = gmx_simd4_set1_pr(thx[ithx]);
+        dx_S     = gmx_simd4_set1_pr(dthx[ithx]);
 
         for (ithy = 0; (ithy < 4); ithy++)
         {
             index_xy = index_x+(j0+ithy)*pnz;
-            ty_S   = gmx_simd4_set1_pr(thy[ithy]);
-            dy_S   = gmx_simd4_set1_pr(dthy[ithy]);
+            ty_S     = gmx_simd4_set1_pr(thy[ithy]);
+            dy_S     = gmx_simd4_set1_pr(dthy[ithy]);
 
             gval_S = gmx_simd4_loadu_pr(grid+index_xy+k0);
 
@@ -155,8 +155,8 @@
  * This code supports pme_order <= 5.
  */
 {
-    int    offset;
-    int    index;
+    int          offset;
+    int          index;
     gmx_simd4_pr ty_S0, ty_S1, ty_S2, ty_S3, ty_S4;
     gmx_simd4_pr tz_S0;
     gmx_simd4_pr tz_S1;
@@ -263,7 +263,7 @@
  * This code supports pme_order <= 5.
  */
 {
-    int    offset;
+    int          offset;
 
     real         fx_tmp[4], fy_tmp[4], fz_tmp[4];
 
@@ -316,14 +316,14 @@
     for (ithx = 0; (ithx < PME_ORDER); ithx++)
     {
         index_x  = (i0+ithx)*pny*pnz;
-        tx_S   = gmx_simd4_set1_pr(thx[ithx]);
-        dx_S   = gmx_simd4_set1_pr(dthx[ithx]);
+        tx_S     = gmx_simd4_set1_pr(thx[ithx]);
+        dx_S     = gmx_simd4_set1_pr(dthx[ithx]);
 
         for (ithy = 0; (ithy < PME_ORDER); ithy++)
         {
             index_xy = index_x+(j0+ithy)*pnz;
-            ty_S   = gmx_simd4_set1_pr(thy[ithy]);
-            dy_S   = gmx_simd4_set1_pr(dthy[ithy]);
+            ty_S     = gmx_simd4_set1_pr(thy[ithy]);
+            dy_S     = gmx_simd4_set1_pr(dthy[ithy]);
 
             gval_S0 = gmx_simd4_load_pr(grid+index_xy+k0-offset);
             gval_S1 = gmx_simd4_load_pr(grid+index_xy+k0-offset+4);
diff --git a/src/gromacs/mdlib/qm_orca.c b/src/gromacs/mdlib/qm_orca.c
index 0d5535a2d1..f9f87ce69c 100644
--- a/src/gromacs/mdlib/qm_orca.c
+++ b/src/gromacs/mdlib/qm_orca.c
@@ -108,10 +108,10 @@ void init_orca(t_QMrec *qm)
 
 void write_orca_input(t_forcerec *fr, t_QMrec *qm, t_MMrec *mm)
 {
-    int i;
+    int        i;
     t_QMMMrec *QMMMrec;
-    FILE *out, *pcFile, *addInputFile, *LJCoeff;
-    char *buf, *orcaInput, *addInputFilename, *LJCoeffFilename, *pcFilename, *exclInName, *exclOutName;
+    FILE      *out, *pcFile, *addInputFile, *LJCoeff;
+    char      *buf, *orcaInput, *addInputFilename, *LJCoeffFilename, *pcFilename, *exclInName, *exclOutName;
 
     QMMMrec = fr->qr;
 
diff --git a/src/gromacs/simd/macros.h b/src/gromacs/simd/macros.h
index f1f848ba15..ca1e499a61 100644
--- a/src/gromacs/simd/macros.h
+++ b/src/gromacs/simd/macros.h
@@ -757,10 +757,10 @@ static gmx_inline gmx_mm_pr gmx_always_inline gmx_atan2_pr(gmx_mm_pr a, gmx_mm_p
 
 static gmx_inline gmx_mm_pr gmx_always_inline gmx_erfc_pr(gmx_mm_pr a)
 {
-  /* The BG/Q qpxmath.h vector math library intended for use with
-     bgclang does not have erfc, so we need to use a function from
-     mass_simd.h. If this changes, then the #include <mass_simd.h> can
-     become conditional. */ 
+    /* The BG/Q qpxmath.h vector math library intended for use with
+       bgclang does not have erfc, so we need to use a function from
+       mass_simd.h. If this changes, then the #include <mass_simd.h> can
+       become conditional. */
 #ifndef GMX_DOUBLE
     return erfcf4(a);
 #else
diff --git a/src/gromacs/simd/math_single.h b/src/gromacs/simd/math_single.h
index dc42e0ca0c..377855c549 100644
--- a/src/gromacs/simd/math_single.h
+++ b/src/gromacs/simd/math_single.h
@@ -56,7 +56,7 @@ gmx_invsqrt_pr(gmx_mm_pr x)
     const gmx_mm_pr three = gmx_set1_pr(3.0);
 
     gmx_mm_pr       lu = gmx_rsqrt_pr(x);
-    
+
     return gmx_mul_pr(half, gmx_mul_pr(gmx_sub_pr(three, gmx_mul_pr(gmx_mul_pr(lu, lu), x)), lu));
 #endif
 }
diff --git a/src/gromacs/tools/dump.c b/src/gromacs/tools/dump.c
index 17aee79cf0..452f2c5e39 100644
--- a/src/gromacs/tools/dump.c
+++ b/src/gromacs/tools/dump.c
@@ -319,15 +319,15 @@ void list_xtc(const char *fn)
 
 /*! \brief Callback used by list_tng_for_gmx_dump. */
 static void list_tng_inner(const char *fn,
-                    gmx_bool bFirstFrame,
-                    real *values,
-                    gmx_int64_t step, 
-                    double frame_time,
-                    gmx_int64_t n_values_per_frame,
-                    gmx_int64_t n_atoms,
-                    real prec,
-                    gmx_int64_t nframe,
-                    char *block_name)
+                           gmx_bool    bFirstFrame,
+                           real       *values,
+                           gmx_int64_t step,
+                           double      frame_time,
+                           gmx_int64_t n_values_per_frame,
+                           gmx_int64_t n_atoms,
+                           real        prec,
+                           gmx_int64_t nframe,
+                           char       *block_name)
 {
     char                 buf[256];
     int                  indent = 0;
diff --git a/src/gromacs/utility/init.cpp b/src/gromacs/utility/init.cpp
index 03569a6d99..2413ca5b7d 100644
--- a/src/gromacs/utility/init.cpp
+++ b/src/gromacs/utility/init.cpp
@@ -63,8 +63,8 @@ namespace gmx
 namespace
 {
 #ifdef GMX_LIB_MPI
-    //! Maintains global counter of attempts to initialize MPI
-    int g_initializationCounter = 0;
+//! Maintains global counter of attempts to initialize MPI
+int g_initializationCounter = 0;
 #endif
 }
 
diff --git a/src/programs/mdrun/tests/compressed_x_output.cpp b/src/programs/mdrun/tests/compressed_x_output.cpp
index 2b12049968..bfaf6b8cf5 100644
--- a/src/programs/mdrun/tests/compressed_x_output.cpp
+++ b/src/programs/mdrun/tests/compressed_x_output.cpp
@@ -52,7 +52,7 @@ namespace
 
 //! Test fixture for mdrun -x
 class CompressedXOutputTest : public gmx::test::MdrunTestFixture,
-                      public testing::WithParamInterface<const char*>
+                              public testing::WithParamInterface<const char*>
 {
 };
 
diff --git a/src/programs/mdrun/tests/rerun.cpp b/src/programs/mdrun/tests/rerun.cpp
index 8ba5d6b559..d1df11f8a3 100644
--- a/src/programs/mdrun/tests/rerun.cpp
+++ b/src/programs/mdrun/tests/rerun.cpp
@@ -92,7 +92,7 @@ const char *trajectoryFileNames[] = {
 
 INSTANTIATE_TEST_CASE_P(NoFatalErrorFrom,
                         MdrunRerun,
-                        ::testing::ValuesIn(gmx::ArrayRef<const char*>(trajectoryFileNames)));
+                            ::testing::ValuesIn(gmx::ArrayRef<const char*>(trajectoryFileNames)));
 
 /*! \todo Add other tests for mdrun -rerun, e.g.
  *
-- 
2.22.0