Uncrustify all files

author Roland Schulz <roland@utk.edu>

Sat, 18 Jan 2014 04:47:30 +0000 (23:47 -0500)

committer Gerrit Code Review <gerrit@gerrit.gromacs.org>

Wed, 29 Jan 2014 19:19:43 +0000 (20:19 +0100)
author Roland Schulz <roland@utk.edu>
Sat, 18 Jan 2014 04:47:30 +0000 (23:47 -0500)
committer Gerrit Code Review <gerrit@gerrit.gromacs.org>
Wed, 29 Jan 2014 19:19:43 +0000 (20:19 +0100)
diff --git a/.gitattributes b/.gitattributes

index af6711a0b682cef2b80dc3afdc2e15349d66ef56..1e099235375a89d81893605945efaa2def6ad0a2 100644 (file)
--- a/.gitattributes
+++ b/.gitattributes
@@ -14,7 +14,7 @@ CMakeLists.txt  filter=copyright
  *.tex           filter=copyright
  *.bm            filter=copyright
  # Exceptions: extra files to include
-admin/uncrustify.sh                     filter=copyright
+admin/*.sh                              filter=copyright
  admin/git-pre-commit                    filter=copyright
  # Exceptions: files to exclude
  *.pc.cmakein                            !filter
diff --git a/admin/reformat_all.sh b/admin/reformat_all.sh

new file mode 100755 (executable)

index 0000000..23611ea
--- /dev/null
+++ b/admin/reformat_all.sh
@@ -0,0 +1,118 @@
+#!/bin/bash
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2014, by the GROMACS development team, led by
+# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+# and including many others, as listed in the AUTHORS file in the
+# top-level source directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+
+function usage() {
+    echo "usage: reformat_all.sh [-f|--force]"
+    echo "           [--filter=(uncrustify|copyright)] [<action>]"
+    echo "<action>: (list-files|uncrustify*|copyright) (*=default)"
+}
+
+filter=default
+force=
+action=uncrustify
+for arg in "$@" ; do
+    if [[ "$arg" == "list-files" || "$arg" == "uncrustify" ||
+          "$arg" == "copyright" ]] ; then
+        action=$arg
+    elif [[ "$arg" == --filter=* ]] ; then
+        filter=${arg#--filter=}
+    elif [[ "$arg" == "-f" || "$arg" == "--force" ]] ; then
+        force=1
+    else
+        echo "Unknown option: $arg"
+        echo
+        usage
+        exit 2
+    fi
+done
+
+if [[ ! "$force" && "$action" != "list-files" ]] ; then
+    if ! git diff-files --quiet ; then
+        echo "Modified files found in work tree. Use -f to override."
+        exit 1
+    fi
+fi
+
+case "$action" in
+    list-files)
+        command=cat
+        ;;
+    uncrustify)
+        # Check that uncrustify is present
+        if [ -z "$UNCRUSTIFY" ] ; then
+            echo "Please set the path to uncrustify using UNCRUSTIFY."
+            echo "Note that you need a custom version of uncrustify."
+            echo "See comments in the script file for how to get one."
+            exit 2
+        fi
+        if ! which "$UNCRUSTIFY" 1>/dev/null ; then
+            echo "Uncrustify not found: $UNCRUSTIFY"
+            exit 2
+        fi
+        command="xargs $UNCRUSTIFY -c admin/uncrustify.cfg --no-backup"
+        ;;
+    copyright)
+        command="xargs admin/copyright.py --check"
+        ;;
+    *)
+        echo "Unknown action: $action"
+        exit 2
+esac
+
+if [[ "$filter" == "default" ]] ; then
+    filter=$action
+fi
+
+case "$filter" in
+    uncrustify)
+        filter_re="(uncrustify|uncrustify_only)"
+        ;;
+    copyright)
+        filter_re="(uncrustify|copyright)"
+        ;;
+    *)
+        echo "Unknown filter mode: $filter"
+        echo
+        usage
+        exit 2
+esac
+
+cd `git rev-parse --show-toplevel`
+
+if ! git ls-tree -r --name-only HEAD | git check-attr --stdin filter | \
+    sed -nEe "/${filter_re}$/ {s/:.*//;p;}" | $command ; then
+    echo "The reformatting command failed! Please check the output."
+    exit 1
+fi
diff --git a/src/gromacs/fileio/gmxfio_int.h b/src/gromacs/fileio/gmxfio_int.h

index b0ffbc3349178888cff4974cbf3bb95c2c45b07c..9399a966b91934962b16734fe35a8a86ef02b52d 100644 (file)
--- a/src/gromacs/fileio/gmxfio_int.h
+++ b/src/gromacs/fileio/gmxfio_int.h
@@ -85,14 +85,14 @@ struct t_fileio
                      bDebug,            /* the file ops should come with debug info */
                      bStdio,            /* the file is actually stdin or stdout */
                      bReadWrite;        /* the file is open for reading and writing */
-    char       *fn;                    /* the file name */
-    XDR        *xdr;                   /* the xdr data pointer */
-    enum xdr_op xdrmode;               /* the xdr mode */
-    int         iFTP;                  /* the file type identifier */
+    char        *fn;                   /* the file name */
+    XDR         *xdr;                  /* the xdr data pointer */
+    enum xdr_op  xdrmode;              /* the xdr mode */
+    int          iFTP;                 /* the file type identifier */
  
-    const char *comment;               /* a comment string for debugging */
+    const char  *comment;              /* a comment string for debugging */
  
-    t_fileio   *next, *prev;           /* next and previous file pointers in the
+    t_fileio    *next, *prev;          /* next and previous file pointers in the
                                            linked list */
      tMPI_Lock_t  mtx;                  /* content locking mutex. This is a fast lock
                                            for performance reasons: in some cases every
diff --git a/src/gromacs/fileio/mdoutf.c b/src/gromacs/fileio/mdoutf.c

index eb08027c559977db439d9b355e0d58bbd33bc698..3e17d3b1119e91e335d4328fba02b05d78e901eb 100644 (file)
--- a/src/gromacs/fileio/mdoutf.c
+++ b/src/gromacs/fileio/mdoutf.c
@@ -86,10 +86,10 @@ gmx_mdoutf_t init_mdoutf(int nfile, const t_filenm fnm[], int mdrun_flags,
      of->fp_dhdl      = NULL;
      of->fp_field     = NULL;
  
-    of->eIntegrator     = ir->eI;
-    of->bExpanded       = ir->bExpanded;
-    of->elamstats       = ir->expandedvals->elamstats;
-    of->simulation_part = ir->simulation_part;
+    of->eIntegrator             = ir->eI;
+    of->bExpanded               = ir->bExpanded;
+    of->elamstats               = ir->expandedvals->elamstats;
+    of->simulation_part         = ir->simulation_part;
      of->x_compression_precision = ir->x_compression_precision;
  
      if (MASTER(cr))
@@ -190,8 +190,8 @@ gmx_mdoutf_t init_mdoutf(int nfile, const t_filenm fnm[], int mdrun_flags,
             trajectory-writing routines later. Also, XTC writing needs
             to know what (and how many) atoms might be in the XTC
             groups, and how to look up later which ones they are. */
-        of->natoms_global = top_global->natoms;
-        of->groups        = &top_global->groups;
+        of->natoms_global       = top_global->natoms;
+        of->groups              = &top_global->groups;
          of->natoms_x_compressed = 0;
          for (i = 0; (i < top_global->natoms); i++)
          {
diff --git a/src/gromacs/fileio/timecontrol.c b/src/gromacs/fileio/timecontrol.c

index 6d9aed68246a28f3ceb7aaef0e75a697beedc1d0..1f084a1a0db2dc34b4acd1a58c8a7a51cdbf283f 100644 (file)
--- a/src/gromacs/fileio/timecontrol.c
+++ b/src/gromacs/fileio/timecontrol.c
@@ -54,7 +54,7 @@ typedef struct {
      gmx_bool bSet;
  } t_timecontrol;
  
-static t_timecontrol timecontrol[TNR] = {
+static t_timecontrol       timecontrol[TNR] = {
      { 0, FALSE },
      { 0, FALSE },
      { 0, FALSE }
diff --git a/src/gromacs/fileio/tngio.cpp b/src/gromacs/fileio/tngio.cpp

index 44f68752bfa8d381a0c20aabfe7d442dcb8563d0..62c274df62f2861ae492cfe0ce9398c27188efd6 100644 (file)
--- a/src/gromacs/fileio/tngio.cpp
+++ b/src/gromacs/fileio/tngio.cpp
@@ -692,9 +692,9 @@ void gmx_fwrite_tng(tng_trajectory_t tng,
  #else
      static write_data_func_pointer           write_data           = tng_util_generic_with_time_write;
  #endif
-    double                  elapsedSeconds = elapsedPicoSeconds * PICO;
-    gmx_int64_t             nParticles;
-    char                    compression;
+    double                                   elapsedSeconds = elapsedPicoSeconds * PICO;
+    gmx_int64_t                              nParticles;
+    char                                     compression;
  
  
      if (!tng)
diff --git a/src/gromacs/fileio/tngio_for_tools.cpp b/src/gromacs/fileio/tngio_for_tools.cpp

index 9784f890dc51eb490beb8d86e9dc26c197be573d..eea391694a204f641ea009b7b6b4686f257da283 100644 (file)
--- a/src/gromacs/fileio/tngio_for_tools.cpp
+++ b/src/gromacs/fileio/tngio_for_tools.cpp
@@ -527,22 +527,22 @@ gmx_bool gmx_read_next_tng_frame(tng_trajectory_t            input,
                  for (int i = 0; i < DIM; i++)
                  {
                      convert_array_to_real_array((char *)(values) + size * i * DIM,
-                                                    (real *) fr->box[i],
-                                                    getDistanceScaleFactor(input),
-                                                    1,
-                                                    DIM,
-                                                    datatype);
+                                                (real *) fr->box[i],
+                                                getDistanceScaleFactor(input),
+                                                1,
+                                                DIM,
+                                                datatype);
                  }
                  fr->bBox = TRUE;
                  break;
              case TNG_TRAJ_POSITIONS:
                  srenew(fr->x, fr->natoms);
                  convert_array_to_real_array(values,
-                                                (real *) fr->x,
-                                                getDistanceScaleFactor(input),
-                                                fr->natoms,
-                                                DIM,
-                                                datatype);
+                                            (real *) fr->x,
+                                            getDistanceScaleFactor(input),
+                                            fr->natoms,
+                                            DIM,
+                                            datatype);
                  fr->bX = TRUE;
                  tng_util_frame_current_compression_get(input, blockId, &codecId, &prec);
                  /* This must be updated if/when more lossy compression methods are added */
@@ -555,11 +555,11 @@ gmx_bool gmx_read_next_tng_frame(tng_trajectory_t            input,
              case TNG_TRAJ_VELOCITIES:
                  srenew(fr->v, fr->natoms);
                  convert_array_to_real_array(values,
-                                                (real *) fr->v,
-                                                getDistanceScaleFactor(input),
-                                                fr->natoms,
-                                                DIM,
-                                                datatype);
+                                            (real *) fr->v,
+                                            getDistanceScaleFactor(input),
+                                            fr->natoms,
+                                            DIM,
+                                            datatype);
                  fr->bV = TRUE;
                  tng_util_frame_current_compression_get(input, blockId, &codecId, &prec);
                  /* This must be updated if/when more lossy compression methods are added */
@@ -572,11 +572,11 @@ gmx_bool gmx_read_next_tng_frame(tng_trajectory_t            input,
              case TNG_TRAJ_FORCES:
                  srenew(fr->f, fr->natoms);
                  convert_array_to_real_array(values,
-                                                (real *) fr->f,
-                                                getDistanceScaleFactor(input),
-                                                fr->natoms,
-                                                DIM,
-                                                datatype);
+                                            (real *) fr->f,
+                                            getDistanceScaleFactor(input),
+                                            fr->natoms,
+                                            DIM,
+                                            datatype);
                  fr->bF = TRUE;
                  break;
              case TNG_GMX_LAMBDA:
@@ -824,11 +824,11 @@ gmx_bool gmx_get_tng_data_next_frame_of_block_type(tng_trajectory_t     input,
      }
      snew(*values, sizeof(real) * *nValuesPerFrame * *nAtoms);
      convert_array_to_real_array(data,
-                                    *values,
-                                    getDistanceScaleFactor(input),
-                                    *nAtoms,
-                                    *nValuesPerFrame,
-                                    datatype);
+                                *values,
+                                getDistanceScaleFactor(input),
+                                *nAtoms,
+                                *nValuesPerFrame,
+                                datatype);
  
      tng_util_frame_current_compression_get(input, blockId, &codecId, &localPrec);
  
diff --git a/src/gromacs/gmxana/cmat.c b/src/gromacs/gmxana/cmat.c

index ee8438f32bf94ec574f6f5a6e67b72166f2f25a2..c0029e27d5c4797ea286c34955e5ae698ad83301 100644 (file)
--- a/src/gromacs/gmxana/cmat.c
+++ b/src/gromacs/gmxana/cmat.c
@@ -72,19 +72,19 @@ void copy_t_mat(t_mat *dst, t_mat *src)
  
      if (dst->nn != src->nn)
      {
-        fprintf(stderr, "t_mat structures not identical in size dst %d src %d\n",dst->nn,src->nn);
+        fprintf(stderr, "t_mat structures not identical in size dst %d src %d\n", dst->nn, src->nn);
          return;
      }
      dst->maxrms = src->maxrms;
      dst->minrms = src->minrms;
      dst->sumrms = src->sumrms;
-    for(i = 0; (i < src->nn); i++)
+    for (i = 0; (i < src->nn); i++)
      {
-        for(j = 0; (j < src->nn); j++)
+        for (j = 0; (j < src->nn); j++)
          {
              dst->mat[i][j] = src->mat[i][j];
          }
-        dst->erow[i] = src->erow[i];
+        dst->erow[i]  = src->erow[i];
          dst->m_ind[i] = src->m_ind[i];
      }
  }
diff --git a/src/gromacs/gmxlib/cuda_tools/copyrite_gpu.cu b/src/gromacs/gmxlib/cuda_tools/copyrite_gpu.cu

index 26eda4ed6595901167dea866be7013c32a119802..aa204263febfb1a4bebf6b2b9331e5793d3831ce 100644 (file)
--- a/src/gromacs/gmxlib/cuda_tools/copyrite_gpu.cu
+++ b/src/gromacs/gmxlib/cuda_tools/copyrite_gpu.cu
@@ -45,13 +45,13 @@
  
  void gmx_print_version_info_gpu(FILE *fp)
  {
-    int cuda_driver,cuda_runtime;
-    fprintf(fp, "CUDA compiler:      %s\n",CUDA_NVCC_COMPILER_INFO);
-    fprintf(fp, "CUDA compiler flags:%s\n",CUDA_NVCC_COMPILER_FLAGS);
+    int cuda_driver, cuda_runtime;
+    fprintf(fp, "CUDA compiler:      %s\n", CUDA_NVCC_COMPILER_INFO);
+    fprintf(fp, "CUDA compiler flags:%s\n", CUDA_NVCC_COMPILER_FLAGS);
      cuda_driver = 0;
      cudaDriverGetVersion(&cuda_driver);
      cuda_runtime = 0;
      cudaRuntimeGetVersion(&cuda_runtime);
-    fprintf(fp, "CUDA driver:        %d.%d\n",cuda_driver/1000, cuda_driver%100);
-    fprintf(fp, "CUDA runtime:       %d.%d\n",cuda_runtime/1000, cuda_runtime%100);
+    fprintf(fp, "CUDA driver:        %d.%d\n", cuda_driver/1000, cuda_driver%100);
+    fprintf(fp, "CUDA runtime:       %d.%d\n", cuda_runtime/1000, cuda_runtime%100);
  }
diff --git a/src/gromacs/gmxlib/cuda_tools/cudautils.cu b/src/gromacs/gmxlib/cuda_tools/cudautils.cu

index 467c3ce0bb3a1143af52daa634d8217e1ec02412..e8f32b2332b685f763bfd8f80e81d7289e495934 100644 (file)
--- a/src/gromacs/gmxlib/cuda_tools/cudautils.cu
+++ b/src/gromacs/gmxlib/cuda_tools/cudautils.cu
@@ -46,13 +46,15 @@
   *
   *  The copy is launched in stream s or if not specified, in stream 0.
   */
-static int cu_copy_D2H_generic(void * h_dest, void * d_src, size_t bytes, 
+static int cu_copy_D2H_generic(void * h_dest, void * d_src, size_t bytes,
                                 bool bAsync = false, cudaStream_t s = 0)
  {
      cudaError_t stat;
-    
+
      if (h_dest == NULL || d_src == NULL || bytes == 0)
+    {
          return -1;
+    }
  
      if (bAsync)
      {
@@ -83,9 +85,11 @@ int cu_copy_D2H_async(void * h_dest, void * d_src, size_t bytes, cudaStream_t s
  }
  
  int cu_copy_D2H_alloc(void ** h_dest, void * d_src, size_t bytes)
-{ 
+{
      if (h_dest == NULL || d_src == NULL || bytes == 0)
+    {
          return -1;
+    }
  
      smalloc(*h_dest, bytes);
  
@@ -96,13 +100,15 @@ int cu_copy_D2H_alloc(void ** h_dest, void * d_src, size_t bytes)
   *
   *  The copy is launched in stream s or if not specified, in stream 0.
   */
-static int cu_copy_H2D_generic(void * d_dest, void * h_src, size_t bytes, 
+static int cu_copy_H2D_generic(void * d_dest, void * h_src, size_t bytes,
                                 bool bAsync = false, cudaStream_t s = 0)
  {
      cudaError_t stat;
  
      if (d_dest == NULL || h_src == NULL || bytes == 0)
+    {
          return -1;
+    }
  
      if (bAsync)
      {
@@ -119,7 +125,7 @@ static int cu_copy_H2D_generic(void * d_dest, void * h_src, size_t bytes,
  }
  
  int cu_copy_H2D(void * d_dest, void * h_src, size_t bytes)
-{   
+{
      return cu_copy_H2D_generic(d_dest, h_src, bytes, false);
  }
  
@@ -127,7 +133,7 @@ int cu_copy_H2D(void * d_dest, void * h_src, size_t bytes)
   *  The copy is launched in stream s or if not specified, in stream 0.
   */
  int cu_copy_H2D_async(void * d_dest, void * h_src, size_t bytes, cudaStream_t s = 0)
-{   
+{
      return cu_copy_H2D_generic(d_dest, h_src, bytes, true, s);
  }
  
@@ -136,7 +142,9 @@ int cu_copy_H2D_alloc(void ** d_dest, void * h_src, size_t bytes)
      cudaError_t stat;
  
      if (d_dest == NULL || h_src == NULL || bytes == 0)
+    {
          return -1;
+    }
  
      stat = cudaMalloc(d_dest, bytes);
      CU_RET_ERR(stat, "cudaMalloc failed in cu_copy_H2D_alloc");
@@ -146,7 +154,7 @@ int cu_copy_H2D_alloc(void ** d_dest, void * h_src, size_t bytes)
  
  float cu_event_elapsed(cudaEvent_t start, cudaEvent_t end)
  {
-    float t = 0.0;
+    float       t = 0.0;
      cudaError_t stat;
  
      stat = cudaEventElapsedTime(&t, start, end);
@@ -165,10 +173,10 @@ int cu_wait_event(cudaEvent_t e)
      return 0;
  }
  
-/*! 
+/*!
   *  If time != NULL it also calculates the time elapsed between start and end and
   *  return this is milliseconds.
- */ 
+ */
  int cu_wait_event_time(cudaEvent_t end, cudaEvent_t start, float *time)
  {
      cudaError_t s;
@@ -211,11 +219,11 @@ void cu_free_buffered(void *d_ptr, int *n, int *nalloc)
  }
  
  /*!
- *  Reallocation of the memory pointed by d_ptr and copying of the data from 
- *  the location pointed by h_src host-side pointer is done. Allocation is 
- *  buffered and therefore freeing is only needed if the previously allocated 
+ *  Reallocation of the memory pointed by d_ptr and copying of the data from
+ *  the location pointed by h_src host-side pointer is done. Allocation is
+ *  buffered and therefore freeing is only needed if the previously allocated
   *  space is not enough.
- *  The H2D copy is launched in stream s and can be done synchronously or 
+ *  The H2D copy is launched in stream s and can be done synchronously or
   *  asynchronously (the default is the latter).
   */
  void cu_realloc_buffered(void **d_dest, void *h_src,
@@ -232,7 +240,7 @@ void cu_realloc_buffered(void **d_dest, void *h_src,
          return;
      }
  
-    /* reallocate only if the data does not fit = allocation size is smaller 
+    /* reallocate only if the data does not fit = allocation size is smaller
         than the current requested size */
      if (req_size > *curr_alloc_size)
      {
diff --git a/src/gromacs/gmxlib/cuda_tools/cudautils.cuh b/src/gromacs/gmxlib/cuda_tools/cudautils.cuh

index 6ede841f3d5931d759d78cff87a5ef053b50f6f4..2cccc16c6714da4ead8366567f5d4089529decc3 100644 (file)
--- a/src/gromacs/gmxlib/cuda_tools/cudautils.cuh
+++ b/src/gromacs/gmxlib/cuda_tools/cudautils.cuh
@@ -46,16 +46,16 @@
     with them (e.g. expected warp size = 32, check against the dev_info->props.warpsize). */
  #define WARP_SIZE           32
  
-/* TODO error checking needs to be rewritten. We have 2 types of error checks needed 
-   based on where they occur in the code: 
-   - non performance-critical: these errors are unsafe to be ignored and must be 
+/* TODO error checking needs to be rewritten. We have 2 types of error checks needed
+   based on where they occur in the code:
+   - non performance-critical: these errors are unsafe to be ignored and must be
       _always_ checked for, e.g. initializations
     - performance critical: handling errors might hurt performance so care need to be taken
-     when/if we should check for them at all, e.g. in cu_upload_X. However, we should be 
+     when/if we should check for them at all, e.g. in cu_upload_X. However, we should be
       able to turn the check for these errors on!
  
-  Probably we'll need two sets of the macros below... 
- 
+   Probably we'll need two sets of the macros below...
+
   */
  #define CHECK_CUDA_ERRORS
  
@@ -79,8 +79,8 @@
          } \
      } while (0)
  
-/*! Check for any previously occurred uncaught CUDA error 
-  -- aimed at use after kernel calls. */
+/*! Check for any previously occurred uncaught CUDA error
+   -- aimed at use after kernel calls. */
  #define CU_LAUNCH_ERR(msg) \
      do { \
          cudaError_t _CU_LAUNCH_ERR_status = cudaGetLastError(); \
@@ -89,8 +89,8 @@
          } \
      } while (0)
  
-/*! Synchronize with GPU and check for any previously occurred uncaught CUDA error 
-  -- aimed at use after kernel calls. */
+/*! Synchronize with GPU and check for any previously occurred uncaught CUDA error
+   -- aimed at use after kernel calls. */
  #define CU_LAUNCH_ERR_SYNC(msg) \
      do { \
          cudaError_t _CU_SYNC_LAUNCH_ERR_status = cudaThreadSynchronize(); \
@@ -106,7 +106,7 @@
  #define CU_LAUNCH_ERR(msg)      do { } while (0)
  #define CU_LAUNCH_ERR_SYNC(msg) do { } while (0)
  
-#endif /* CHECK_CUDA_ERRORS */ 
+#endif /* CHECK_CUDA_ERRORS */
  
  #ifdef __cplusplus
  extern "C" {
diff --git a/src/gromacs/gmxlib/cuda_tools/pmalloc_cuda.cu b/src/gromacs/gmxlib/cuda_tools/pmalloc_cuda.cu

index 4e2933dd46d4e099e6507d7330bd44b008d4c394..2566f9c4184f71d4a1e6ab2959a0157d50bc7b25 100644 (file)
--- a/src/gromacs/gmxlib/cuda_tools/pmalloc_cuda.cu
+++ b/src/gromacs/gmxlib/cuda_tools/pmalloc_cuda.cu
@@ -40,8 +40,8 @@
  #include "cudautils.cuh"
  #include "pmalloc_cuda.h"
  
-/*! Allocates nbytes of page-locked memory. 
- *  This memory should always be freed using pfree (or with the page-locked 
+/*! Allocates nbytes of page-locked memory.
+ *  This memory should always be freed using pfree (or with the page-locked
   *  free functions provied by the CUDA library).
   */
  void pmalloc(void **h_ptr, size_t nbytes)
@@ -58,13 +58,13 @@ void pmalloc(void **h_ptr, size_t nbytes)
  
      CU_CHECK_PREV_ERR();
  
-    stat = cudaMallocHost(h_ptr, nbytes, flag);    
+    stat = cudaMallocHost(h_ptr, nbytes, flag);
      sprintf(strbuf, "cudaMallocHost of size %d bytes failed", (int)nbytes);
-    CU_RET_ERR(stat, strbuf);  
+    CU_RET_ERR(stat, strbuf);
  }
  
-/*! Allocates nbytes of page-locked memory with write-combining. 
- *  This memory should always be freed using pfree (or with the page-locked 
+/*! Allocates nbytes of page-locked memory with write-combining.
+ *  This memory should always be freed using pfree (or with the page-locked
   *  free functions provied by the CUDA library).
   */
  void pmalloc_wc(void **h_ptr, size_t nbytes)
@@ -81,21 +81,21 @@ void pmalloc_wc(void **h_ptr, size_t nbytes)
  
      CU_CHECK_PREV_ERR();
  
-    stat = cudaMallocHost(h_ptr, nbytes, flag);    
+    stat = cudaMallocHost(h_ptr, nbytes, flag);
      sprintf(strbuf, "cudaMallocHost of size %d bytes failed", (int)nbytes);
-    CU_RET_ERR(stat, strbuf);  
+    CU_RET_ERR(stat, strbuf);
  }
  
  /*! Frees page locked memory allocated with pmalloc.
- *  This function can safely be called also with a pointer to a page-locked 
+ *  This function can safely be called also with a pointer to a page-locked
   *  memory allocated directly with CUDA API calls.
   */
-void pfree(void *h_ptr) 
+void pfree(void *h_ptr)
  {
-    cudaError_t stat; 
+    cudaError_t stat;
  
      if (h_ptr == NULL)
-    {        
+    {
          return;
      }
  
diff --git a/src/gromacs/gmxlib/cuda_tools/vectype_ops.cuh b/src/gromacs/gmxlib/cuda_tools/vectype_ops.cuh

index a1413c17b643049dfda14c48609422c33cf96d8a..c7935510c73467d5c979b8de392da215b589f3ed 100644 (file)
--- a/src/gromacs/gmxlib/cuda_tools/vectype_ops.cuh
+++ b/src/gromacs/gmxlib/cuda_tools/vectype_ops.cuh
@@ -65,15 +65,15 @@ inline __host__ __device__ float3 operator*(float k, float3 a)
  {
      return make_float3(k * a.x, k * a.y, k * a.z);
  }
-inline __host__ __device__ void operator+=(float3 &a, float3 b)
+inline __host__ __device__ void operator += (float3 &a, float3 b)
  {
      a.x += b.x; a.y += b.y; a.z += b.z;
  }
-inline __host__ __device__ void operator+=(float3 &a, float4 b)
+inline __host__ __device__ void operator += (float3 &a, float4 b)
  {
      a.x += b.x; a.y += b.y; a.z += b.z;
  }
-inline __host__ __device__ void operator-=(float3 &a, float3 b)
+inline __host__ __device__ void operator -= (float3 &a, float3 b)
  {
      a.x -= b.x; a.y -= b.y; a.z -= b.z;
  }
@@ -93,7 +93,7 @@ inline __host__ __device__ float3 operator*(float3 a, float3 b)
  {
      return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
  }
-inline __host__ __device__ void operator*=(float3 &a, float3 b)
+inline __host__ __device__ void operator *= (float3 &a, float3 b)
  {
      a.x *= b.x; a.y *= b.y; a.z *= b.z;
  }
@@ -130,15 +130,15 @@ inline __host__ __device__ float4 operator*(float4 a, float k)
  {
      return make_float4(k * a.x, k * a.y, k * a.z, k * a.w);
  }
-inline __host__ __device__ void operator+=(float4 &a, float4 b)
+inline __host__ __device__ void operator += (float4 &a, float4 b)
  {
      a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
  }
-inline __host__ __device__ void operator+=(float4 &a, float3 b)
+inline __host__ __device__ void operator += (float4 &a, float3 b)
  {
      a.x += b.x; a.y += b.y; a.z += b.z;
  }
-inline __host__ __device__ void operator-=(float4 &a, float3 b)
+inline __host__ __device__ void operator -= (float4 &a, float3 b)
  {
      a.x -= b.x; a.y -= b.y; a.z -= b.z;
  }
diff --git a/src/gromacs/gmxlib/gmx_detect_hardware.c b/src/gromacs/gmxlib/gmx_detect_hardware.c

index 4332d1a70103735ae9b42e9efe86b388c0ab510c..2e56119e473fd6697683cb67cd612f9b901c4d76 100644 (file)
--- a/src/gromacs/gmxlib/gmx_detect_hardware.c
+++ b/src/gromacs/gmxlib/gmx_detect_hardware.c
@@ -307,7 +307,7 @@ void gmx_check_hw_runconf_consistency(FILE                *fplog,
          ngpu_comp = hwinfo->gpu_info.ncuda_dev_compatible;
          ngpu_use  = hw_opt->gpu_opt.ncuda_dev_use;
  
-        sprintf(gpu_comp_plural, "%s", (ngpu_comp> 1) ? "s" : "");
+        sprintf(gpu_comp_plural, "%s", (ngpu_comp > 1) ? "s" : "");
          sprintf(gpu_use_plural,  "%s", (ngpu_use > 1) ? "s" : "");
  
          /* number of tMPI threads auto-adjusted */
diff --git a/src/gromacs/gmxlib/gmx_fatal.c b/src/gromacs/gmxlib/gmx_fatal.c

index 58e980b3270e1ffd8c80e5d8da2a29de9a37215b..1481bde2ee6efba78ff33e4d9a8851e5fc02c954 100644 (file)
--- a/src/gromacs/gmxlib/gmx_fatal.c
+++ b/src/gromacs/gmxlib/gmx_fatal.c
@@ -57,9 +57,9 @@
  
  #include "gromacs/legacyheaders/thread_mpi/threads.h"
  
-static gmx_bool bDebug         = FALSE;
-static char    *fatal_tmp_file = NULL;
-static FILE    *log_file       = NULL;
+static gmx_bool            bDebug         = FALSE;
+static char               *fatal_tmp_file = NULL;
+static FILE               *log_file       = NULL;
  
  static tMPI_Thread_mutex_t debug_mutex     = TMPI_THREAD_MUTEX_INITIALIZER;
  static tMPI_Thread_mutex_t where_mutex     = TMPI_THREAD_MUTEX_INITIALIZER;
@@ -299,10 +299,10 @@ void gmx_fatal_collective(int f_errno, const char *file, int line,
                            const char *fmt, ...)
  {
      gmx_bool    bFinalize;
-    va_list ap;
-    char    msg[STRLEN];
+    va_list     ap;
+    char        msg[STRLEN];
  #ifdef GMX_MPI
-    int     result;
+    int         result;
  #endif
  
      bFinalize = TRUE;
@@ -388,8 +388,8 @@ void _unexpected_eof(const char *fn, int line, const char *srcfn, int srcline)
   * 0 to 3 of these filed are redirected to /dev/null
   *
   */
-FILE *debug           = NULL;
-gmx_bool gmx_debug_at = FALSE;
+FILE    *debug           = NULL;
+gmx_bool gmx_debug_at    = FALSE;
  
  void init_debug(const int dbglevel, const char *dbgfile)
  {
@@ -454,7 +454,7 @@ void doexceptions(void)
  
  static const char *gmxuser = "Please report this to the mailing list (gmx-users@gromacs.org)";
  
-static void (*gmx_error_handler)(const char *msg) = quit_gmx;
+static void        (*gmx_error_handler)(const char *msg) = quit_gmx;
  
  void set_gmx_error_handler(void (*func)(const char *msg))
  {
@@ -483,8 +483,8 @@ char *gmx_strerror(const char *key)
          { "range",  "Range checking error" }
      };
  #define NMSG asize(msg)
-    char buf[1024];
-    size_t i;
+    char        buf[1024];
+    size_t      i;
  
      if (key == NULL)
      {
@@ -514,9 +514,9 @@ char *gmx_strerror(const char *key)
  
  void _gmx_error(const char *key, const char *msg, const char *file, int line)
  {
-    char buf[10240], errerrbuf[1024];
+    char        buf[10240], errerrbuf[1024];
      const char *llines = "-------------------------------------------------------";
-    char *strerr;
+    char       *strerr;
  
      /* protect the audience from suggestive discussions */
  
@@ -564,7 +564,7 @@ void _range_check(int n, int n_min, int n_max, const char *warn_str,
  void gmx_warning(const char *fmt, ...)
  {
      va_list ap;
-    char msg[STRLEN];
+    char    msg[STRLEN];
  
      va_start(ap, fmt);
      vsprintf(msg, fmt, ap);
diff --git a/src/gromacs/gmxlib/gpu_utils/gpu_utils.cu b/src/gromacs/gmxlib/gpu_utils/gpu_utils.cu

index 03d9ac2ea964a81803e4c152446805c048b81877..5c80ac2f2d773e82cbd357960c8abfba6af97c20 100644 (file)
--- a/src/gromacs/gmxlib/gpu_utils/gpu_utils.cu
+++ b/src/gromacs/gmxlib/gpu_utils/gpu_utils.cu
@@ -45,57 +45,67 @@
  #include "../cuda_tools/cudautils.cuh"
  #include "memtestG80_core.h"
  
+/** Amount of memory to be used in quick memtest. */
+#define QUICK_MEM       250
+/** Bit flag with type of tests to run in quick memtest. */
+#define QUICK_TESTS     MOD_20_32BIT | LOGIC_4_ITER_SHMEM | RANDOM_BLOCKS
+/** Number of iterations in quick memtest. */
+#define QUICK_ITER      3
  
-#define QUICK_MEM       250 /*!< Amount of memory to be used in quick memtest. */
-#define QUICK_TESTS     MOD_20_32BIT | LOGIC_4_ITER_SHMEM | RANDOM_BLOCKS /*!< Bit flag with type of tests
-                                                                            to run in quick memtest. */
-#define QUICK_ITER      3 /*!< Number of iterations in quick memtest. */
+/** Bitflag with all test set on for full memetest. */
+#define FULL_TESTS      0x3FFF
+/** Number of iterations in full memtest. */
+#define FULL_ITER       25
  
-#define FULL_TESTS      0x3FFF /*!<  Bitflag with all test set on for full memetest. */
-#define FULL_ITER       25 /*!< Number of iterations in full memtest. */
+/** Bit flag with type of tests to run in time constrained memtest. */
+#define TIMED_TESTS     MOD_20_32BIT | LOGIC_4_ITER_SHMEM | RANDOM_BLOCKS
  
-#define TIMED_TESTS     MOD_20_32BIT | LOGIC_4_ITER_SHMEM | RANDOM_BLOCKS /*!< Bit flag with type of tests to
-                                                                            run in time constrained memtest. */
-
-static int cuda_max_device_count = 32; /*! Max number of devices supported by CUDA (for consistency checking).
-                                           In reality it 16 with CUDA <=v5.0, but let's stay on the safe side. */
+/*! \brief
+ * Max number of devices supported by CUDA (for consistency checking).
+ *
+ * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
+ */
+static int cuda_max_device_count = 32;
  
-/*! Dummy kernel used for sanity checking. */
-__global__ void k_dummy_test(){}
+/** Dummy kernel used for sanity checking. */
+__global__ void k_dummy_test()
+{
+}
  
  
-/*! Bit-flags which refer to memtestG80 test types and are used in do_memtest to specify which tests to run. */
+/** Bit-flags which refer to memtestG80 test types and are used in do_memtest
+ * to specify which tests to run. */
  enum memtest_G80_test_types {
-    MOVING_INVERSIONS_10 =      0x1,
-    MOVING_INVERSIONS_RAND =    0x2,
-    WALKING_8BIT_M86 =          0x4,
-    WALKING_0_8BIT =            0x8,
-    WALKING_1_8BIT =            0x10,
-    WALKING_0_32BIT =           0x20,
-    WALKING_1_32BIT =           0x40,
-    RANDOM_BLOCKS =             0x80,
-    MOD_20_32BIT =              0x100,
-    LOGIC_1_ITER =              0x200,
-    LOGIC_4_ITER =              0x400,
-    LOGIC_1_ITER_SHMEM =        0x800,
-    LOGIC_4_ITER_SHMEM =        0x1000
+    MOVING_INVERSIONS_10   = 0x1,
+    MOVING_INVERSIONS_RAND = 0x2,
+    WALKING_8BIT_M86       = 0x4,
+    WALKING_0_8BIT         = 0x8,
+    WALKING_1_8BIT         = 0x10,
+    WALKING_0_32BIT        = 0x20,
+    WALKING_1_32BIT        = 0x40,
+    RANDOM_BLOCKS          = 0x80,
+    MOD_20_32BIT           = 0x100,
+    LOGIC_1_ITER           = 0x200,
+    LOGIC_4_ITER           = 0x400,
+    LOGIC_1_ITER_SHMEM     = 0x800,
+    LOGIC_4_ITER_SHMEM     = 0x1000
  };
  
  
-/*! 
-  * \brief Runs GPU sanity checks.
-  *
-  * Runs a series of checks to determine that the given GPU and underlying CUDA
-  * driver/runtime functions properly.
-  * Returns properties of a device with given ID or the one that has
-  * already been initialized earlier in the case if of \dev_id == -1.
-  *
-  * \param[in]  dev_id      the device ID of the GPU or -1 if the device has already been initialized
-  * \param[out] dev_prop    pointer to the structure in which the device properties will be returned
-  * \returns                0 if the device looks OK
-  *
-  * TODO: introduce errors codes and handle errors more smoothly.
-  */
+/*!
+ * \brief Runs GPU sanity checks.
+ *
+ * Runs a series of checks to determine that the given GPU and underlying CUDA
+ * driver/runtime functions properly.
+ * Returns properties of a device with given ID or the one that has
+ * already been initialized earlier in the case if of \dev_id == -1.
+ *
+ * \param[in]  dev_id      the device ID of the GPU or -1 if the device has already been initialized
+ * \param[out] dev_prop    pointer to the structure in which the device properties will be returned
+ * \returns                0 if the device looks OK
+ *
+ * TODO: introduce errors codes and handle errors more smoothly.
+ */
  static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
  {
      cudaError_t cu_err;
@@ -104,18 +114,22 @@ static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
      cu_err = cudaGetDeviceCount(&dev_count);
      if (cu_err != cudaSuccess)
      {
-       fprintf(stderr, "Error %d while querying device count: %s\n", cu_err,
-               cudaGetErrorString(cu_err));
+        fprintf(stderr, "Error %d while querying device count: %s\n", cu_err,
+                cudaGetErrorString(cu_err));
          return -1;
      }
  
      /* no CUDA compatible device at all */
      if (dev_count == 0)
+    {
          return -1;
+    }
  
      /* things might go horribly wrong if cudart is not compatible with the driver */
      if (dev_count < 0 || dev_count > cuda_max_device_count)
+    {
          return -1;
+    }
  
      if (dev_id == -1) /* device already selected let's not destroy the context */
      {
@@ -149,10 +163,14 @@ static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
  
      /* both major & minor is 9999 if no CUDA capable devices are present */
      if (dev_prop->major == 9999 && dev_prop->minor == 9999)
+    {
          return -1;
+    }
      /* we don't care about emulation mode */
      if (dev_prop->major == 0)
+    {
          return -1;
+    }
  
      if (id != -1)
      {
@@ -166,7 +184,7 @@ static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
      }
  
      /* try to execute a dummy kernel */
-    k_dummy_test<<<1, 512>>>();
+    k_dummy_test<<< 1, 512>>> ();
      if (cudaThreadSynchronize() != cudaSuccess)
      {
          return -1;
@@ -212,7 +230,9 @@ static int do_memtest(unsigned int which_tests, int megs, int iter)
  
      // let's try to allocate the mem
      while (!tester.allocate(megs) && (megs - 10 > 0))
-        { megs -= 10; tester.deallocate(); }
+    {
+        megs -= 10; tester.deallocate();
+    }
  
      if (megs <= 10)
      {
@@ -229,25 +249,31 @@ static int do_memtest(unsigned int which_tests, int megs, int iter)
          {
              tester.gpuMovingInversionsOnesZeros(err_count);
              if (err_count > 0)
+            {
                  return MOVING_INVERSIONS_10;
+            }
          }
          // Moving Inversions (random)
          if ((MOVING_INVERSIONS_RAND & which_tests) == MOVING_INVERSIONS_RAND)
          {
              tester.gpuMovingInversionsRandom(err_count);
              if (err_count > 0)
+            {
                  return MOVING_INVERSIONS_RAND;
+            }
          }
-       // Memtest86 Walking 8-bit
+        // Memtest86 Walking 8-bit
          if ((WALKING_8BIT_M86 & which_tests) == WALKING_8BIT_M86)
          {
              for (uint shift = 0; shift < 8; shift++)
              {
                  tester.gpuWalking8BitM86(err_count, shift);
                  if (err_count > 0)
+                {
                      return WALKING_8BIT_M86;
+                }
              }
-      }
+        }
          // True Walking zeros (8-bit)
          if ((WALKING_0_8BIT & which_tests) == WALKING_0_8BIT)
          {
@@ -255,7 +281,9 @@ static int do_memtest(unsigned int which_tests, int megs, int iter)
              {
                  tester.gpuWalking8Bit(err_count, false, shift);
                  if (err_count > 0)
+                {
                      return WALKING_0_8BIT;
+                }
              }
          }
          // True Walking ones (8-bit)
@@ -265,7 +293,9 @@ static int do_memtest(unsigned int which_tests, int megs, int iter)
              {
                  tester.gpuWalking8Bit(err_count, true, shift);
                  if (err_count > 0)
+                {
                      return WALKING_1_8BIT;
+                }
              }
          }
          // Memtest86 Walking zeros (32-bit)
@@ -275,25 +305,31 @@ static int do_memtest(unsigned int which_tests, int megs, int iter)
              {
                  tester.gpuWalking32Bit(err_count, false, shift);
                  if (err_count > 0)
+                {
                      return WALKING_0_32BIT;
+                }
              }
          }
-       // Memtest86 Walking ones (32-bit)
+        // Memtest86 Walking ones (32-bit)
          if ((WALKING_1_32BIT & which_tests) == WALKING_1_32BIT)
          {
              for (uint shift = 0; shift < 32; shift++)
              {
                  tester.gpuWalking32Bit(err_count, true, shift);
                  if (err_count > 0)
+                {
                      return WALKING_1_32BIT;
+                }
              }
-       }
+        }
          // Random blocks
          if ((RANDOM_BLOCKS & which_tests) == RANDOM_BLOCKS)
          {
-            tester.gpuRandomBlocks(err_count,rand());
+            tester.gpuRandomBlocks(err_count, rand());
              if (err_count > 0)
+            {
                  return RANDOM_BLOCKS;
+            }
  
          }
  
@@ -304,37 +340,47 @@ static int do_memtest(unsigned int which_tests, int megs, int iter)
              {
                  tester.gpuModuloX(err_count, shift, rand(), 20, 2);
                  if (err_count > 0)
+                {
                      return MOD_20_32BIT;
+                }
              }
          }
          // Logic (one iteration)
          if ((LOGIC_1_ITER & which_tests) == LOGIC_1_ITER)
          {
-            tester.gpuShortLCG0(err_count,1);
+            tester.gpuShortLCG0(err_count, 1);
              if (err_count > 0)
+            {
                  return LOGIC_1_ITER;
+            }
          }
          // Logic (4 iterations)
          if ((LOGIC_4_ITER & which_tests) == LOGIC_4_ITER)
          {
-            tester.gpuShortLCG0(err_count,4);
+            tester.gpuShortLCG0(err_count, 4);
              if (err_count > 0)
+            {
                  return LOGIC_4_ITER;
+            }
  
          }
          // Logic (shared memory, one iteration)
          if ((LOGIC_1_ITER_SHMEM & which_tests) == LOGIC_1_ITER_SHMEM)
          {
-            tester.gpuShortLCG0Shmem(err_count,1);
+            tester.gpuShortLCG0Shmem(err_count, 1);
              if (err_count > 0)
+            {
                  return LOGIC_1_ITER_SHMEM;
+            }
          }
          // Logic (shared-memory, 4 iterations)
          if ((LOGIC_4_ITER_SHMEM & which_tests) == LOGIC_4_ITER_SHMEM)
          {
-            tester.gpuShortLCG0Shmem(err_count,4);
+            tester.gpuShortLCG0Shmem(err_count, 4);
              if (err_count > 0)
+            {
                  return LOGIC_4_ITER_SHMEM;
+            }
          }
      }
  
@@ -353,9 +399,12 @@ static int do_memtest(unsigned int which_tests, int megs, int iter)
  int do_quick_memtest(int dev_id)
  {
      cudaDeviceProp  dev_prop;
-    int             devmem, res, time=0;
+    int             devmem, res, time = 0;
  
-    if (debug) { time = getTimeMilliseconds(); }
+    if (debug)
+    {
+        time = getTimeMilliseconds();
+    }
  
      if (do_sanity_checks(dev_id, &dev_prop) != 0)
      {
@@ -367,7 +416,7 @@ int do_quick_memtest(int dev_id)
      {
          devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
          fprintf(debug, ">> Running QUICK memtests on %d MiB (out of total %d MiB), %d iterations\n",
-            QUICK_MEM, devmem, QUICK_ITER);
+                QUICK_MEM, devmem, QUICK_ITER);
      }
  
      res = do_memtest(QUICK_TESTS, QUICK_MEM, QUICK_ITER);
@@ -379,7 +428,10 @@ int do_quick_memtest(int dev_id)
      }
  
      /* destroy context only if we created it */
-    if (dev_id !=-1) cudaThreadExit();
+    if (dev_id != -1)
+    {
+        cudaThreadExit();
+    }
      return res;
  }
  
@@ -395,9 +447,12 @@ int do_quick_memtest(int dev_id)
  int do_full_memtest(int dev_id)
  {
      cudaDeviceProp  dev_prop;
-    int             devmem, res, time=0;
+    int             devmem, res, time = 0;
  
-    if (debug) { time = getTimeMilliseconds(); }
+    if (debug)
+    {
+        time = getTimeMilliseconds();
+    }
  
      if (do_sanity_checks(dev_id, &dev_prop) != 0)
      {
@@ -407,10 +462,10 @@ int do_full_memtest(int dev_id)
  
      devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
  
-    if (debug) 
-    { 
+    if (debug)
+    {
          fprintf(debug, ">> Running FULL memtests on %d MiB (out of total %d MiB), %d iterations\n",
-            devmem, devmem, FULL_ITER); 
+                devmem, devmem, FULL_ITER);
      }
  
      /* do all test on the entire memory */
@@ -423,7 +478,10 @@ int do_full_memtest(int dev_id)
      }
  
      /* destroy context only if we created it */
-    if (dev_id != -1) cudaThreadExit();
+    if (dev_id != -1)
+    {
+        cudaThreadExit();
+    }
      return res;
  }
  
@@ -440,12 +498,15 @@ int do_full_memtest(int dev_id)
  int do_timed_memtest(int dev_id, int time_constr)
  {
      cudaDeviceProp  dev_prop;
-    int             devmem, res=0, time=0, startt;
+    int             devmem, res = 0, time = 0, startt;
  
-    if (debug) { time = getTimeMilliseconds(); }
+    if (debug)
+    {
+        time = getTimeMilliseconds();
+    }
  
      time_constr *= 1000;  /* convert to ms for convenience */
-    startt = getTimeMilliseconds();
+    startt       = getTimeMilliseconds();
  
      if (do_sanity_checks(dev_id, &dev_prop) != 0)
      {
@@ -455,18 +516,21 @@ int do_timed_memtest(int dev_id, int time_constr)
  
      devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
  
-    if (debug) 
-    { 
+    if (debug)
+    {
          fprintf(debug, ">> Running time constrained memtests on %d MiB (out of total %d MiB), time limit of %d s \n",
-        devmem, devmem, time_constr); 
+                devmem, devmem, time_constr);
      }
  
-    /* do the TIMED_TESTS set, one step at a time on the entire memory 
+    /* do the TIMED_TESTS set, one step at a time on the entire memory
         that can be allocated, and stop when the given time is exceeded */
      while ( ((int)getTimeMilliseconds() - startt) < time_constr)
-    {        
+    {
          res = do_memtest(TIMED_TESTS, devmem, 1);
-        if (res != 0) break;
+        if (res != 0)
+        {
+            break;
+        }
      }
  
      if (debug)
@@ -476,7 +540,10 @@ int do_timed_memtest(int dev_id, int time_constr)
      }
  
      /* destroy context only if we created it */
-    if (dev_id != -1) cudaThreadExit();
+    if (dev_id != -1)
+    {
+        cudaThreadExit();
+    }
      return res;
  }
  
@@ -497,8 +564,8 @@ gmx_bool init_gpu(int mygpu, char *result_str,
                    const gmx_gpu_opt_t *gpu_opt)
  {
      cudaError_t stat;
-    char sbuf[STRLEN];
-    int gpuid;
+    char        sbuf[STRLEN];
+    int         gpuid;
  
      assert(gpu_info);
      assert(result_str);
@@ -507,7 +574,7 @@ gmx_bool init_gpu(int mygpu, char *result_str,
      {
          sprintf(sbuf, "Trying to initialize an inexistent GPU: "
                  "there are %d %s-selected GPU(s), but #%d was requested.",
-                 gpu_opt->ncuda_dev_use, gpu_opt->bUserSet ? "user" : "auto", mygpu);
+                gpu_opt->ncuda_dev_use, gpu_opt->bUserSet ? "user" : "auto", mygpu);
          gmx_incons(sbuf);
      }
  
@@ -643,9 +710,9 @@ static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
   */
  int detect_cuda_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
  {
-    int             i, ndev, checkres, retval;
-    cudaError_t     stat;
-    cudaDeviceProp  prop;
+    int              i, ndev, checkres, retval;
+    cudaError_t      stat;
+    cudaDeviceProp   prop;
      cuda_dev_info_t *devs;
  
      assert(gpu_info);
@@ -666,7 +733,7 @@ int detect_cuda_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
           * busy in exclusive mode, or some other condition which should
           * result in us issuing a warning a falling back to CPUs. */
          retval = -1;
-        s = cudaGetErrorString(stat);
+        s      = cudaGetErrorString(stat);
          strncpy(err_str, s, STRLEN*sizeof(err_str[0]));
      }
      else
@@ -708,9 +775,9 @@ int detect_cuda_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
   * \param[in,out] gpu_opt     pointer to structure holding GPU options
   */
  void pick_compatible_gpus(const gmx_gpu_info_t *gpu_info,
-                          gmx_gpu_opt_t *gpu_opt)
+                          gmx_gpu_opt_t        *gpu_opt)
  {
-    int i, ncompat;
+    int  i, ncompat;
      int *compat;
  
      assert(gpu_info);
@@ -746,11 +813,11 @@ void pick_compatible_gpus(const gmx_gpu_info_t *gpu_info,
   * \param[out]  gpu_opt     pointer to structure holding GPU options
   * \returns                 TRUE if every the requested GPUs are compatible
   */
-gmx_bool check_selected_cuda_gpus(int *checkres,
+gmx_bool check_selected_cuda_gpus(int                  *checkres,
                                    const gmx_gpu_info_t *gpu_info,
-                                  gmx_gpu_opt_t *gpu_opt)
+                                  gmx_gpu_opt_t        *gpu_opt)
  {
-    int i, id;
+    int  i, id;
      bool bAllOk;
  
      assert(checkres);
@@ -820,7 +887,7 @@ void get_gpu_device_info_string(char *s, const gmx_gpu_info_t *gpu_info, int ind
  
      cuda_dev_info_t *dinfo = &gpu_info->cuda_dev[index];
  
-    bool bGpuExists =
+    bool             bGpuExists =
          dinfo->stat == egpuCompatible ||
          dinfo->stat == egpuIncompatible;
  
@@ -852,8 +919,8 @@ void get_gpu_device_info_string(char *s, const gmx_gpu_info_t *gpu_info, int ind
   * \returns                 device ID of the requested GPU
   */
  int get_gpu_device_id(const gmx_gpu_info_t *gpu_info,
-                      const gmx_gpu_opt_t *gpu_opt,
-                      int idx)
+                      const gmx_gpu_opt_t  *gpu_opt,
+                      int                   idx)
  {
      assert(gpu_info);
      assert(gpu_opt);
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/kernelutil_sparc64_hpc_ace_double.h b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/kernelutil_sparc64_hpc_ace_double.h

index 0be09945105d25905f5e1e864ffe56bccbcaae90..6f01d2dfef650b9e613aa8d19f21250fc6c6c978 100644 (file)
--- a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/kernelutil_sparc64_hpc_ace_double.h
+++ b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/kernelutil_sparc64_hpc_ace_double.h
@@ -38,30 +38,30 @@
  /* Fujitsu header borrows the name from SSE2, since some instructions have aliases */
  #include <emmintrin.h>
  
-#define GMX_FJSP_SHUFFLE2(x,y) (((x)<<1) | (y))
+#define GMX_FJSP_SHUFFLE2(x, y) (((x)<<1) | (y))
  
  #define GMX_FJSP_TRANSPOSE2_V2R8(row0, row1) {           \
-    _fjsp_v2r8 __gmx_t1 = row0;                          \
-    row0           = _fjsp_unpacklo_v2r8(row0,row1);     \
-    row1           = _fjsp_unpackhi_v2r8(__gmx_t1,row1); \
+        _fjsp_v2r8 __gmx_t1 = row0;                          \
+        row0           = _fjsp_unpacklo_v2r8(row0, row1);     \
+        row1           = _fjsp_unpackhi_v2r8(__gmx_t1, row1); \
  }
  
  
  static void
  gmx_fjsp_print_v2r8(const char *s, _fjsp_v2r8 a)
  {
-  double lo,hi;
+    double lo, hi;
  
-  _fjsp_storel_v2r8(&lo,a);
-  _fjsp_storeh_v2r8(&hi,a);
-  printf("%s: %g %g\n",s,lo,hi);
+    _fjsp_storel_v2r8(&lo, a);
+    _fjsp_storeh_v2r8(&hi, a);
+    printf("%s: %g %g\n", s, lo, hi);
  }
  
  
  static _fjsp_v2r8
  gmx_fjsp_set1_v2r8(double d)
  {
-    return _fjsp_set_v2r8(d,d);
+    return _fjsp_set_v2r8(d, d);
  }
  
  static _fjsp_v2r8
@@ -80,10 +80,10 @@ gmx_fjsp_any_lt_v2r8(_fjsp_v2r8 a, _fjsp_v2r8 b)
          long long int    i;
      }
      conv;
-    
-    a = _fjsp_cmplt_v2r8(a,b);
-    a = _fjsp_or_v2r8(a, _fjsp_unpackhi_v2r8(a,a));
-    _fjsp_storel_v2r8(&(conv.d),a);
+
+    a = _fjsp_cmplt_v2r8(a, b);
+    a = _fjsp_or_v2r8(a, _fjsp_unpackhi_v2r8(a, a));
+    _fjsp_storel_v2r8(&(conv.d), a);
      return (conv.i != 0);
  }
  
@@ -93,19 +93,19 @@ gmx_fjsp_invsqrt_v2r8(_fjsp_v2r8 x)
  {
      const _fjsp_v2r8 half  = gmx_fjsp_set1_v2r8(0.5);
      const _fjsp_v2r8 three = gmx_fjsp_set1_v2r8(3.0);
-    _fjsp_v2r8 lu = _fjsp_rsqrta_v2r8(x);
-    
-    lu = _fjsp_mul_v2r8(_fjsp_mul_v2r8(half,lu),_fjsp_nmsub_v2r8(_fjsp_mul_v2r8(lu,lu),x,three));
+    _fjsp_v2r8       lu    = _fjsp_rsqrta_v2r8(x);
+
+    lu = _fjsp_mul_v2r8(_fjsp_mul_v2r8(half, lu), _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(lu, lu), x, three));
      /* The HPC-ACE instruction set is only available in double precision, while
-     * single precision is typically sufficient for Gromacs. If you define 
-     * "GMX_RELAXED_DOUBLE_PRECISION" during compile, we stick to two Newton-Raphson 
-     * iterations and accept 32bits of accuracy in 1.0/sqrt(x) and 1.0/x, rather than full 
+     * single precision is typically sufficient for Gromacs. If you define
+     * "GMX_RELAXED_DOUBLE_PRECISION" during compile, we stick to two Newton-Raphson
+     * iterations and accept 32bits of accuracy in 1.0/sqrt(x) and 1.0/x, rather than full
       * double precision (53 bits). This is still clearly higher than single precision (24 bits).
       */
  #ifndef GMX_RELAXED_DOUBLE_PRECISION
-    lu = _fjsp_mul_v2r8(_fjsp_mul_v2r8(half,lu),_fjsp_nmsub_v2r8(_fjsp_mul_v2r8(lu,lu),x,three));
+    lu = _fjsp_mul_v2r8(_fjsp_mul_v2r8(half, lu), _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(lu, lu), x, three));
  #endif
-    return _fjsp_mul_v2r8(_fjsp_mul_v2r8(half,lu),_fjsp_nmsub_v2r8(_fjsp_mul_v2r8(lu,lu),x,three));
+    return _fjsp_mul_v2r8(_fjsp_mul_v2r8(half, lu), _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(lu, lu), x, three));
  }
  
  
@@ -113,11 +113,11 @@ gmx_fjsp_invsqrt_v2r8(_fjsp_v2r8 x)
  static gmx_inline _fjsp_v2r8
  gmx_fjsp_inv_v2r8(_fjsp_v2r8 x)
  {
-    const _fjsp_v2r8 two  = gmx_fjsp_set1_v2r8(2.0);    
-    __m128d lu = _fjsp_rcpa_v2r8(x);
-    
+    const _fjsp_v2r8 two  = gmx_fjsp_set1_v2r8(2.0);
+    __m128d          lu   = _fjsp_rcpa_v2r8(x);
+
      /* Perform three N-R steps for double precision */
-    lu         = _fjsp_mul_v2r8(lu,_fjsp_nmsub_v2r8(lu,x,two));
+    lu         = _fjsp_mul_v2r8(lu, _fjsp_nmsub_v2r8(lu, x, two));
      /* The HPC-ACE instruction set is only available in double precision, while
       * single precision is typically sufficient for Gromacs. If you define
       * "GMX_RELAXED_DOUBLE_PRECISION" during compile, we stick to two Newton-Raphson
@@ -125,20 +125,20 @@ gmx_fjsp_inv_v2r8(_fjsp_v2r8 x)
       * double precision (53 bits). This is still clearly higher than single precision (24 bits).
       */
  #ifndef GMX_RELAXED_DOUBLE_PRECISION
-    lu         = _fjsp_mul_v2r8(lu,_fjsp_nmsub_v2r8(lu,x,two));
+    lu         = _fjsp_mul_v2r8(lu, _fjsp_nmsub_v2r8(lu, x, two));
  #endif
-    return _fjsp_mul_v2r8(lu,_fjsp_nmsub_v2r8(lu,x,two));
+    return _fjsp_mul_v2r8(lu, _fjsp_nmsub_v2r8(lu, x, two));
  }
  
  
  static gmx_inline _fjsp_v2r8
  gmx_fjsp_calc_rsq_v2r8(_fjsp_v2r8 dx, _fjsp_v2r8 dy, _fjsp_v2r8 dz)
  {
-    return _fjsp_madd_v2r8(dx,dx,_fjsp_madd_v2r8(dy,dy,_fjsp_mul_v2r8(dz,dz)));
+    return _fjsp_madd_v2r8(dx, dx, _fjsp_madd_v2r8(dy, dy, _fjsp_mul_v2r8(dz, dz)));
  }
  
  /* Normal sum of four ymm registers */
-#define gmx_fjsp_sum4_v2r8(t0,t1,t2,t3)  _fjsp_add_v2r8(_fjsp_add_v2r8(t0,t1),_fjsp_add_v2r8(t2,t3))
+#define gmx_fjsp_sum4_v2r8(t0, t1, t2, t3)  _fjsp_add_v2r8(_fjsp_add_v2r8(t0, t1), _fjsp_add_v2r8(t2, t3))
  
  
  
@@ -148,263 +148,263 @@ static _fjsp_v2r8
  gmx_fjsp_load_2real_swizzle_v2r8(const double * gmx_restrict ptrA,
                                   const double * gmx_restrict ptrB)
  {
-    return _fjsp_unpacklo_v2r8(_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA),_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB));
+    return _fjsp_unpacklo_v2r8(_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA), _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB));
  }
  
  static _fjsp_v2r8
  gmx_fjsp_load_1real_v2r8(const double * gmx_restrict ptrA)
  {
-    return _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA);
+    return _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA);
  }
  
  
  static void
  gmx_fjsp_store_2real_swizzle_v2r8(double * gmx_restrict ptrA,
-                                double * gmx_restrict ptrB,
-                                _fjsp_v2r8 xmm1)
+                                  double * gmx_restrict ptrB,
+                                  _fjsp_v2r8            xmm1)
  {
      _fjsp_v2r8 t2;
-    
-    t2       = _fjsp_unpackhi_v2r8(xmm1,xmm1);
-    _fjsp_storel_v2r8(ptrA,xmm1);                                           
-    _fjsp_storel_v2r8(ptrB,t2);                                         
+
+    t2       = _fjsp_unpackhi_v2r8(xmm1, xmm1);
+    _fjsp_storel_v2r8(ptrA, xmm1);
+    _fjsp_storel_v2r8(ptrB, t2);
  }
  
  static void
  gmx_fjsp_store_1real_v2r8(double * gmx_restrict ptrA, _fjsp_v2r8 xmm1)
  {
-    _fjsp_storel_v2r8(ptrA,xmm1);
+    _fjsp_storel_v2r8(ptrA, xmm1);
  }
  
  
  /* Similar to store, but increments value in memory */
  static void
  gmx_fjsp_increment_2real_swizzle_v2r8(double * gmx_restrict ptrA,
-                                    double * gmx_restrict ptrB, _fjsp_v2r8 xmm1)
+                                      double * gmx_restrict ptrB, _fjsp_v2r8 xmm1)
  {
      _fjsp_v2r8 t1;
-    
-    t1   = _fjsp_unpackhi_v2r8(xmm1,xmm1);
-    xmm1 = _fjsp_add_v2r8(xmm1,_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA));
-    t1   = _fjsp_add_v2r8(t1,_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB));
-    _fjsp_storel_v2r8(ptrA,xmm1);
-    _fjsp_storel_v2r8(ptrB,t1);
+
+    t1   = _fjsp_unpackhi_v2r8(xmm1, xmm1);
+    xmm1 = _fjsp_add_v2r8(xmm1, _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA));
+    t1   = _fjsp_add_v2r8(t1, _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB));
+    _fjsp_storel_v2r8(ptrA, xmm1);
+    _fjsp_storel_v2r8(ptrB, t1);
  }
  
  static void
  gmx_fjsp_increment_1real_v2r8(double * gmx_restrict ptrA, _fjsp_v2r8 xmm1)
  {
      _fjsp_v2r8 tmp;
-    
-    tmp = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA);
-    tmp = _fjsp_add_v2r8(tmp,xmm1);
-    _fjsp_storel_v2r8(ptrA,tmp);
+
+    tmp = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA);
+    tmp = _fjsp_add_v2r8(tmp, xmm1);
+    _fjsp_storel_v2r8(ptrA, tmp);
  }
  
  
  
  static gmx_inline void
  gmx_fjsp_load_2pair_swizzle_v2r8(const double * gmx_restrict p1,
-                             const double * gmx_restrict p2,
-                             _fjsp_v2r8 * gmx_restrict c6,
-                             _fjsp_v2r8 * gmx_restrict c12)
+                                 const double * gmx_restrict p2,
+                                 _fjsp_v2r8 * gmx_restrict   c6,
+                                 _fjsp_v2r8 * gmx_restrict   c12)
  {
-    _fjsp_v2r8 t1,t2,t3;
-    
+    _fjsp_v2r8 t1, t2, t3;
+
      /* The c6/c12 array should be aligned */
      t1   = _fjsp_load_v2r8(p1);
      t2   = _fjsp_load_v2r8(p2);
-    *c6  = _fjsp_unpacklo_v2r8(t1,t2);  
-    *c12 = _fjsp_unpackhi_v2r8(t1,t2);                    
+    *c6  = _fjsp_unpacklo_v2r8(t1, t2);
+    *c12 = _fjsp_unpackhi_v2r8(t1, t2);
  }
  
  static gmx_inline void
  gmx_fjsp_load_1pair_swizzle_v2r8(const double * gmx_restrict p1,
-                             _fjsp_v2r8 * gmx_restrict c6,
-                             _fjsp_v2r8 * gmx_restrict c12)
+                                 _fjsp_v2r8 * gmx_restrict   c6,
+                                 _fjsp_v2r8 * gmx_restrict   c12)
  {
-    *c6     = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1);
-    *c12    = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+1);
+    *c6     = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1);
+    *c12    = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+1);
  }
  
  
  static gmx_inline void
  gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(const double * gmx_restrict xyz_shift,
-                                         const double * gmx_restrict xyz,
-                                         _fjsp_v2r8 * gmx_restrict x1,
-                                         _fjsp_v2r8 * gmx_restrict y1,
-                                         _fjsp_v2r8 * gmx_restrict z1)
+                                             const double * gmx_restrict xyz,
+                                             _fjsp_v2r8 * gmx_restrict   x1,
+                                             _fjsp_v2r8 * gmx_restrict   y1,
+                                             _fjsp_v2r8 * gmx_restrict   z1)
  {
-    _fjsp_v2r8 mem_xy,mem_z,mem_sxy,mem_sz;
-    
+    _fjsp_v2r8 mem_xy, mem_z, mem_sxy, mem_sz;
+
      mem_xy  = _fjsp_load_v2r8(xyz);
-    mem_z   = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),xyz+2);
+    mem_z   = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), xyz+2);
      mem_sxy = _fjsp_load_v2r8(xyz_shift);
-    mem_sz  = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),xyz_shift+2);
-    
-    mem_xy  = _fjsp_add_v2r8(mem_xy,mem_sxy);
-    mem_z   = _fjsp_add_v2r8(mem_z,mem_sz);
-    
-    *x1  = _fjsp_shuffle_v2r8(mem_xy,mem_xy,GMX_FJSP_SHUFFLE2(0,0));
-    *y1  = _fjsp_shuffle_v2r8(mem_xy,mem_xy,GMX_FJSP_SHUFFLE2(1,1));
-    *z1  = _fjsp_shuffle_v2r8(mem_z,mem_z,GMX_FJSP_SHUFFLE2(0,0));
+    mem_sz  = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), xyz_shift+2);
+
+    mem_xy  = _fjsp_add_v2r8(mem_xy, mem_sxy);
+    mem_z   = _fjsp_add_v2r8(mem_z, mem_sz);
+
+    *x1  = _fjsp_shuffle_v2r8(mem_xy, mem_xy, GMX_FJSP_SHUFFLE2(0, 0));
+    *y1  = _fjsp_shuffle_v2r8(mem_xy, mem_xy, GMX_FJSP_SHUFFLE2(1, 1));
+    *z1  = _fjsp_shuffle_v2r8(mem_z, mem_z, GMX_FJSP_SHUFFLE2(0, 0));
  }
  
  
  static gmx_inline void
  gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(const double * gmx_restrict xyz_shift,
-                                         const double * gmx_restrict xyz,
-                                         _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
-                                         _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
-                                         _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3)
+                                             const double * gmx_restrict xyz,
+                                             _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
+                                             _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
+                                             _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3)
  {
-    _fjsp_v2r8 t1,t2,t3,t4,t5,sxy,sz,szx,syz;
-    
+    _fjsp_v2r8 t1, t2, t3, t4, t5, sxy, sz, szx, syz;
+
      t1  = _fjsp_load_v2r8(xyz);
      t2  = _fjsp_load_v2r8(xyz+2);
      t3  = _fjsp_load_v2r8(xyz+4);
      t4  = _fjsp_load_v2r8(xyz+6);
-    t5  = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),xyz+8);
-    
+    t5  = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), xyz+8);
+
      sxy = _fjsp_load_v2r8(xyz_shift);
-    sz  = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),xyz_shift+2);
-    szx = _fjsp_shuffle_v2r8(sz,sxy,GMX_FJSP_SHUFFLE2(0,0));
-    syz = _fjsp_shuffle_v2r8(sxy,sz,GMX_FJSP_SHUFFLE2(0,1));
-    
-    t1  = _fjsp_add_v2r8(t1,sxy);
-    t2  = _fjsp_add_v2r8(t2,szx);
-    t3  = _fjsp_add_v2r8(t3,syz);
-    t4  = _fjsp_add_v2r8(t4,sxy);
-    t5  = _fjsp_add_v2r8(t5,sz);
-    
-    *x1  = _fjsp_shuffle_v2r8(t1,t1,GMX_FJSP_SHUFFLE2(0,0));
-    *y1  = _fjsp_shuffle_v2r8(t1,t1,GMX_FJSP_SHUFFLE2(1,1));
-    *z1  = _fjsp_shuffle_v2r8(t2,t2,GMX_FJSP_SHUFFLE2(0,0));
-    *x2  = _fjsp_shuffle_v2r8(t2,t2,GMX_FJSP_SHUFFLE2(1,1));
-    *y2  = _fjsp_shuffle_v2r8(t3,t3,GMX_FJSP_SHUFFLE2(0,0));
-    *z2  = _fjsp_shuffle_v2r8(t3,t3,GMX_FJSP_SHUFFLE2(1,1));
-    *x3  = _fjsp_shuffle_v2r8(t4,t4,GMX_FJSP_SHUFFLE2(0,0));
-    *y3  = _fjsp_shuffle_v2r8(t4,t4,GMX_FJSP_SHUFFLE2(1,1));
-    *z3  = _fjsp_shuffle_v2r8(t5,t5,GMX_FJSP_SHUFFLE2(0,0));
+    sz  = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), xyz_shift+2);
+    szx = _fjsp_shuffle_v2r8(sz, sxy, GMX_FJSP_SHUFFLE2(0, 0));
+    syz = _fjsp_shuffle_v2r8(sxy, sz, GMX_FJSP_SHUFFLE2(0, 1));
+
+    t1  = _fjsp_add_v2r8(t1, sxy);
+    t2  = _fjsp_add_v2r8(t2, szx);
+    t3  = _fjsp_add_v2r8(t3, syz);
+    t4  = _fjsp_add_v2r8(t4, sxy);
+    t5  = _fjsp_add_v2r8(t5, sz);
+
+    *x1  = _fjsp_shuffle_v2r8(t1, t1, GMX_FJSP_SHUFFLE2(0, 0));
+    *y1  = _fjsp_shuffle_v2r8(t1, t1, GMX_FJSP_SHUFFLE2(1, 1));
+    *z1  = _fjsp_shuffle_v2r8(t2, t2, GMX_FJSP_SHUFFLE2(0, 0));
+    *x2  = _fjsp_shuffle_v2r8(t2, t2, GMX_FJSP_SHUFFLE2(1, 1));
+    *y2  = _fjsp_shuffle_v2r8(t3, t3, GMX_FJSP_SHUFFLE2(0, 0));
+    *z2  = _fjsp_shuffle_v2r8(t3, t3, GMX_FJSP_SHUFFLE2(1, 1));
+    *x3  = _fjsp_shuffle_v2r8(t4, t4, GMX_FJSP_SHUFFLE2(0, 0));
+    *y3  = _fjsp_shuffle_v2r8(t4, t4, GMX_FJSP_SHUFFLE2(1, 1));
+    *z3  = _fjsp_shuffle_v2r8(t5, t5, GMX_FJSP_SHUFFLE2(0, 0));
  }
  
  
  static gmx_inline void
  gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(const double * gmx_restrict xyz_shift,
-                                         const double * gmx_restrict xyz,
-                                         _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
-                                         _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
-                                         _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3,
-                                         _fjsp_v2r8 * gmx_restrict x4, _fjsp_v2r8 * gmx_restrict y4, _fjsp_v2r8 * gmx_restrict z4)
-{
-    _fjsp_v2r8 t1,t2,t3,t4,t5,t6,sxy,sz,szx,syz;
-    
+                                             const double * gmx_restrict xyz,
+                                             _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
+                                             _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
+                                             _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3,
+                                             _fjsp_v2r8 * gmx_restrict x4, _fjsp_v2r8 * gmx_restrict y4, _fjsp_v2r8 * gmx_restrict z4)
+{
+    _fjsp_v2r8 t1, t2, t3, t4, t5, t6, sxy, sz, szx, syz;
+
      t1  = _fjsp_load_v2r8(xyz);
      t2  = _fjsp_load_v2r8(xyz+2);
      t3  = _fjsp_load_v2r8(xyz+4);
      t4  = _fjsp_load_v2r8(xyz+6);
      t5  = _fjsp_load_v2r8(xyz+8);
      t6  = _fjsp_load_v2r8(xyz+10);
-    
+
      sxy = _fjsp_load_v2r8(xyz_shift);
-    sz  = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),xyz_shift+2);
-    szx = _fjsp_shuffle_v2r8(sz,sxy,GMX_FJSP_SHUFFLE2(0,0));
-    syz = _fjsp_shuffle_v2r8(sxy,sz,GMX_FJSP_SHUFFLE2(0,1));
-    
-    t1  = _fjsp_add_v2r8(t1,sxy);
-    t2  = _fjsp_add_v2r8(t2,szx);
-    t3  = _fjsp_add_v2r8(t3,syz);
-    t4  = _fjsp_add_v2r8(t4,sxy);
-    t5  = _fjsp_add_v2r8(t5,szx);
-    t6  = _fjsp_add_v2r8(t6,syz);
-    
-    *x1  = _fjsp_shuffle_v2r8(t1,t1,GMX_FJSP_SHUFFLE2(0,0));
-    *y1  = _fjsp_shuffle_v2r8(t1,t1,GMX_FJSP_SHUFFLE2(1,1));
-    *z1  = _fjsp_shuffle_v2r8(t2,t2,GMX_FJSP_SHUFFLE2(0,0));
-    *x2  = _fjsp_shuffle_v2r8(t2,t2,GMX_FJSP_SHUFFLE2(1,1));
-    *y2  = _fjsp_shuffle_v2r8(t3,t3,GMX_FJSP_SHUFFLE2(0,0));
-    *z2  = _fjsp_shuffle_v2r8(t3,t3,GMX_FJSP_SHUFFLE2(1,1));
-    *x3  = _fjsp_shuffle_v2r8(t4,t4,GMX_FJSP_SHUFFLE2(0,0));
-    *y3  = _fjsp_shuffle_v2r8(t4,t4,GMX_FJSP_SHUFFLE2(1,1));
-    *z3  = _fjsp_shuffle_v2r8(t5,t5,GMX_FJSP_SHUFFLE2(0,0));
-    *x4  = _fjsp_shuffle_v2r8(t5,t5,GMX_FJSP_SHUFFLE2(1,1));
-    *y4  = _fjsp_shuffle_v2r8(t6,t6,GMX_FJSP_SHUFFLE2(0,0));
-    *z4  = _fjsp_shuffle_v2r8(t6,t6,GMX_FJSP_SHUFFLE2(1,1));
+    sz  = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), xyz_shift+2);
+    szx = _fjsp_shuffle_v2r8(sz, sxy, GMX_FJSP_SHUFFLE2(0, 0));
+    syz = _fjsp_shuffle_v2r8(sxy, sz, GMX_FJSP_SHUFFLE2(0, 1));
+
+    t1  = _fjsp_add_v2r8(t1, sxy);
+    t2  = _fjsp_add_v2r8(t2, szx);
+    t3  = _fjsp_add_v2r8(t3, syz);
+    t4  = _fjsp_add_v2r8(t4, sxy);
+    t5  = _fjsp_add_v2r8(t5, szx);
+    t6  = _fjsp_add_v2r8(t6, syz);
+
+    *x1  = _fjsp_shuffle_v2r8(t1, t1, GMX_FJSP_SHUFFLE2(0, 0));
+    *y1  = _fjsp_shuffle_v2r8(t1, t1, GMX_FJSP_SHUFFLE2(1, 1));
+    *z1  = _fjsp_shuffle_v2r8(t2, t2, GMX_FJSP_SHUFFLE2(0, 0));
+    *x2  = _fjsp_shuffle_v2r8(t2, t2, GMX_FJSP_SHUFFLE2(1, 1));
+    *y2  = _fjsp_shuffle_v2r8(t3, t3, GMX_FJSP_SHUFFLE2(0, 0));
+    *z2  = _fjsp_shuffle_v2r8(t3, t3, GMX_FJSP_SHUFFLE2(1, 1));
+    *x3  = _fjsp_shuffle_v2r8(t4, t4, GMX_FJSP_SHUFFLE2(0, 0));
+    *y3  = _fjsp_shuffle_v2r8(t4, t4, GMX_FJSP_SHUFFLE2(1, 1));
+    *z3  = _fjsp_shuffle_v2r8(t5, t5, GMX_FJSP_SHUFFLE2(0, 0));
+    *x4  = _fjsp_shuffle_v2r8(t5, t5, GMX_FJSP_SHUFFLE2(1, 1));
+    *y4  = _fjsp_shuffle_v2r8(t6, t6, GMX_FJSP_SHUFFLE2(0, 0));
+    *z4  = _fjsp_shuffle_v2r8(t6, t6, GMX_FJSP_SHUFFLE2(1, 1));
  }
  
  
  
  static gmx_inline void
  gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(const double * gmx_restrict p1,
-                                  _fjsp_v2r8 * gmx_restrict x, _fjsp_v2r8 * gmx_restrict y, _fjsp_v2r8 * gmx_restrict z)
+                                      _fjsp_v2r8 * gmx_restrict x, _fjsp_v2r8 * gmx_restrict y, _fjsp_v2r8 * gmx_restrict z)
  {
-        *x            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1);
-     *y            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+1);
-     *z            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+2);
+    *x            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1);
+    *y            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+1);
+    *z            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+2);
  }
  
  static gmx_inline void
  gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(const double * gmx_restrict p1,
-                                  _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
-                                  _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
-                                  _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3)
+                                      _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
+                                      _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
+                                      _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3)
  {
-        *x1            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1);
-     *y1            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+1);
-     *z1            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+2);
-        *x2            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+3);
-     *y2            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+4);
-     *z2            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+5);
-        *x3            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+6);
-     *y3            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+7);
-     *z3            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+8);
+    *x1            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1);
+    *y1            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+1);
+    *z1            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+2);
+    *x2            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+3);
+    *y2            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+4);
+    *z2            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+5);
+    *x3            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+6);
+    *y3            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+7);
+    *z3            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+8);
  }
  
  static gmx_inline void
  gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(const double * gmx_restrict p1,
-                                  _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
-                                  _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
-                                  _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3,
-                                  _fjsp_v2r8 * gmx_restrict x4, _fjsp_v2r8 * gmx_restrict y4, _fjsp_v2r8 * gmx_restrict z4)
-{
-    *x1            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1);
-    *y1            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+1);
-    *z1            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+2);
-    *x2            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+3);
-    *y2            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+4);
-    *z2            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+5);
-    *x3            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+6);
-    *y3            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+7);
-    *z3            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+8);
-    *x4            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+9);
-    *y4            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+10);
-    *z4            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+11);
+                                      _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
+                                      _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
+                                      _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3,
+                                      _fjsp_v2r8 * gmx_restrict x4, _fjsp_v2r8 * gmx_restrict y4, _fjsp_v2r8 * gmx_restrict z4)
+{
+    *x1            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1);
+    *y1            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+1);
+    *z1            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+2);
+    *x2            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+3);
+    *y2            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+4);
+    *z2            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+5);
+    *x3            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+6);
+    *y3            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+7);
+    *z3            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+8);
+    *x4            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+9);
+    *y4            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+10);
+    *z4            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+11);
  }
  
  
  static gmx_inline void
  gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(const double * gmx_restrict ptrA,
-                                  const double * gmx_restrict ptrB,
-                                  _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1)
+                                      const double * gmx_restrict ptrB,
+                                      _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1)
  {
-    _fjsp_v2r8 t1,t2,t3,t4;
+    _fjsp_v2r8 t1, t2, t3, t4;
      t1           = _fjsp_load_v2r8(ptrA);
      t2           = _fjsp_load_v2r8(ptrB);
-    t3           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+2);
-    t4           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB+2);
-    GMX_FJSP_TRANSPOSE2_V2R8(t1,t2);
+    t3           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+2);
+    t4           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB+2);
+    GMX_FJSP_TRANSPOSE2_V2R8(t1, t2);
      *x1          = t1;
      *y1          = t2;
-    *z1          = _fjsp_unpacklo_v2r8(t3,t4);
+    *z1          = _fjsp_unpacklo_v2r8(t3, t4);
  }
  
  static gmx_inline void
  gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
-                                  _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
-                                  _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
-                                  _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3)
+                                      _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
+                                      _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
+                                      _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3)
  {
-_fjsp_v2r8 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
+    _fjsp_v2r8 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
      t1           = _fjsp_load_v2r8(ptrA);
      t2           = _fjsp_load_v2r8(ptrB);
      t3           = _fjsp_load_v2r8(ptrA+2);
@@ -413,12 +413,12 @@ _fjsp_v2r8 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
      t6           = _fjsp_load_v2r8(ptrB+4);
      t7           = _fjsp_load_v2r8(ptrA+6);
      t8           = _fjsp_load_v2r8(ptrB+6);
-    t9           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+8);
-    t10          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB+8);
-    GMX_FJSP_TRANSPOSE2_V2R8(t1,t2);
-    GMX_FJSP_TRANSPOSE2_V2R8(t3,t4);
-    GMX_FJSP_TRANSPOSE2_V2R8(t5,t6);
-    GMX_FJSP_TRANSPOSE2_V2R8(t7,t8);
+    t9           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+8);
+    t10          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB+8);
+    GMX_FJSP_TRANSPOSE2_V2R8(t1, t2);
+    GMX_FJSP_TRANSPOSE2_V2R8(t3, t4);
+    GMX_FJSP_TRANSPOSE2_V2R8(t5, t6);
+    GMX_FJSP_TRANSPOSE2_V2R8(t7, t8);
      *x1          = t1;
      *y1          = t2;
      *z1          = t3;
@@ -427,27 +427,27 @@ _fjsp_v2r8 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
      *z2          = t6;
      *x3          = t7;
      *y3          = t8;
-    *z3          = _fjsp_unpacklo_v2r8(t9,t10);
+    *z3          = _fjsp_unpacklo_v2r8(t9, t10);
  }
  
  
  static gmx_inline void
  gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
-                                  _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
-                                  _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
-                                  _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3,
-                                  _fjsp_v2r8 * gmx_restrict x4, _fjsp_v2r8 * gmx_restrict y4, _fjsp_v2r8 * gmx_restrict z4)
+                                      _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
+                                      _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
+                                      _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3,
+                                      _fjsp_v2r8 * gmx_restrict x4, _fjsp_v2r8 * gmx_restrict y4, _fjsp_v2r8 * gmx_restrict z4)
  {
-    _fjsp_v2r8 t1,t2,t3,t4,t5,t6;
+    _fjsp_v2r8 t1, t2, t3, t4, t5, t6;
      t1           = _fjsp_load_v2r8(ptrA);
      t2           = _fjsp_load_v2r8(ptrB);
      t3           = _fjsp_load_v2r8(ptrA+2);
      t4           = _fjsp_load_v2r8(ptrB+2);
      t5           = _fjsp_load_v2r8(ptrA+4);
      t6           = _fjsp_load_v2r8(ptrB+4);
-    GMX_FJSP_TRANSPOSE2_V2R8(t1,t2);
-    GMX_FJSP_TRANSPOSE2_V2R8(t3,t4);
-    GMX_FJSP_TRANSPOSE2_V2R8(t5,t6);
+    GMX_FJSP_TRANSPOSE2_V2R8(t1, t2);
+    GMX_FJSP_TRANSPOSE2_V2R8(t3, t4);
+    GMX_FJSP_TRANSPOSE2_V2R8(t5, t6);
      *x1          = t1;
      *y1          = t2;
      *z1          = t3;
@@ -460,9 +460,9 @@ gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(const double * gmx_restrict ptrA, const do
      t4           = _fjsp_load_v2r8(ptrB+8);
      t5           = _fjsp_load_v2r8(ptrA+10);
      t6           = _fjsp_load_v2r8(ptrB+10);
-    GMX_FJSP_TRANSPOSE2_V2R8(t1,t2);
-    GMX_FJSP_TRANSPOSE2_V2R8(t3,t4);
-    GMX_FJSP_TRANSPOSE2_V2R8(t5,t6);
+    GMX_FJSP_TRANSPOSE2_V2R8(t1, t2);
+    GMX_FJSP_TRANSPOSE2_V2R8(t3, t4);
+    GMX_FJSP_TRANSPOSE2_V2R8(t5, t6);
      *x3          = t1;
      *y3          = t2;
      *z3          = t3;
@@ -474,250 +474,250 @@ gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(const double * gmx_restrict ptrA, const do
  
  static void
  gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA,
-                                       _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1)
-{
-    _fjsp_v2r8 t1,t2,t3;
-    
-    t1           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA);
-    t2           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+1);
-    t3           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+2);
-    
-    t1           = _fjsp_sub_v2r8(t1,x1);
-    t2           = _fjsp_sub_v2r8(t2,y1);
-    t3           = _fjsp_sub_v2r8(t3,z1);
-    _fjsp_storel_v2r8(ptrA,t1);
-    _fjsp_storel_v2r8(ptrA+1,t2);
-    _fjsp_storel_v2r8(ptrA+2,t3);
+                                           _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1)
+{
+    _fjsp_v2r8 t1, t2, t3;
+
+    t1           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA);
+    t2           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+1);
+    t3           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+2);
+
+    t1           = _fjsp_sub_v2r8(t1, x1);
+    t2           = _fjsp_sub_v2r8(t2, y1);
+    t3           = _fjsp_sub_v2r8(t3, z1);
+    _fjsp_storel_v2r8(ptrA, t1);
+    _fjsp_storel_v2r8(ptrA+1, t2);
+    _fjsp_storel_v2r8(ptrA+2, t3);
  }
  
  static void
  gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA, _fjsp_v2r8 fscal,
-                                          _fjsp_v2r8 dx1, _fjsp_v2r8 dy1, _fjsp_v2r8 dz1)
+                                               _fjsp_v2r8 dx1, _fjsp_v2r8 dy1, _fjsp_v2r8 dz1)
  {
-  _fjsp_v2r8 t1,t2,t3;
+    _fjsp_v2r8 t1, t2, t3;
  
-  t1           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA);
-  t2           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+1);
-  t3           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+2);
+    t1           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA);
+    t2           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+1);
+    t3           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+2);
  
-  t1           = _fjsp_nmsub_v2r8(fscal,dx1,t1);
-  t2           = _fjsp_nmsub_v2r8(fscal,dy1,t2);
-  t3           = _fjsp_nmsub_v2r8(fscal,dz1,t3);
-  _fjsp_storel_v2r8(ptrA,t1);
-  _fjsp_storel_v2r8(ptrA+1,t2);
-  _fjsp_storel_v2r8(ptrA+2,t3);
+    t1           = _fjsp_nmsub_v2r8(fscal, dx1, t1);
+    t2           = _fjsp_nmsub_v2r8(fscal, dy1, t2);
+    t3           = _fjsp_nmsub_v2r8(fscal, dz1, t3);
+    _fjsp_storel_v2r8(ptrA, t1);
+    _fjsp_storel_v2r8(ptrA+1, t2);
+    _fjsp_storel_v2r8(ptrA+2, t3);
  }
  
  
  static void
  gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA,
-                                       _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
-                                       _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
-                                       _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3) 
+                                           _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
+                                           _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
+                                           _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3)
  {
-    _fjsp_v2r8 t1,t2,t3,t4,t5;
-    
+    _fjsp_v2r8 t1, t2, t3, t4, t5;
+
      t1          = _fjsp_load_v2r8(ptrA);
      t2          = _fjsp_load_v2r8(ptrA+2);
      t3          = _fjsp_load_v2r8(ptrA+4);
      t4          = _fjsp_load_v2r8(ptrA+6);
-    t5          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+8);
-    
-    x1          = _fjsp_unpacklo_v2r8(x1,y1);
-    z1          = _fjsp_unpacklo_v2r8(z1,x2);
-    y2          = _fjsp_unpacklo_v2r8(y2,z2);
-    x3          = _fjsp_unpacklo_v2r8(x3,y3);
+    t5          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+8);
+
+    x1          = _fjsp_unpacklo_v2r8(x1, y1);
+    z1          = _fjsp_unpacklo_v2r8(z1, x2);
+    y2          = _fjsp_unpacklo_v2r8(y2, z2);
+    x3          = _fjsp_unpacklo_v2r8(x3, y3);
      /* nothing to be done for z3 */
-    
-    t1          = _fjsp_sub_v2r8(t1,x1);
-    t2          = _fjsp_sub_v2r8(t2,z1);
-    t3          = _fjsp_sub_v2r8(t3,y2);
-    t4          = _fjsp_sub_v2r8(t4,x3);
-    t5          = _fjsp_sub_v2r8(t5,z3);
-    _fjsp_storel_v2r8(ptrA,t1);
-    _fjsp_storeh_v2r8(ptrA+1,t1);
-    _fjsp_storel_v2r8(ptrA+2,t2);
-    _fjsp_storeh_v2r8(ptrA+3,t2);
-    _fjsp_storel_v2r8(ptrA+4,t3);
-    _fjsp_storeh_v2r8(ptrA+5,t3);
-    _fjsp_storel_v2r8(ptrA+6,t4);
-    _fjsp_storeh_v2r8(ptrA+7,t4);
-    _fjsp_storel_v2r8(ptrA+8,t5);
+
+    t1          = _fjsp_sub_v2r8(t1, x1);
+    t2          = _fjsp_sub_v2r8(t2, z1);
+    t3          = _fjsp_sub_v2r8(t3, y2);
+    t4          = _fjsp_sub_v2r8(t4, x3);
+    t5          = _fjsp_sub_v2r8(t5, z3);
+    _fjsp_storel_v2r8(ptrA, t1);
+    _fjsp_storeh_v2r8(ptrA+1, t1);
+    _fjsp_storel_v2r8(ptrA+2, t2);
+    _fjsp_storeh_v2r8(ptrA+3, t2);
+    _fjsp_storel_v2r8(ptrA+4, t3);
+    _fjsp_storeh_v2r8(ptrA+5, t3);
+    _fjsp_storel_v2r8(ptrA+6, t4);
+    _fjsp_storeh_v2r8(ptrA+7, t4);
+    _fjsp_storel_v2r8(ptrA+8, t5);
  }
  
  
  static void
  gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA,
-                                       _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
-                                       _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
-                                       _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3,
-                                       _fjsp_v2r8 x4, _fjsp_v2r8 y4, _fjsp_v2r8 z4) 
+                                           _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
+                                           _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
+                                           _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3,
+                                           _fjsp_v2r8 x4, _fjsp_v2r8 y4, _fjsp_v2r8 z4)
  {
-    _fjsp_v2r8 t1,t2,t3,t4,t5,t6;
-    
+    _fjsp_v2r8 t1, t2, t3, t4, t5, t6;
+
      t1          = _fjsp_load_v2r8(ptrA);
      t2          = _fjsp_load_v2r8(ptrA+2);
      t3          = _fjsp_load_v2r8(ptrA+4);
      t4          = _fjsp_load_v2r8(ptrA+6);
      t5          = _fjsp_load_v2r8(ptrA+8);
      t6          = _fjsp_load_v2r8(ptrA+10);
-    
-    x1          = _fjsp_unpacklo_v2r8(x1,y1);
-    z1          = _fjsp_unpacklo_v2r8(z1,x2);
-    y2          = _fjsp_unpacklo_v2r8(y2,z2);
-    x3          = _fjsp_unpacklo_v2r8(x3,y3);
-    z3          = _fjsp_unpacklo_v2r8(z3,x4);
-    y4          = _fjsp_unpacklo_v2r8(y4,z4);
-    
-    _fjsp_storel_v2r8(ptrA,    _fjsp_sub_v2r8( t1,x1 ));
-    _fjsp_storeh_v2r8(ptrA+1,  _fjsp_sub_v2r8( t1,x1 ));
-    _fjsp_storel_v2r8(ptrA+2,  _fjsp_sub_v2r8( t2,z1 ));
-    _fjsp_storeh_v2r8(ptrA+3,  _fjsp_sub_v2r8( t2,z1 ));
-    _fjsp_storel_v2r8(ptrA+4,  _fjsp_sub_v2r8( t3,y2 ));
-    _fjsp_storeh_v2r8(ptrA+5,  _fjsp_sub_v2r8( t3,y2 ));
-    _fjsp_storel_v2r8(ptrA+6,  _fjsp_sub_v2r8( t4,x3 ));
-    _fjsp_storeh_v2r8(ptrA+7,  _fjsp_sub_v2r8( t4,x3 ));
-    _fjsp_storel_v2r8(ptrA+8,  _fjsp_sub_v2r8( t5,z3 ));
-    _fjsp_storeh_v2r8(ptrA+9,  _fjsp_sub_v2r8( t5,z3 ));
-    _fjsp_storel_v2r8(ptrA+10, _fjsp_sub_v2r8( t6,y4 ));
-    _fjsp_storeh_v2r8(ptrA+11, _fjsp_sub_v2r8( t6,y4 ));
+
+    x1          = _fjsp_unpacklo_v2r8(x1, y1);
+    z1          = _fjsp_unpacklo_v2r8(z1, x2);
+    y2          = _fjsp_unpacklo_v2r8(y2, z2);
+    x3          = _fjsp_unpacklo_v2r8(x3, y3);
+    z3          = _fjsp_unpacklo_v2r8(z3, x4);
+    y4          = _fjsp_unpacklo_v2r8(y4, z4);
+
+    _fjsp_storel_v2r8(ptrA,    _fjsp_sub_v2r8( t1, x1 ));
+    _fjsp_storeh_v2r8(ptrA+1,  _fjsp_sub_v2r8( t1, x1 ));
+    _fjsp_storel_v2r8(ptrA+2,  _fjsp_sub_v2r8( t2, z1 ));
+    _fjsp_storeh_v2r8(ptrA+3,  _fjsp_sub_v2r8( t2, z1 ));
+    _fjsp_storel_v2r8(ptrA+4,  _fjsp_sub_v2r8( t3, y2 ));
+    _fjsp_storeh_v2r8(ptrA+5,  _fjsp_sub_v2r8( t3, y2 ));
+    _fjsp_storel_v2r8(ptrA+6,  _fjsp_sub_v2r8( t4, x3 ));
+    _fjsp_storeh_v2r8(ptrA+7,  _fjsp_sub_v2r8( t4, x3 ));
+    _fjsp_storel_v2r8(ptrA+8,  _fjsp_sub_v2r8( t5, z3 ));
+    _fjsp_storeh_v2r8(ptrA+9,  _fjsp_sub_v2r8( t5, z3 ));
+    _fjsp_storel_v2r8(ptrA+10, _fjsp_sub_v2r8( t6, y4 ));
+    _fjsp_storeh_v2r8(ptrA+11, _fjsp_sub_v2r8( t6, y4 ));
  }
  
  static void
  gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
-                                          _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1)
-{
-  _fjsp_v2r8 t1,t2,t3,t4,t5,t6,t7;
-    
-  t1          = _fjsp_load_v2r8(ptrA);
-  t2          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+2);
-  t3          = _fjsp_load_v2r8(ptrB);
-  t4          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB+2);
-    
-  t5          = _fjsp_unpacklo_v2r8(x1,y1);
-  t6          = _fjsp_unpackhi_v2r8(x1,y1);
-  t7          = _fjsp_unpackhi_v2r8(z1,z1);
-    
-  t1          = _fjsp_sub_v2r8(t1,t5);
-  t2          = _fjsp_sub_v2r8(t2,z1);
-    
-  t3          = _fjsp_sub_v2r8(t3,t6);
-  t4          = _fjsp_sub_v2r8(t4,t7);
-    
-  _fjsp_storel_v2r8(ptrA,t1);
-  _fjsp_storeh_v2r8(ptrA+1,t1);
-  _fjsp_storel_v2r8(ptrA+2,t2);
-  _fjsp_storel_v2r8(ptrB,t3);
-  _fjsp_storeh_v2r8(ptrB+1,t3);
-  _fjsp_storel_v2r8(ptrB+2,t4);
+                                           _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1)
+{
+    _fjsp_v2r8 t1, t2, t3, t4, t5, t6, t7;
+
+    t1          = _fjsp_load_v2r8(ptrA);
+    t2          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+2);
+    t3          = _fjsp_load_v2r8(ptrB);
+    t4          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB+2);
+
+    t5          = _fjsp_unpacklo_v2r8(x1, y1);
+    t6          = _fjsp_unpackhi_v2r8(x1, y1);
+    t7          = _fjsp_unpackhi_v2r8(z1, z1);
+
+    t1          = _fjsp_sub_v2r8(t1, t5);
+    t2          = _fjsp_sub_v2r8(t2, z1);
+
+    t3          = _fjsp_sub_v2r8(t3, t6);
+    t4          = _fjsp_sub_v2r8(t4, t7);
+
+    _fjsp_storel_v2r8(ptrA, t1);
+    _fjsp_storeh_v2r8(ptrA+1, t1);
+    _fjsp_storel_v2r8(ptrA+2, t2);
+    _fjsp_storel_v2r8(ptrB, t3);
+    _fjsp_storeh_v2r8(ptrB+1, t3);
+    _fjsp_storel_v2r8(ptrB+2, t4);
  }
  
  
  static void
  gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
-                                              _fjsp_v2r8 fscal, _fjsp_v2r8 dx1, _fjsp_v2r8 dy1, _fjsp_v2r8 dz1)
+                                               _fjsp_v2r8 fscal, _fjsp_v2r8 dx1, _fjsp_v2r8 dy1, _fjsp_v2r8 dz1)
  {
-  _fjsp_v2r8 t1,t2,t3,t4,t5,t6,t7,fscalA,fscalB;
-    
+    _fjsp_v2r8 t1, t2, t3, t4, t5, t6, t7, fscalA, fscalB;
+
      t1          = _fjsp_load_v2r8(ptrA);
-    t2          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+2);
+    t2          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+2);
      t3          = _fjsp_load_v2r8(ptrB);
-    t4          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB+2);
-    fscalA      = _fjsp_unpacklo_v2r8(fscal,fscal);
-    fscalB      = _fjsp_unpackhi_v2r8(fscal,fscal);
-    
-    t5          = _fjsp_unpacklo_v2r8(dx1,dy1);
-    t6          = _fjsp_unpackhi_v2r8(dx1,dy1);
-    t7          = _fjsp_unpackhi_v2r8(dz1,dz1);
-    
-    t1          = _fjsp_nmsub_v2r8(fscalA,t5,t1);
-    t2          = _fjsp_nmsub_v2r8(fscalA,dz1,t2);
-    
-    t3          = _fjsp_nmsub_v2r8(fscalB,t6,t3);
-    t4          = _fjsp_nmsub_v2r8(fscalB,t7,t4);
-    
-    _fjsp_storel_v2r8(ptrA,t1);
-    _fjsp_storeh_v2r8(ptrA+1,t1);
-    _fjsp_storel_v2r8(ptrA+2,t2);
-    _fjsp_storel_v2r8(ptrB,t3);
-    _fjsp_storeh_v2r8(ptrB+1,t3);
-    _fjsp_storel_v2r8(ptrB+2,t4);
+    t4          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB+2);
+    fscalA      = _fjsp_unpacklo_v2r8(fscal, fscal);
+    fscalB      = _fjsp_unpackhi_v2r8(fscal, fscal);
+
+    t5          = _fjsp_unpacklo_v2r8(dx1, dy1);
+    t6          = _fjsp_unpackhi_v2r8(dx1, dy1);
+    t7          = _fjsp_unpackhi_v2r8(dz1, dz1);
+
+    t1          = _fjsp_nmsub_v2r8(fscalA, t5, t1);
+    t2          = _fjsp_nmsub_v2r8(fscalA, dz1, t2);
+
+    t3          = _fjsp_nmsub_v2r8(fscalB, t6, t3);
+    t4          = _fjsp_nmsub_v2r8(fscalB, t7, t4);
+
+    _fjsp_storel_v2r8(ptrA, t1);
+    _fjsp_storeh_v2r8(ptrA+1, t1);
+    _fjsp_storel_v2r8(ptrA+2, t2);
+    _fjsp_storel_v2r8(ptrB, t3);
+    _fjsp_storeh_v2r8(ptrB+1, t3);
+    _fjsp_storel_v2r8(ptrB+2, t4);
  }
  
  
  static void
  gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
-                                       _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
-                                       _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
-                                       _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3) 
+                                           _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
+                                           _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
+                                           _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3)
  {
-    _fjsp_v2r8 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
-    _fjsp_v2r8 tA,tB,tC,tD,tE,tF,tG,tH,tI;
-    
+    _fjsp_v2r8 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
+    _fjsp_v2r8 tA, tB, tC, tD, tE, tF, tG, tH, tI;
+
      t1          = _fjsp_load_v2r8(ptrA);
      t2          = _fjsp_load_v2r8(ptrA+2);
      t3          = _fjsp_load_v2r8(ptrA+4);
      t4          = _fjsp_load_v2r8(ptrA+6);
-    t5          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+8);
+    t5          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+8);
      t6          = _fjsp_load_v2r8(ptrB);
      t7          = _fjsp_load_v2r8(ptrB+2);
      t8          = _fjsp_load_v2r8(ptrB+4);
      t9          = _fjsp_load_v2r8(ptrB+6);
-    t10         = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB+8);
-    
-    tA          = _fjsp_unpacklo_v2r8(x1,y1);
-    tB          = _fjsp_unpackhi_v2r8(x1,y1);
-    tC          = _fjsp_unpacklo_v2r8(z1,x2);
-    tD          = _fjsp_unpackhi_v2r8(z1,x2);
-    tE          = _fjsp_unpacklo_v2r8(y2,z2);
-    tF          = _fjsp_unpackhi_v2r8(y2,z2);
-    tG          = _fjsp_unpacklo_v2r8(x3,y3);
-    tH          = _fjsp_unpackhi_v2r8(x3,y3);
-    tI          = _fjsp_unpackhi_v2r8(z3,z3);
-    
-    t1          = _fjsp_sub_v2r8(t1,tA);
-    t2          = _fjsp_sub_v2r8(t2,tC);
-    t3          = _fjsp_sub_v2r8(t3,tE);
-    t4          = _fjsp_sub_v2r8(t4,tG);
-    t5          = _fjsp_sub_v2r8(t5,z3);
-    
-    t6          = _fjsp_sub_v2r8(t6,tB);
-    t7          = _fjsp_sub_v2r8(t7,tD);
-    t8          = _fjsp_sub_v2r8(t8,tF);
-    t9          = _fjsp_sub_v2r8(t9,tH);
-    t10         = _fjsp_sub_v2r8(t10,tI);
-    
-    _fjsp_storel_v2r8(ptrA,t1);
-    _fjsp_storeh_v2r8(ptrA+1,t1);
-    _fjsp_storel_v2r8(ptrA+2,t2);
-    _fjsp_storeh_v2r8(ptrA+3,t2);
-    _fjsp_storel_v2r8(ptrA+4,t3);
-    _fjsp_storeh_v2r8(ptrA+5,t3);
-    _fjsp_storel_v2r8(ptrA+6,t4);
-    _fjsp_storeh_v2r8(ptrA+7,t4);
-    _fjsp_storel_v2r8(ptrA+8,t5);
-    _fjsp_storel_v2r8(ptrB,t6);
-    _fjsp_storeh_v2r8(ptrB+1,t6);
-    _fjsp_storel_v2r8(ptrB+2,t7);
-    _fjsp_storeh_v2r8(ptrB+3,t7);
-    _fjsp_storel_v2r8(ptrB+4,t8);
-    _fjsp_storeh_v2r8(ptrB+5,t8);
-    _fjsp_storel_v2r8(ptrB+6,t9);
-    _fjsp_storeh_v2r8(ptrB+7,t9);
-    _fjsp_storel_v2r8(ptrB+8,t10);
+    t10         = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB+8);
+
+    tA          = _fjsp_unpacklo_v2r8(x1, y1);
+    tB          = _fjsp_unpackhi_v2r8(x1, y1);
+    tC          = _fjsp_unpacklo_v2r8(z1, x2);
+    tD          = _fjsp_unpackhi_v2r8(z1, x2);
+    tE          = _fjsp_unpacklo_v2r8(y2, z2);
+    tF          = _fjsp_unpackhi_v2r8(y2, z2);
+    tG          = _fjsp_unpacklo_v2r8(x3, y3);
+    tH          = _fjsp_unpackhi_v2r8(x3, y3);
+    tI          = _fjsp_unpackhi_v2r8(z3, z3);
+
+    t1          = _fjsp_sub_v2r8(t1, tA);
+    t2          = _fjsp_sub_v2r8(t2, tC);
+    t3          = _fjsp_sub_v2r8(t3, tE);
+    t4          = _fjsp_sub_v2r8(t4, tG);
+    t5          = _fjsp_sub_v2r8(t5, z3);
+
+    t6          = _fjsp_sub_v2r8(t6, tB);
+    t7          = _fjsp_sub_v2r8(t7, tD);
+    t8          = _fjsp_sub_v2r8(t8, tF);
+    t9          = _fjsp_sub_v2r8(t9, tH);
+    t10         = _fjsp_sub_v2r8(t10, tI);
+
+    _fjsp_storel_v2r8(ptrA, t1);
+    _fjsp_storeh_v2r8(ptrA+1, t1);
+    _fjsp_storel_v2r8(ptrA+2, t2);
+    _fjsp_storeh_v2r8(ptrA+3, t2);
+    _fjsp_storel_v2r8(ptrA+4, t3);
+    _fjsp_storeh_v2r8(ptrA+5, t3);
+    _fjsp_storel_v2r8(ptrA+6, t4);
+    _fjsp_storeh_v2r8(ptrA+7, t4);
+    _fjsp_storel_v2r8(ptrA+8, t5);
+    _fjsp_storel_v2r8(ptrB, t6);
+    _fjsp_storeh_v2r8(ptrB+1, t6);
+    _fjsp_storel_v2r8(ptrB+2, t7);
+    _fjsp_storeh_v2r8(ptrB+3, t7);
+    _fjsp_storel_v2r8(ptrB+4, t8);
+    _fjsp_storeh_v2r8(ptrB+5, t8);
+    _fjsp_storel_v2r8(ptrB+6, t9);
+    _fjsp_storeh_v2r8(ptrB+7, t9);
+    _fjsp_storel_v2r8(ptrB+8, t10);
  }
  
  
  static void
  gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
-                                       _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
-                                       _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
-                                       _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3,
-                                       _fjsp_v2r8 x4, _fjsp_v2r8 y4, _fjsp_v2r8 z4) 
-{
-    _fjsp_v2r8 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
-    _fjsp_v2r8 tA,tB,tC,tD,tE,tF,tG,tH,tI,tJ,tK,tL;
-    
+                                           _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
+                                           _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
+                                           _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3,
+                                           _fjsp_v2r8 x4, _fjsp_v2r8 y4, _fjsp_v2r8 z4)
+{
+    _fjsp_v2r8 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;
+    _fjsp_v2r8 tA, tB, tC, tD, tE, tF, tG, tH, tI, tJ, tK, tL;
+
      t1          = _fjsp_load_v2r8(ptrA);
      t2          = _fjsp_load_v2r8(ptrA+2);
      t3          = _fjsp_load_v2r8(ptrA+4);
@@ -730,112 +730,112 @@ gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double *
      t10         = _fjsp_load_v2r8(ptrB+6);
      t11         = _fjsp_load_v2r8(ptrB+8);
      t12         = _fjsp_load_v2r8(ptrB+10);
-    
-    tA          = _fjsp_unpacklo_v2r8(x1,y1);
-    tB          = _fjsp_unpackhi_v2r8(x1,y1);
-    tC          = _fjsp_unpacklo_v2r8(z1,x2);
-    tD          = _fjsp_unpackhi_v2r8(z1,x2);
-    tE          = _fjsp_unpacklo_v2r8(y2,z2);
-    tF          = _fjsp_unpackhi_v2r8(y2,z2);
-    tG          = _fjsp_unpacklo_v2r8(x3,y3);
-    tH          = _fjsp_unpackhi_v2r8(x3,y3);
-    tI          = _fjsp_unpacklo_v2r8(z3,x4);
-    tJ          = _fjsp_unpackhi_v2r8(z3,x4);
-    tK          = _fjsp_unpacklo_v2r8(y4,z4);
-    tL          = _fjsp_unpackhi_v2r8(y4,z4);
-    
-    t1          = _fjsp_sub_v2r8(t1,tA);
-    t2          = _fjsp_sub_v2r8(t2,tC);
-    t3          = _fjsp_sub_v2r8(t3,tE);
-    t4          = _fjsp_sub_v2r8(t4,tG);
-    t5          = _fjsp_sub_v2r8(t5,tI);
-    t6          = _fjsp_sub_v2r8(t6,tK);
-    
-    t7          = _fjsp_sub_v2r8(t7,tB);
-    t8          = _fjsp_sub_v2r8(t8,tD);
-    t9          = _fjsp_sub_v2r8(t9,tF);
-    t10         = _fjsp_sub_v2r8(t10,tH);
-    t11         = _fjsp_sub_v2r8(t11,tJ);
-    t12         = _fjsp_sub_v2r8(t12,tL);
-    
+
+    tA          = _fjsp_unpacklo_v2r8(x1, y1);
+    tB          = _fjsp_unpackhi_v2r8(x1, y1);
+    tC          = _fjsp_unpacklo_v2r8(z1, x2);
+    tD          = _fjsp_unpackhi_v2r8(z1, x2);
+    tE          = _fjsp_unpacklo_v2r8(y2, z2);
+    tF          = _fjsp_unpackhi_v2r8(y2, z2);
+    tG          = _fjsp_unpacklo_v2r8(x3, y3);
+    tH          = _fjsp_unpackhi_v2r8(x3, y3);
+    tI          = _fjsp_unpacklo_v2r8(z3, x4);
+    tJ          = _fjsp_unpackhi_v2r8(z3, x4);
+    tK          = _fjsp_unpacklo_v2r8(y4, z4);
+    tL          = _fjsp_unpackhi_v2r8(y4, z4);
+
+    t1          = _fjsp_sub_v2r8(t1, tA);
+    t2          = _fjsp_sub_v2r8(t2, tC);
+    t3          = _fjsp_sub_v2r8(t3, tE);
+    t4          = _fjsp_sub_v2r8(t4, tG);
+    t5          = _fjsp_sub_v2r8(t5, tI);
+    t6          = _fjsp_sub_v2r8(t6, tK);
+
+    t7          = _fjsp_sub_v2r8(t7, tB);
+    t8          = _fjsp_sub_v2r8(t8, tD);
+    t9          = _fjsp_sub_v2r8(t9, tF);
+    t10         = _fjsp_sub_v2r8(t10, tH);
+    t11         = _fjsp_sub_v2r8(t11, tJ);
+    t12         = _fjsp_sub_v2r8(t12, tL);
+
      _fjsp_storel_v2r8(ptrA,  t1);
-    _fjsp_storeh_v2r8(ptrA+1,t1);
-    _fjsp_storel_v2r8(ptrA+2,t2);
-    _fjsp_storeh_v2r8(ptrA+3,t2);
-    _fjsp_storel_v2r8(ptrA+4,t3);
-    _fjsp_storeh_v2r8(ptrA+5,t3);
-    _fjsp_storel_v2r8(ptrA+6,t4);
-    _fjsp_storeh_v2r8(ptrA+7,t4);
-    _fjsp_storel_v2r8(ptrA+8,t5);
-    _fjsp_storeh_v2r8(ptrA+9,t5);
-    _fjsp_storel_v2r8(ptrA+10,t6);
-    _fjsp_storeh_v2r8(ptrA+11,t6);
+    _fjsp_storeh_v2r8(ptrA+1, t1);
+    _fjsp_storel_v2r8(ptrA+2, t2);
+    _fjsp_storeh_v2r8(ptrA+3, t2);
+    _fjsp_storel_v2r8(ptrA+4, t3);
+    _fjsp_storeh_v2r8(ptrA+5, t3);
+    _fjsp_storel_v2r8(ptrA+6, t4);
+    _fjsp_storeh_v2r8(ptrA+7, t4);
+    _fjsp_storel_v2r8(ptrA+8, t5);
+    _fjsp_storeh_v2r8(ptrA+9, t5);
+    _fjsp_storel_v2r8(ptrA+10, t6);
+    _fjsp_storeh_v2r8(ptrA+11, t6);
      _fjsp_storel_v2r8(ptrB,  t7);
-    _fjsp_storeh_v2r8(ptrB+1,t7);
-    _fjsp_storel_v2r8(ptrB+2,t8);
-    _fjsp_storeh_v2r8(ptrB+3,t8);
-    _fjsp_storel_v2r8(ptrB+4,t9);
-    _fjsp_storeh_v2r8(ptrB+5,t9);
-    _fjsp_storel_v2r8(ptrB+6,t10);
-    _fjsp_storeh_v2r8(ptrB+7,t10);
-    _fjsp_storel_v2r8(ptrB+8,t11);
-    _fjsp_storeh_v2r8(ptrB+9,t11);
-    _fjsp_storel_v2r8(ptrB+10,t12);
-    _fjsp_storeh_v2r8(ptrB+11,t12);
+    _fjsp_storeh_v2r8(ptrB+1, t7);
+    _fjsp_storel_v2r8(ptrB+2, t8);
+    _fjsp_storeh_v2r8(ptrB+3, t8);
+    _fjsp_storel_v2r8(ptrB+4, t9);
+    _fjsp_storeh_v2r8(ptrB+5, t9);
+    _fjsp_storel_v2r8(ptrB+6, t10);
+    _fjsp_storeh_v2r8(ptrB+7, t10);
+    _fjsp_storel_v2r8(ptrB+8, t11);
+    _fjsp_storeh_v2r8(ptrB+9, t11);
+    _fjsp_storel_v2r8(ptrB+10, t12);
+    _fjsp_storeh_v2r8(ptrB+11, t12);
  }
  
  
  
  static gmx_inline void
  gmx_fjsp_update_iforce_1atom_swizzle_v2r8(_fjsp_v2r8 fix1, _fjsp_v2r8 fiy1, _fjsp_v2r8 fiz1,
-                                      double * gmx_restrict fptr,
-                                      double * gmx_restrict fshiftptr)
+                                          double * gmx_restrict fptr,
+                                          double * gmx_restrict fshiftptr)
  {
-    __m128d t1,t2,t3,t4;
-    
+    __m128d t1, t2, t3, t4;
+
      /* transpose data */
-    t1 = fix1;
-    fix1 = _fjsp_unpacklo_v2r8(fix1,fiy1); /* y0 x0 */
-    fiy1 = _fjsp_unpackhi_v2r8(t1,fiy1);   /* y1 x1 */
-    
-    fix1 = _fjsp_add_v2r8(fix1,fiy1);
-    fiz1 = _fjsp_add_v2r8( fiz1, _fjsp_unpackhi_v2r8(fiz1,fiz1 ));
-    
+    t1   = fix1;
+    fix1 = _fjsp_unpacklo_v2r8(fix1, fiy1); /* y0 x0 */
+    fiy1 = _fjsp_unpackhi_v2r8(t1, fiy1);   /* y1 x1 */
+
+    fix1 = _fjsp_add_v2r8(fix1, fiy1);
+    fiz1 = _fjsp_add_v2r8( fiz1, _fjsp_unpackhi_v2r8(fiz1, fiz1 ));
+
      t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr), fix1 );
      _fjsp_storel_v2r8( fptr, t4 );
      _fjsp_storeh_v2r8( fptr+1, t4 );
-    _fjsp_storel_v2r8( fptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),fptr+2), fiz1 ));
-    
+    _fjsp_storel_v2r8( fptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), fptr+2), fiz1 ));
+
      t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fshiftptr), fix1 );
      _fjsp_storel_v2r8( fshiftptr, t4 );
      _fjsp_storeh_v2r8( fshiftptr+1, t4 );
-    _fjsp_storel_v2r8( fshiftptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),fshiftptr+2), fiz1 ));
+    _fjsp_storel_v2r8( fshiftptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), fshiftptr+2), fiz1 ));
  }
  
  static gmx_inline void
  gmx_fjsp_update_iforce_3atom_swizzle_v2r8(_fjsp_v2r8 fix1, _fjsp_v2r8 fiy1, _fjsp_v2r8 fiz1,
-                                      _fjsp_v2r8 fix2, _fjsp_v2r8 fiy2, _fjsp_v2r8 fiz2,
-                                      _fjsp_v2r8 fix3, _fjsp_v2r8 fiy3, _fjsp_v2r8 fiz3,
-                                      double * gmx_restrict fptr,
-                                      double * gmx_restrict fshiftptr)
+                                          _fjsp_v2r8 fix2, _fjsp_v2r8 fiy2, _fjsp_v2r8 fiz2,
+                                          _fjsp_v2r8 fix3, _fjsp_v2r8 fiy3, _fjsp_v2r8 fiz3,
+                                          double * gmx_restrict fptr,
+                                          double * gmx_restrict fshiftptr)
  {
-    __m128d t1,t2,t3,t4,t5,t6;
-    
+    __m128d t1, t2, t3, t4, t5, t6;
+
      /* transpose data */
-    GMX_FJSP_TRANSPOSE2_V2R8(fix1,fiy1);
-    GMX_FJSP_TRANSPOSE2_V2R8(fiz1,fix2);
-    GMX_FJSP_TRANSPOSE2_V2R8(fiy2,fiz2);
-    t1 = fix3;
-    fix3 = _fjsp_unpacklo_v2r8(fix3,fiy3); /* y0 x0 */
-    fiy3 = _fjsp_unpackhi_v2r8(t1,fiy3);   /* y1 x1 */
-    
-    fix1 = _fjsp_add_v2r8(fix1,fiy1);
-    fiz1 = _fjsp_add_v2r8(fiz1,fix2);
-    fiy2 = _fjsp_add_v2r8(fiy2,fiz2);
-    
-    fix3 = _fjsp_add_v2r8(fix3,fiy3);
-    fiz3 = _fjsp_add_v2r8( fiz3, _fjsp_unpackhi_v2r8(fiz3,fiz3));
-    
+    GMX_FJSP_TRANSPOSE2_V2R8(fix1, fiy1);
+    GMX_FJSP_TRANSPOSE2_V2R8(fiz1, fix2);
+    GMX_FJSP_TRANSPOSE2_V2R8(fiy2, fiz2);
+    t1   = fix3;
+    fix3 = _fjsp_unpacklo_v2r8(fix3, fiy3); /* y0 x0 */
+    fiy3 = _fjsp_unpackhi_v2r8(t1, fiy3);   /* y1 x1 */
+
+    fix1 = _fjsp_add_v2r8(fix1, fiy1);
+    fiz1 = _fjsp_add_v2r8(fiz1, fix2);
+    fiy2 = _fjsp_add_v2r8(fiy2, fiz2);
+
+    fix3 = _fjsp_add_v2r8(fix3, fiy3);
+    fiz3 = _fjsp_add_v2r8( fiz3, _fjsp_unpackhi_v2r8(fiz3, fiz3));
+
      t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr), fix1 );
      t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+2), fiz1 );
      t5 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+4), fiy2 );
@@ -849,48 +849,48 @@ gmx_fjsp_update_iforce_3atom_swizzle_v2r8(_fjsp_v2r8 fix1, _fjsp_v2r8 fiy1, _fjs
      _fjsp_storeh_v2r8( fptr+5, t5 );
      _fjsp_storel_v2r8( fptr+6, t6 );
      _fjsp_storeh_v2r8( fptr+7, t6 );
-    _fjsp_storel_v2r8( fptr+8, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),fptr+8), fiz3 ));
-    
-    fix1 = _fjsp_add_v2r8(fix1,fix3);
-    t1   = _fjsp_shuffle_v2r8(fiz1,fiy2,GMX_FJSP_SHUFFLE2(0,1));
-    fix1 = _fjsp_add_v2r8(fix1,t1); /* x and y sums */
-    
-    t2   = _fjsp_shuffle_v2r8(fiy2,fiy2,GMX_FJSP_SHUFFLE2(1,1));
-    fiz1 = _fjsp_add_v2r8(fiz1,fiz3);
-    fiz1 = _fjsp_add_v2r8(fiz1,t2); /* z sum */
-    
+    _fjsp_storel_v2r8( fptr+8, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), fptr+8), fiz3 ));
+
+    fix1 = _fjsp_add_v2r8(fix1, fix3);
+    t1   = _fjsp_shuffle_v2r8(fiz1, fiy2, GMX_FJSP_SHUFFLE2(0, 1));
+    fix1 = _fjsp_add_v2r8(fix1, t1); /* x and y sums */
+
+    t2   = _fjsp_shuffle_v2r8(fiy2, fiy2, GMX_FJSP_SHUFFLE2(1, 1));
+    fiz1 = _fjsp_add_v2r8(fiz1, fiz3);
+    fiz1 = _fjsp_add_v2r8(fiz1, t2); /* z sum */
+
      t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fshiftptr), fix1 );
      _fjsp_storel_v2r8( fshiftptr, t3 );
      _fjsp_storeh_v2r8( fshiftptr+1, t3 );
-    _fjsp_storel_v2r8( fshiftptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),fshiftptr+2), fiz1 ));
+    _fjsp_storel_v2r8( fshiftptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), fshiftptr+2), fiz1 ));
  }
  
  
  static gmx_inline void
  gmx_fjsp_update_iforce_4atom_swizzle_v2r8(_fjsp_v2r8 fix1, _fjsp_v2r8 fiy1, _fjsp_v2r8 fiz1,
-                                      _fjsp_v2r8 fix2, _fjsp_v2r8 fiy2, _fjsp_v2r8 fiz2,
-                                      _fjsp_v2r8 fix3, _fjsp_v2r8 fiy3, _fjsp_v2r8 fiz3,
-                                      _fjsp_v2r8 fix4, _fjsp_v2r8 fiy4, _fjsp_v2r8 fiz4,
-                                      double * gmx_restrict fptr,
-                                      double * gmx_restrict fshiftptr)
-{
-    __m128d t1,t2,t3,t4,t5,t6,t7,t8;
-    
+                                          _fjsp_v2r8 fix2, _fjsp_v2r8 fiy2, _fjsp_v2r8 fiz2,
+                                          _fjsp_v2r8 fix3, _fjsp_v2r8 fiy3, _fjsp_v2r8 fiz3,
+                                          _fjsp_v2r8 fix4, _fjsp_v2r8 fiy4, _fjsp_v2r8 fiz4,
+                                          double * gmx_restrict fptr,
+                                          double * gmx_restrict fshiftptr)
+{
+    __m128d t1, t2, t3, t4, t5, t6, t7, t8;
+
      /* transpose data */
-    GMX_FJSP_TRANSPOSE2_V2R8(fix1,fiy1);
-    GMX_FJSP_TRANSPOSE2_V2R8(fiz1,fix2);
-    GMX_FJSP_TRANSPOSE2_V2R8(fiy2,fiz2);
-    GMX_FJSP_TRANSPOSE2_V2R8(fix3,fiy3);
-    GMX_FJSP_TRANSPOSE2_V2R8(fiz3,fix4);
-    GMX_FJSP_TRANSPOSE2_V2R8(fiy4,fiz4);
-    
-    fix1 = _fjsp_add_v2r8(fix1,fiy1);
-    fiz1 = _fjsp_add_v2r8(fiz1,fix2);
-    fiy2 = _fjsp_add_v2r8(fiy2,fiz2);
-    fix3 = _fjsp_add_v2r8(fix3,fiy3);
-    fiz3 = _fjsp_add_v2r8(fiz3,fix4);
-    fiy4 = _fjsp_add_v2r8(fiy4,fiz4);
-    
+    GMX_FJSP_TRANSPOSE2_V2R8(fix1, fiy1);
+    GMX_FJSP_TRANSPOSE2_V2R8(fiz1, fix2);
+    GMX_FJSP_TRANSPOSE2_V2R8(fiy2, fiz2);
+    GMX_FJSP_TRANSPOSE2_V2R8(fix3, fiy3);
+    GMX_FJSP_TRANSPOSE2_V2R8(fiz3, fix4);
+    GMX_FJSP_TRANSPOSE2_V2R8(fiy4, fiz4);
+
+    fix1 = _fjsp_add_v2r8(fix1, fiy1);
+    fiz1 = _fjsp_add_v2r8(fiz1, fix2);
+    fiy2 = _fjsp_add_v2r8(fiy2, fiz2);
+    fix3 = _fjsp_add_v2r8(fix3, fiy3);
+    fiz3 = _fjsp_add_v2r8(fiz3, fix4);
+    fiy4 = _fjsp_add_v2r8(fiy4, fiz4);
+
      t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr),    fix1 );
      t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+2),  fiz1 );
      t5 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+4),  fiy2 );
@@ -910,20 +910,20 @@ gmx_fjsp_update_iforce_4atom_swizzle_v2r8(_fjsp_v2r8 fix1, _fjsp_v2r8 fiy1, _fjs
      _fjsp_storel_v2r8( fptr+10, t8 );
      _fjsp_storeh_v2r8( fptr+11, t8 );
  
-    t1 = _fjsp_shuffle_v2r8(fiz1,fiy2,GMX_FJSP_SHUFFLE2(0,1));
-    fix1 = _fjsp_add_v2r8(fix1,t1);
-    t2 = _fjsp_shuffle_v2r8(fiz3,fiy4,GMX_FJSP_SHUFFLE2(0,1));
-    fix3 = _fjsp_add_v2r8(fix3,t2);
-    fix1 = _fjsp_add_v2r8(fix1,fix3); /* x and y sums */
-    
-    fiz1 = _fjsp_add_v2r8(fiz1, _fjsp_unpackhi_v2r8(fiy2,fiy2));
-    fiz3 = _fjsp_add_v2r8(fiz3, _fjsp_unpackhi_v2r8(fiy4,fiy4));
-    fiz1 = _fjsp_add_v2r8(fiz1,fiz3); /* z sum */
-    
+    t1   = _fjsp_shuffle_v2r8(fiz1, fiy2, GMX_FJSP_SHUFFLE2(0, 1));
+    fix1 = _fjsp_add_v2r8(fix1, t1);
+    t2   = _fjsp_shuffle_v2r8(fiz3, fiy4, GMX_FJSP_SHUFFLE2(0, 1));
+    fix3 = _fjsp_add_v2r8(fix3, t2);
+    fix1 = _fjsp_add_v2r8(fix1, fix3); /* x and y sums */
+
+    fiz1 = _fjsp_add_v2r8(fiz1, _fjsp_unpackhi_v2r8(fiy2, fiy2));
+    fiz3 = _fjsp_add_v2r8(fiz3, _fjsp_unpackhi_v2r8(fiy4, fiy4));
+    fiz1 = _fjsp_add_v2r8(fiz1, fiz3); /* z sum */
+
      t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fshiftptr), fix1 );
      _fjsp_storel_v2r8( fshiftptr, t3 );
      _fjsp_storeh_v2r8( fshiftptr+1, t3 );
-    _fjsp_storel_v2r8( fshiftptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),fshiftptr+2), fiz1 ));
+    _fjsp_storel_v2r8( fshiftptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), fshiftptr+2), fiz1 ));
  }
  
  
@@ -931,20 +931,20 @@ gmx_fjsp_update_iforce_4atom_swizzle_v2r8(_fjsp_v2r8 fix1, _fjsp_v2r8 fiy1, _fjs
  static gmx_inline void
  gmx_fjsp_update_1pot_v2r8(_fjsp_v2r8 pot1, double * gmx_restrict ptrA)
  {
-    pot1 = _fjsp_add_v2r8(pot1, _fjsp_unpackhi_v2r8(pot1,pot1));
-    _fjsp_storel_v2r8(ptrA,_fjsp_add_v2r8(pot1,_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA)));
+    pot1 = _fjsp_add_v2r8(pot1, _fjsp_unpackhi_v2r8(pot1, pot1));
+    _fjsp_storel_v2r8(ptrA, _fjsp_add_v2r8(pot1, _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA)));
  }
  
  static gmx_inline void
  gmx_fjsp_update_2pot_v2r8(_fjsp_v2r8 pot1, double * gmx_restrict ptrA,
-                      _fjsp_v2r8 pot2, double * gmx_restrict ptrB)
-{
-    GMX_FJSP_TRANSPOSE2_V2R8(pot1,pot2);
-    pot1 = _fjsp_add_v2r8(pot1,pot2);
-    pot2 = _fjsp_unpackhi_v2r8(pot1,pot1);
-    
-    _fjsp_storel_v2r8(ptrA,_fjsp_add_v2r8(pot1,_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA)));
-    _fjsp_storel_v2r8(ptrB,_fjsp_add_v2r8(pot2,_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB)));
+                          _fjsp_v2r8 pot2, double * gmx_restrict ptrB)
+{
+    GMX_FJSP_TRANSPOSE2_V2R8(pot1, pot2);
+    pot1 = _fjsp_add_v2r8(pot1, pot2);
+    pot2 = _fjsp_unpackhi_v2r8(pot1, pot1);
+
+    _fjsp_storel_v2r8(ptrA, _fjsp_add_v2r8(pot1, _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA)));
+    _fjsp_storel_v2r8(ptrB, _fjsp_add_v2r8(pot2, _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB)));
  }
  
  
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/make_nb_kernel_sparc64_hpc_ace_double.py b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/make_nb_kernel_sparc64_hpc_ace_double.py

index e8dbdebe7652aaa8480c9c8497280d4853572a0b..72deaf013d729cb241c5c135bfb26f7b409a3f82 100755 (executable)
--- a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/make_nb_kernel_sparc64_hpc_ace_double.py
+++ b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/make_nb_kernel_sparc64_hpc_ace_double.py
@@ -495,13 +495,13 @@ pass
  # Write out the list of settings and corresponding kernels to the declaration file
  fpdecl.write( '\n\n' )
  fpdecl.write( 'nb_kernel_info_t\n' )
-fpdecl.write( 'kernellist_'+Arch+'[] =\n' )
+fpdecl.write( '    kernellist_'+Arch+'[] =\n' )
  fpdecl.write( '{\n' )
  for decl in kerneldecl[0:-1]:
      fpdecl.write( decl + ',\n' )
  fpdecl.write( kerneldecl[-1] + '\n' )
  fpdecl.write( '};\n\n' )
  fpdecl.write( 'int\n' )
-fpdecl.write( 'kernellist_'+Arch+'_size = sizeof(kernellist_'+Arch+')/sizeof(kernellist_'+Arch+'[0]);\n\n')
+fpdecl.write( '    kernellist_'+Arch+'_size = sizeof(kernellist_'+Arch+')/sizeof(kernellist_'+Arch+'[0]);\n\n')
  fpdecl.write( '#endif\n')
  fpdecl.close()
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.c

index 4471c61789359e1578204b0440b4c3a4e867dff0..d603feb1b87d3d1db134eb15362e59da3425b8f5 100644 (file)
--- a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.c
+++ b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.c
@@ -257,7 +257,7 @@ nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double;
  
  
  nb_kernel_info_t
-kernellist_sparc64_hpc_ace_double[] =
+    kernellist_sparc64_hpc_ace_double[] =
  {
      { nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
      { nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
@@ -476,6 +476,6 @@ kernellist_sparc64_hpc_ace_double[] =
  };
  
  int
-kernellist_sparc64_hpc_ace_double_size = sizeof(kernellist_sparc64_hpc_ace_double)/sizeof(kernellist_sparc64_hpc_ace_double[0]);
+    kernellist_sparc64_hpc_ace_double_size = sizeof(kernellist_sparc64_hpc_ace_double)/sizeof(kernellist_sparc64_hpc_ace_double[0]);
  
  #endif
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.h b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.h

index 3260a35c6b9c69af3825234ccb40408ded3a6693..67ace2f0e7400de7daad22475f1af9ac209bb992 100644 (file)
--- a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.h
+++ b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.h
@@ -40,10 +40,10 @@
  
  /* List of kernels for this architecture with metadata about them */
  extern nb_kernel_info_t
-kernellist_sparc64_hpc_ace_double[];
+    kernellist_sparc64_hpc_ace_double[];
  
  /* Length of kernellist_c */
  extern int
-kernellist_sparc64_hpc_ace_double_size;
+    kernellist_sparc64_hpc_ace_double_size;
  
  #endif
diff --git a/src/gromacs/gmxpreprocess/gen_vsite.c b/src/gromacs/gmxpreprocess/gen_vsite.c

index 106f23877e4db0c13f9905149734d15171ddacd1..f3ad7fa4e4c6b8ed92bf9d3af1c13c920c1cadbe 100644 (file)
--- a/src/gromacs/gmxpreprocess/gen_vsite.c
+++ b/src/gromacs/gmxpreprocess/gen_vsite.c
@@ -1076,16 +1076,16 @@ static int gen_vsites_trp(gpp_atomtype_t atype, rvec *newx[],
      for (j = 0; j < NMASS; j++)
      {
          sprintf(name, "MW%d", j+1);
-        (*newatomname)  [atM[j]]        = put_symtab(symtab, name);
-        (*newatom)      [atM[j]].m      = (*newatom)[atM[j]].mB    = mM[j];
-        (*newatom)      [atM[j]].q      = (*newatom)[atM[j]].qB    = 0.0;
-        (*newatom)      [atM[j]].type   = (*newatom)[atM[j]].typeB = tpM;
-        (*newatom)      [atM[j]].ptype  = eptAtom;
-        (*newatom)      [atM[j]].resind = at->atom[i0].resind;
+        (*newatomname)  [atM[j]]         = put_symtab(symtab, name);
+        (*newatom)      [atM[j]].m       = (*newatom)[atM[j]].mB    = mM[j];
+        (*newatom)      [atM[j]].q       = (*newatom)[atM[j]].qB    = 0.0;
+        (*newatom)      [atM[j]].type    = (*newatom)[atM[j]].typeB = tpM;
+        (*newatom)      [atM[j]].ptype   = eptAtom;
+        (*newatom)      [atM[j]].resind  = at->atom[i0].resind;
          (*newatom)      [atM[j]].elem[0] = 'M';
          (*newatom)      [atM[j]].elem[1] = '\0';
-        (*newvsite_type)[atM[j]]        = NOTSET;
-        (*newcgnr)      [atM[j]]        = (*cgnr)[i0];
+        (*newvsite_type)[atM[j]]         = NOTSET;
+        (*newcgnr)      [atM[j]]         = (*cgnr)[i0];
      }
      /* renumber cgnr: */
      for (i = i0; i < at->nr; i++)
@@ -1260,16 +1260,16 @@ static int gen_vsites_tyr(gpp_atomtype_t atype, rvec *newx[],
      rvec_add(r1, x[ats[atHH]], (*newx)[atM]);
  
      strcpy(name, "MW1");
-    (*newatomname)  [atM]        = put_symtab(symtab, name);
-    (*newatom)      [atM].m      = (*newatom)[atM].mB    = mM;
-    (*newatom)      [atM].q      = (*newatom)[atM].qB    = 0.0;
-    (*newatom)      [atM].type   = (*newatom)[atM].typeB = tpM;
-    (*newatom)      [atM].ptype  = eptAtom;
-    (*newatom)      [atM].resind = at->atom[i0].resind;
+    (*newatomname)  [atM]         = put_symtab(symtab, name);
+    (*newatom)      [atM].m       = (*newatom)[atM].mB    = mM;
+    (*newatom)      [atM].q       = (*newatom)[atM].qB    = 0.0;
+    (*newatom)      [atM].type    = (*newatom)[atM].typeB = tpM;
+    (*newatom)      [atM].ptype   = eptAtom;
+    (*newatom)      [atM].resind  = at->atom[i0].resind;
      (*newatom)      [atM].elem[0] = 'M';
      (*newatom)      [atM].elem[1] = '\0';
-    (*newvsite_type)[atM]        = NOTSET;
-    (*newcgnr)      [atM]        = (*cgnr)[i0];
+    (*newvsite_type)[atM]         = NOTSET;
+    (*newcgnr)      [atM]         = (*cgnr)[i0];
      /* renumber cgnr: */
      for (i = i0; i < at->nr; i++)
      {
@@ -1988,18 +1988,18 @@ void do_vsites(int nrtp, t_restp rtp[], gpp_atomtype_t atype,
                          {
                              name[k+1] = (*at->atomname[Heavy])[k];
                          }
-                        name[k+1]             = atomnamesuffix[j];
-                        name[k+2]             = '\0';
-                        newatomname[ni0+j]    = put_symtab(symtab, name);
-                        newatom[ni0+j].m      = newatom[ni0+j].mB    = mtot/NMASS;
-                        newatom[ni0+j].q      = newatom[ni0+j].qB    = 0.0;
-                        newatom[ni0+j].type   = newatom[ni0+j].typeB = tpM;
-                        newatom[ni0+j].ptype  = eptAtom;
-                        newatom[ni0+j].resind = at->atom[i0].resind;
+                        name[k+1]              = atomnamesuffix[j];
+                        name[k+2]              = '\0';
+                        newatomname[ni0+j]     = put_symtab(symtab, name);
+                        newatom[ni0+j].m       = newatom[ni0+j].mB    = mtot/NMASS;
+                        newatom[ni0+j].q       = newatom[ni0+j].qB    = 0.0;
+                        newatom[ni0+j].type    = newatom[ni0+j].typeB = tpM;
+                        newatom[ni0+j].ptype   = eptAtom;
+                        newatom[ni0+j].resind  = at->atom[i0].resind;
                          newatom[ni0+j].elem[0] = 'M';
                          newatom[ni0+j].elem[1] = '\0';
-                        newvsite_type[ni0+j]  = NOTSET;
-                        newcgnr[ni0+j]        = (*cgnr)[i0];
+                        newvsite_type[ni0+j]   = NOTSET;
+                        newcgnr[ni0+j]         = (*cgnr)[i0];
                      }
                      /* add constraints between dummy masses and to heavies[0] */
                      /* 'add_shift' says which atoms won't be renumbered afterwards */
diff --git a/src/gromacs/gmxpreprocess/readpull.c b/src/gromacs/gmxpreprocess/readpull.c

index b4ba14531295a4bdf7aa1a7ce1540a32bec5ff91..ce9fb64b2f740aee8e38e0f0369a225a959320c2 100644 (file)
--- a/src/gromacs/gmxpreprocess/readpull.c
+++ b/src/gromacs/gmxpreprocess/readpull.c
@@ -3,7 +3,7 @@
   *
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013,2014 by the GROMACS development team, led by
+ * Copyright (c) 2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -72,7 +72,7 @@ static void string2dvec(const char buf[], dvec nums)
  }
  
  static void init_pull_group(t_pull_group *pg,
-                            const char *wbuf)
+                            const char   *wbuf)
  {
      double d;
      int    n, m;
@@ -98,7 +98,7 @@ static void init_pull_coord(t_pull_coord *pcrd, int eGeom,
      string2dvec(origin_buf, origin);
      if (pcrd->group[0] != 0 && dnorm(origin) > 0)
      {
-        gmx_fatal(FARGS,"The pull origin can only be set with an absolute reference");
+        gmx_fatal(FARGS, "The pull origin can only be set with an absolute reference");
      }
  
      if (eGeom == epullgDIST)
@@ -235,7 +235,7 @@ char **read_pullparams(int *ninp_p, t_inpfile **inp_p,
      return grpbuf;
  }
  
-void make_pull_groups(t_pull *pull, 
+void make_pull_groups(t_pull *pull,
                        char **pgnames,
                        const t_blocka *grps, char **gnames)
  {
@@ -413,14 +413,14 @@ void set_pull_init(t_inputrec *ir, gmx_mtop_t *mtop, rvec *x, matrix box, real l
      {
          pcrd  = &pull->coord[c];
  
-        pgrp0 = &pull->group[pcrd->group[0]]; 
-        pgrp1 = &pull->group[pcrd->group[1]]; 
+        pgrp0 = &pull->group[pcrd->group[0]];
+        pgrp1 = &pull->group[pcrd->group[1]];
          fprintf(stderr, "%8d  %8d  %8d\n",
                  pcrd->group[0], pgrp0->nat, pgrp0->pbcatom+1);
          fprintf(stderr, "%8d  %8d  %8d ",
                  pcrd->group[1], pgrp1->nat, pgrp1->pbcatom+1);
  
-        init = pcrd->init;
+        init       = pcrd->init;
          pcrd->init = 0;
  
          if (pcrd->rate == 0)
diff --git a/src/gromacs/gmxpreprocess/toppush.c b/src/gromacs/gmxpreprocess/toppush.c

index 88a16f6d269d7b2526dc892499d767919011055d..6f2badc2e5b09b0b57d034b9ad314663639a1f88 100644 (file)
--- a/src/gromacs/gmxpreprocess/toppush.c
+++ b/src/gromacs/gmxpreprocess/toppush.c
@@ -2514,14 +2514,14 @@ int add_atomtype_decoupled(t_symtab *symtab, gpp_atomtype_t at,
  static void convert_pairs_to_pairsQ(t_params *plist,
                                      real fudgeQQ, t_atoms *atoms)
  {
-    t_param *paramp1,*paramp2,*paramnew;
-    int      i,j,p1nr,p2nr,p2newnr;
+    t_param *paramp1, *paramp2, *paramnew;
+    int      i, j, p1nr, p2nr, p2newnr;
  
      /* Add the pair list to the pairQ list */
-    p1nr = plist[F_LJ14].nr;
-    p2nr = plist[F_LJC14_Q].nr;
+    p1nr    = plist[F_LJ14].nr;
+    p2nr    = plist[F_LJC14_Q].nr;
      p2newnr = p1nr + p2nr;
-    snew(paramnew,p2newnr);
+    snew(paramnew, p2newnr);
  
      paramp1             = plist[F_LJ14].param;
      paramp2             = plist[F_LJC14_Q].param;
@@ -2530,18 +2530,18 @@ static void convert_pairs_to_pairsQ(t_params *plist,
         it may be possible to just ADD the converted F_LJ14 array
         to the old F_LJC14_Q array, but since we have to create
         a new sized memory structure, better just to deep copy it all.
-    */
+     */
  
      for (i = 0; i < p2nr; i++)
      {
          /* Copy over parameters */
-        for (j=0;j<5;j++) /* entries are 0-4 for F_LJC14_Q */
+        for (j = 0; j < 5; j++) /* entries are 0-4 for F_LJC14_Q */
          {
              paramnew[i].c[j] = paramp2[i].c[j];
          }
  
          /* copy over atoms */
-        for (j=0;j<2;j++)
+        for (j = 0; j < 2; j++)
          {
              paramnew[i].a[j] = paramp2[i].a[j];
          }
diff --git a/src/gromacs/legacyheaders/thread_mpi/atomic/gcc_x86.h b/src/gromacs/legacyheaders/thread_mpi/atomic/gcc_x86.h

index 403634cd5b4dd8e3a1f5c3690cbd5896e6e028d9..122dff53278d78ecc2636c9f93893b50001393fa 100644 (file)
--- a/src/gromacs/legacyheaders/thread_mpi/atomic/gcc_x86.h
+++ b/src/gromacs/legacyheaders/thread_mpi/atomic/gcc_x86.h
@@ -113,7 +113,7 @@ typedef struct tMPI_Spinlock
  #define tMPI_Atomic_memory_barrier() __asm__ __volatile__("sfence;" : : : "memory")
  #else
  /* MIC is in-order and does not need nor support sfense */
-#define tMPI_Atomic_memory_barrier() __asm__ __volatile__("":::"memory")
+#define tMPI_Atomic_memory_barrier() __asm__ __volatile__("" ::: "memory")
  #endif
  
  #define TMPI_ATOMIC_HAVE_NATIVE_FETCH_ADD
diff --git a/src/gromacs/legacyheaders/thread_mpi/atomic/xlc_ppc.h b/src/gromacs/legacyheaders/thread_mpi/atomic/xlc_ppc.h

index 3b4f985db3e34f0f2186bb24016332814c5d618d..19f7bb6a490cfbea278358f11559e48e1c282308 100644 (file)
--- a/src/gromacs/legacyheaders/thread_mpi/atomic/xlc_ppc.h
+++ b/src/gromacs/legacyheaders/thread_mpi/atomic/xlc_ppc.h
@@ -141,7 +141,7 @@ static inline int tMPI_Atomic_cas(tMPI_Atomic_t *a, int oldval, int newval)
  static inline int tMPI_Atomic_ptr_cas(tMPI_Atomic_ptr_t *a, void* oldval,
                                        void* newval)
  {
-    int ret;
+    int                     ret;
      volatile char* volatile oldv = (char*)oldval;
      volatile char* volatile newv = (char*)newval;
  
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu

index 5a3a8fd755b9f90ab36c41136e355f3f5d87a45e..d8be3de2e251560b57e81537cfd6351951a2bd2b 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
@@ -42,7 +42,7 @@
  
  #include <cuda.h>
  
-#include "types/simple.h" 
+#include "types/simple.h"
  #include "types/nbnxn_pairlist.h"
  #include "types/nb_verlet.h"
  #include "types/ishift.h"
@@ -153,7 +153,7 @@ static const int nPruneKernelTypes  = 2; /* 0 - no prune, 1 - prune */
   *  order of corresponding enumerated types defined in nbnxn_cuda_types.h.
   */
  static const nbnxn_cu_kfunc_ptr_t
-nb_default_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
+    nb_default_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
  {
      { { k_nbnxn_cutoff,                     k_nbnxn_cutoff_prune },
        { k_nbnxn_cutoff_ener,                k_nbnxn_cutoff_ener_prune } },
@@ -202,7 +202,7 @@ static inline int calc_shmem_required()
      return shmem;
  }
  
-/*! As we execute nonbonded workload in separate streams, before launching 
+/*! As we execute nonbonded workload in separate streams, before launching
     the kernel we need to make sure that he following operations have completed:
     - atomdata allocation and related H2D transfers (every nstlist step);
     - pair list H2D transfer (every nstlist step);
@@ -217,28 +217,28 @@ static inline int calc_shmem_required()
     However, for the sake of having a future-proof implementation, we use the
     misc_ops_done event to record the point in time when the above  operations
     are finished and synchronize with this event in the non-local stream.
-*/
-void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
+ */
+void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t        cu_nb,
                                const nbnxn_atomdata_t *nbatom,
-                              int flags,
-                              int iloc)
+                              int                     flags,
+                              int                     iloc)
  {
-    cudaError_t stat;
-    int adat_begin, adat_len;  /* local/nonlocal offset and length used for xq and f */
+    cudaError_t          stat;
+    int                  adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
      /* CUDA kernel launch-related stuff */
-    int  shmem, nblock;
-    dim3 dim_block, dim_grid;
+    int                  shmem, nblock;
+    dim3                 dim_block, dim_grid;
      nbnxn_cu_kfunc_ptr_t nb_kernel = NULL; /* fn pointer to the nonbonded kernel */
  
-    cu_atomdata_t   *adat   = cu_nb->atdat;
-    cu_nbparam_t    *nbp    = cu_nb->nbparam;
-    cu_plist_t      *plist  = cu_nb->plist[iloc];
-    cu_timers_t     *t      = cu_nb->timers;
-    cudaStream_t    stream  = cu_nb->stream[iloc];
+    cu_atomdata_t       *adat    = cu_nb->atdat;
+    cu_nbparam_t        *nbp     = cu_nb->nbparam;
+    cu_plist_t          *plist   = cu_nb->plist[iloc];
+    cu_timers_t         *t       = cu_nb->timers;
+    cudaStream_t         stream  = cu_nb->stream[iloc];
  
-    bool bCalcEner   = flags & GMX_FORCE_VIRIAL;
-    bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
-    bool bDoTime     = cu_nb->bDoTime;
+    bool                 bCalcEner   = flags & GMX_FORCE_VIRIAL;
+    bool                 bCalcFshift = flags & GMX_FORCE_VIRIAL;
+    bool                 bDoTime     = cu_nb->bDoTime;
  
      /* turn energy calculation always on/off (for debugging/testing only) */
      bCalcEner = (bCalcEner || always_ener) && !never_ener;
@@ -286,7 +286,7 @@ void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
  
      /* HtoD x, q */
      cu_copy_H2D_async(adat->xq + adat_begin, nbatom->x + adat_begin * 4,
-                      adat_len * sizeof(*adat->xq), stream); 
+                      adat_len * sizeof(*adat->xq), stream);
  
      if (bDoTime)
      {
@@ -320,7 +320,7 @@ void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
                  NCL_PER_SUPERCL, plist->na_c);
      }
  
-    nb_kernel<<<dim_grid, dim_block, shmem, stream>>>(*adat, *nbp, *plist, bCalcFshift);
+    nb_kernel<<< dim_grid, dim_block, shmem, stream>>> (*adat, *nbp, *plist, bCalcFshift);
      CU_LAUNCH_ERR("k_calc_nb");
  
      if (bDoTime)
@@ -330,14 +330,14 @@ void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
      }
  }
  
-void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
+void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t        cu_nb,
                                 const nbnxn_atomdata_t *nbatom,
-                               int flags,
-                               int aloc)
+                               int                     flags,
+                               int                     aloc)
  {
      cudaError_t stat;
-    int adat_begin, adat_len, adat_end;  /* local/nonlocal offset and length used for xq and f */
-    int iloc = -1;
+    int         adat_begin, adat_len, adat_end; /* local/nonlocal offset and length used for xq and f */
+    int         iloc = -1;
  
      /* determine interaction locality from atom locality */
      if (LOCAL_A(aloc))
@@ -356,13 +356,13 @@ void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
          gmx_incons(stmp);
      }
  
-    cu_atomdata_t   *adat   = cu_nb->atdat;
-    cu_timers_t     *t      = cu_nb->timers;
-    bool            bDoTime = cu_nb->bDoTime;
-    cudaStream_t    stream  = cu_nb->stream[iloc];
+    cu_atomdata_t   *adat    = cu_nb->atdat;
+    cu_timers_t     *t       = cu_nb->timers;
+    bool             bDoTime = cu_nb->bDoTime;
+    cudaStream_t     stream  = cu_nb->stream[iloc];
  
-    bool bCalcEner   = flags & GMX_FORCE_VIRIAL;
-    bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
+    bool             bCalcEner   = flags & GMX_FORCE_VIRIAL;
+    bool             bCalcFshift = flags & GMX_FORCE_VIRIAL;
  
      /* don't launch copy-back if there was no work to do */
      if (cu_nb->plist[iloc]->nsci == 0)
@@ -419,7 +419,7 @@ void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
          *(unsigned int*)&nbatom->out[0].f[adat_end*3 - 1] = poll_wait_pattern;
      }
  
-    /* With DD the local D2H transfer can only start after the non-local 
+    /* With DD the local D2H transfer can only start after the non-local
         has been launched. */
      if (iloc == eintLocal && cu_nb->bUseTwoStreams)
      {
@@ -428,7 +428,7 @@ void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
      }
  
      /* DtoH f */
-    cu_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f + adat_begin, 
+    cu_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f + adat_begin,
                        (adat_len)*sizeof(*adat->f), stream);
  
      /* After the non-local D2H is launched the nonlocal_done event can be
@@ -472,8 +472,8 @@ void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
   * polling wait for the GPU.
   */
  static inline bool atomic_cas(volatile unsigned int *ptr,
-                              unsigned int oldval,
-                              unsigned int newval)
+                              unsigned int           oldval,
+                              unsigned int           newval)
  {
      assert(ptr);
  
@@ -491,8 +491,8 @@ void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
                           real *e_lj, real *e_el, rvec *fshift)
  {
      /* NOTE:  only implemented for single-precision at this time */
-    cudaError_t stat;
-    int i, adat_end, iloc = -1;
+    cudaError_t            stat;
+    int                    i, adat_end, iloc = -1;
      volatile unsigned int *poll_word;
  
      /* determine interaction locality from atom locality */
@@ -512,16 +512,16 @@ void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
          gmx_incons(stmp);
      }
  
-    cu_plist_t      *plist   = cu_nb->plist[iloc];
-    cu_timers_t     *timers  = cu_nb->timers;
-    wallclock_gpu_t *timings = cu_nb->timings;
-    nb_staging      nbst     = cu_nb->nbst;
+    cu_plist_t      *plist    = cu_nb->plist[iloc];
+    cu_timers_t     *timers   = cu_nb->timers;
+    wallclock_gpu_t *timings  = cu_nb->timings;
+    nb_staging       nbst     = cu_nb->nbst;
  
-    bool    bCalcEner   = flags & GMX_FORCE_VIRIAL;
-    bool    bCalcFshift = flags & GMX_FORCE_VIRIAL;
+    bool             bCalcEner   = flags & GMX_FORCE_VIRIAL;
+    bool             bCalcFshift = flags & GMX_FORCE_VIRIAL;
  
      /* turn energy calculation always on/off (for debugging/testing only) */
-    bCalcEner = (bCalcEner || always_ener) && !never_ener; 
+    bCalcEner = (bCalcEner || always_ener) && !never_ener;
  
      /* don't launch wait/update timers & counters if there was no work to do
  
@@ -548,7 +548,7 @@ void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
          stat = cudaStreamSynchronize(cu_nb->stream[iloc]);
          CU_RET_ERR(stat, "cudaStreamSynchronize failed in cu_blockwait_nb");
      }
-    else 
+    else
      {
          /* Busy-wait until we get the signal pattern set in last byte
           * of the l/nl float vector. This pattern corresponds to a floating
@@ -557,7 +557,9 @@ void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
           * The polling uses atomic compare-exchange.
           */
          poll_word = (volatile unsigned int*)&nbatom->out[0].f[adat_end*3 - 1];
-        while (atomic_cas(poll_word, poll_wait_pattern, poll_wait_pattern)) {}
+        while (atomic_cas(poll_word, poll_wait_pattern, poll_wait_pattern))
+        {
+        }
      }
  
      /* timing data accumulation */
@@ -576,9 +578,9 @@ void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
  
          /* X/q H2D and F D2H timings */
          timings->nb_h2d_t += cu_event_elapsed(timers->start_nb_h2d[iloc],
-                                                 timers->stop_nb_h2d[iloc]);
+                                              timers->stop_nb_h2d[iloc]);
          timings->nb_d2h_t += cu_event_elapsed(timers->start_nb_d2h[iloc],
-                                                 timers->stop_nb_d2h[iloc]);
+                                              timers->stop_nb_d2h[iloc]);
  
          /* only count atdat and pair-list H2D at pair-search step */
          if (plist->bDoPrune)
@@ -588,11 +590,11 @@ void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
              {
                  timings->pl_h2d_c++;
                  timings->pl_h2d_t += cu_event_elapsed(timers->start_atdat,
-                                                         timers->stop_atdat);
+                                                      timers->stop_atdat);
              }
  
              timings->pl_h2d_t += cu_event_elapsed(timers->start_pl_h2d[iloc],
-                                                     timers->stop_pl_h2d[iloc]);
+                                                  timers->stop_pl_h2d[iloc]);
          }
      }
  
@@ -621,13 +623,13 @@ void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
  }
  
  /*! Return the reference to the nbfp texture. */
-const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_nbfp_texref()
+const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_texref()
  {
      return nbfp_texref;
  }
  
  /*! Return the reference to the coulomb_tab. */
-const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_coulomb_tab_texref()
+const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_coulomb_tab_texref()
  {
      return coulomb_tab_texref;
  }
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu

index 5de2e80988a77087e62a1697bb998104481567a4..fd75a3d1f0cc08c6bc4cf2015e65bfa2583cea47 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
@@ -70,8 +70,8 @@ static unsigned int gpu_min_ci_balanced_factor = 40;
  
  /* Functions from nbnxn_cuda.cu */
  extern void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo);
-extern const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_nbfp_texref();
-extern const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_coulomb_tab_texref();
+extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_texref();
+extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_coulomb_tab_texref();
  
  /* We should actually be using md_print_warn in md_logging.c,
   * but we can't include mpi.h in CUDA code.
@@ -113,9 +113,9 @@ static void init_ewald_coulomb_force_table(cu_nbparam_t          *nbp,
                                             const cuda_dev_info_t *dev_info)
  {
      float       *ftmp, *coul_tab;
-    int         tabsize;
-    double      tabscale;
-    cudaError_t stat;
+    int          tabsize;
+    double       tabscale;
+    cudaError_t  stat;
  
      tabsize     = GPU_EWALD_COULOMB_FORCE_TABLE_SIZE;
      /* Subtract 2 iso 1 to avoid access out of range due to rounding */
@@ -182,7 +182,7 @@ static void init_atomdata_first(cu_atomdata_t *ad, int ntypes)
      cudaError_t stat;
  
      ad->ntypes  = ntypes;
-    stat = cudaMalloc((void**)&ad->shift_vec, SHIFTS*sizeof(*ad->shift_vec));
+    stat        = cudaMalloc((void**)&ad->shift_vec, SHIFTS*sizeof(*ad->shift_vec));
      CU_RET_ERR(stat, "cudaMalloc failed on ad->shift_vec");
      ad->bShiftVecUploaded = false;
  
@@ -259,25 +259,25 @@ static int pick_ewald_kernel_type(bool                   bTwinCut,
  
  
  /*! Initializes the nonbonded parameter data structure. */
-static void init_nbparam(cu_nbparam_t *nbp,
+static void init_nbparam(cu_nbparam_t              *nbp,
                           const interaction_const_t *ic,
-                         const nbnxn_atomdata_t *nbat,
-                         const cuda_dev_info_t *dev_info)
+                         const nbnxn_atomdata_t    *nbat,
+                         const cuda_dev_info_t     *dev_info)
  {
      cudaError_t stat;
      int         ntypes, nnbfp;
  
      ntypes  = nbat->ntype;
  
-    nbp->ewald_beta = ic->ewaldcoeff_q;
-    nbp->sh_ewald   = ic->sh_ewald;
-    nbp->epsfac     = ic->epsfac;
-    nbp->two_k_rf   = 2.0 * ic->k_rf;
-    nbp->c_rf       = ic->c_rf;
-    nbp->rvdw_sq    = ic->rvdw * ic->rvdw;
-    nbp->rcoulomb_sq= ic->rcoulomb * ic->rcoulomb;
-    nbp->rlist_sq   = ic->rlist * ic->rlist;
-    nbp->sh_invrc6  = ic->sh_invrc6;
+    nbp->ewald_beta  = ic->ewaldcoeff_q;
+    nbp->sh_ewald    = ic->sh_ewald;
+    nbp->epsfac      = ic->epsfac;
+    nbp->two_k_rf    = 2.0 * ic->k_rf;
+    nbp->c_rf        = ic->c_rf;
+    nbp->rvdw_sq     = ic->rvdw * ic->rvdw;
+    nbp->rcoulomb_sq = ic->rcoulomb * ic->rcoulomb;
+    nbp->rlist_sq    = ic->rlist * ic->rlist;
+    nbp->sh_invrc6   = ic->sh_invrc6;
  
      if (ic->eeltype == eelCUT)
      {
@@ -287,7 +287,7 @@ static void init_nbparam(cu_nbparam_t *nbp,
      {
          nbp->eeltype = eelCuRF;
      }
-    else if ((EEL_PME(ic->eeltype) || ic->eeltype==eelEWALD))
+    else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD))
      {
          /* Initially rcoulomb == rvdw, so it's surely not twin cut-off. */
          nbp->eeltype = pick_ewald_kernel_type(false, dev_info);
@@ -306,41 +306,41 @@ static void init_nbparam(cu_nbparam_t *nbp,
      }
  
      nnbfp = 2*ntypes*ntypes;
-    stat = cudaMalloc((void **)&nbp->nbfp, nnbfp*sizeof(*nbp->nbfp));
+    stat  = cudaMalloc((void **)&nbp->nbfp, nnbfp*sizeof(*nbp->nbfp));
      CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp");
      cu_copy_H2D(nbp->nbfp, nbat->nbfp, nnbfp*sizeof(*nbp->nbfp));
  
  #ifdef TEXOBJ_SUPPORTED
-        /* Only device CC >= 3.0 (Kepler and later) support texture objects */
-        if (dev_info->prop.major >= 3)
-        {
-            cudaResourceDesc rd;
-            memset(&rd, 0, sizeof(rd));
-            rd.resType                  = cudaResourceTypeLinear;
-            rd.res.linear.devPtr        = nbp->nbfp;
-            rd.res.linear.desc.f        = cudaChannelFormatKindFloat;
-            rd.res.linear.desc.x        = 32;
-            rd.res.linear.sizeInBytes   = nnbfp*sizeof(*nbp->nbfp);
-
-            cudaTextureDesc td;
-            memset(&td, 0, sizeof(td));
-            td.readMode                 = cudaReadModeElementType;
-            stat = cudaCreateTextureObject(&nbp->nbfp_texobj, &rd, &td, NULL);
-            CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_texobj failed");
-        }
-        else
+    /* Only device CC >= 3.0 (Kepler and later) support texture objects */
+    if (dev_info->prop.major >= 3)
+    {
+        cudaResourceDesc rd;
+        memset(&rd, 0, sizeof(rd));
+        rd.resType                  = cudaResourceTypeLinear;
+        rd.res.linear.devPtr        = nbp->nbfp;
+        rd.res.linear.desc.f        = cudaChannelFormatKindFloat;
+        rd.res.linear.desc.x        = 32;
+        rd.res.linear.sizeInBytes   = nnbfp*sizeof(*nbp->nbfp);
+
+        cudaTextureDesc td;
+        memset(&td, 0, sizeof(td));
+        td.readMode                 = cudaReadModeElementType;
+        stat = cudaCreateTextureObject(&nbp->nbfp_texobj, &rd, &td, NULL);
+        CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_texobj failed");
+    }
+    else
  #endif
-        {
-            cudaChannelFormatDesc cd = cudaCreateChannelDesc<float>();
-            stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_texref(),
-                                   nbp->nbfp, &cd, nnbfp*sizeof(*nbp->nbfp));
-            CU_RET_ERR(stat, "cudaBindTexture on nbfp_texref failed");
-        }
+    {
+        cudaChannelFormatDesc cd = cudaCreateChannelDesc<float>();
+        stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_texref(),
+                               nbp->nbfp, &cd, nnbfp*sizeof(*nbp->nbfp));
+        CU_RET_ERR(stat, "cudaBindTexture on nbfp_texref failed");
+    }
  }
  
  /*! Re-generate the GPU Ewald force table, resets rlist, and update the
   *  electrostatic type switching to twin cut-off (or back) if needed. */
-void nbnxn_cuda_pme_loadbal_update_param(nbnxn_cuda_ptr_t cu_nb,
+void nbnxn_cuda_pme_loadbal_update_param(nbnxn_cuda_ptr_t           cu_nb,
                                           const interaction_const_t *ic)
  {
      cu_nbparam_t *nbp = cu_nb->nbparam;
@@ -379,7 +379,7 @@ static void init_plist(cu_plist_t *pl)
  static void init_timers(cu_timers_t *t, bool bUseTwoStreams)
  {
      cudaError_t stat;
-    int eventflags = ( bUseCudaEventBlockingSync ? cudaEventBlockingSync: cudaEventDefault );
+    int         eventflags = ( bUseCudaEventBlockingSync ? cudaEventBlockingSync : cudaEventDefault );
  
      stat = cudaEventCreateWithFlags(&(t->start_atdat), eventflags);
      CU_RET_ERR(stat, "cudaEventCreate on start_atdat failed");
@@ -419,12 +419,12 @@ static void init_timings(wallclock_gpu_t *t)
  
      t->nb_h2d_t = 0.0;
      t->nb_d2h_t = 0.0;
-    t->nb_c    = 0;
+    t->nb_c     = 0;
      t->pl_h2d_t = 0.0;
      t->pl_h2d_c = 0;
      for (i = 0; i < 2; i++)
      {
-        for(j = 0; j < 2; j++)
+        for (j = 0; j < 2; j++)
          {
              t->ktime[i][j].t = 0.0;
              t->ktime[i][j].c = 0;
@@ -432,22 +432,25 @@ static void init_timings(wallclock_gpu_t *t)
      }
  }
  
-void nbnxn_cuda_init(FILE *fplog,
-                     nbnxn_cuda_ptr_t *p_cu_nb,
+void nbnxn_cuda_init(FILE                 *fplog,
+                     nbnxn_cuda_ptr_t     *p_cu_nb,
                       const gmx_gpu_info_t *gpu_info,
-                     const gmx_gpu_opt_t *gpu_opt,
-                     int my_gpu_index,
-                     gmx_bool bLocalAndNonlocal)
+                     const gmx_gpu_opt_t  *gpu_opt,
+                     int                   my_gpu_index,
+                     gmx_bool              bLocalAndNonlocal)
  {
-    cudaError_t stat;
+    cudaError_t       stat;
      nbnxn_cuda_ptr_t  nb;
-    char sbuf[STRLEN];
-    bool bStreamSync, bNoStreamSync, bTMPIAtomics, bX86, bOldDriver;
-    int cuda_drv_ver;
+    char              sbuf[STRLEN];
+    bool              bStreamSync, bNoStreamSync, bTMPIAtomics, bX86, bOldDriver;
+    int               cuda_drv_ver;
  
      assert(gpu_info);
  
-    if (p_cu_nb == NULL) return;
+    if (p_cu_nb == NULL)
+    {
+        return;
+    }
  
      snew(nb, 1);
      snew(nb->atdat, 1);
@@ -516,7 +519,7 @@ void nbnxn_cuda_init(FILE *fplog,
       * With polling wait event-timing also needs to be disabled.
       *
       * The overhead is greatly reduced in API v5.0 drivers and the improvement
-     $ is independent of runtime version. Hence, with API v5.0 drivers and later
+     * is independent of runtime version. Hence, with API v5.0 drivers and later
       * we won't switch to polling.
       *
       * NOTE: Unfortunately, this is known to fail when GPUs are shared by (t)MPI,
@@ -592,8 +595,8 @@ void nbnxn_cuda_init(FILE *fplog,
                          "      However, the polling wait workaround can not be used because\n%s\n"
                          "      Consider updating the driver or turning ECC off.",
                          (bX86 && bTMPIAtomics) ?
-                            "      GPU(s) are being oversubscribed." :
-                            "      atomic operations are not supported by the platform/CPU+compiler.");
+                        "      GPU(s) are being oversubscribed." :
+                        "      atomic operations are not supported by the platform/CPU+compiler.");
                  md_print_warn(fplog, sbuf);
              }
          }
@@ -651,15 +654,15 @@ void nbnxn_cuda_init_const(nbnxn_cuda_ptr_t                cu_nb,
      nbnxn_cuda_clear_e_fshift(cu_nb);
  }
  
-void nbnxn_cuda_init_pairlist(nbnxn_cuda_ptr_t cu_nb,
+void nbnxn_cuda_init_pairlist(nbnxn_cuda_ptr_t        cu_nb,
                                const nbnxn_pairlist_t *h_plist,
-                              int iloc)
+                              int                     iloc)
  {
-    char         sbuf[STRLEN];
-    cudaError_t  stat;
-    bool         bDoTime    = cu_nb->bDoTime;
-    cudaStream_t stream     = cu_nb->stream[iloc];
-    cu_plist_t   *d_plist   = cu_nb->plist[iloc];
+    char          sbuf[STRLEN];
+    cudaError_t   stat;
+    bool          bDoTime    = cu_nb->bDoTime;
+    cudaStream_t  stream     = cu_nb->stream[iloc];
+    cu_plist_t   *d_plist    = cu_nb->plist[iloc];
  
      if (d_plist->na_c < 0)
      {
@@ -682,19 +685,19 @@ void nbnxn_cuda_init_pairlist(nbnxn_cuda_ptr_t cu_nb,
      }
  
      cu_realloc_buffered((void **)&d_plist->sci, h_plist->sci, sizeof(*d_plist->sci),
-                         &d_plist->nsci, &d_plist->sci_nalloc,
-                         h_plist->nsci,
-                         stream, true);
+                        &d_plist->nsci, &d_plist->sci_nalloc,
+                        h_plist->nsci,
+                        stream, true);
  
      cu_realloc_buffered((void **)&d_plist->cj4, h_plist->cj4, sizeof(*d_plist->cj4),
-                         &d_plist->ncj4, &d_plist->cj4_nalloc,
-                         h_plist->ncj4,
-                         stream, true);
+                        &d_plist->ncj4, &d_plist->cj4_nalloc,
+                        h_plist->ncj4,
+                        stream, true);
  
      cu_realloc_buffered((void **)&d_plist->excl, h_plist->excl, sizeof(*d_plist->excl),
-                         &d_plist->nexcl, &d_plist->excl_nalloc,
-                         h_plist->nexcl,
-                         stream, true);
+                        &d_plist->nexcl, &d_plist->excl_nalloc,
+                        h_plist->nexcl,
+                        stream, true);
  
      if (bDoTime)
      {
@@ -706,16 +709,16 @@ void nbnxn_cuda_init_pairlist(nbnxn_cuda_ptr_t cu_nb,
      d_plist->bDoPrune = true;
  }
  
-void nbnxn_cuda_upload_shiftvec(nbnxn_cuda_ptr_t cu_nb,
+void nbnxn_cuda_upload_shiftvec(nbnxn_cuda_ptr_t        cu_nb,
                                  const nbnxn_atomdata_t *nbatom)
  {
-    cu_atomdata_t *adat = cu_nb->atdat;
-    cudaStream_t  ls    = cu_nb->stream[eintLocal];
+    cu_atomdata_t *adat  = cu_nb->atdat;
+    cudaStream_t   ls    = cu_nb->stream[eintLocal];
  
      /* only if we have a dynamic box */
      if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
      {
-        cu_copy_H2D_async(adat->shift_vec, nbatom->shift_vec, 
+        cu_copy_H2D_async(adat->shift_vec, nbatom->shift_vec,
                            SHIFTS * sizeof(*adat->shift_vec), ls);
          adat->bShiftVecUploaded = true;
      }
@@ -724,9 +727,9 @@ void nbnxn_cuda_upload_shiftvec(nbnxn_cuda_ptr_t cu_nb,
  /*! Clears the first natoms_clear elements of the GPU nonbonded force output array. */
  static void nbnxn_cuda_clear_f(nbnxn_cuda_ptr_t cu_nb, int natoms_clear)
  {
-    cudaError_t   stat;
-    cu_atomdata_t *adat = cu_nb->atdat;
-    cudaStream_t  ls    = cu_nb->stream[eintLocal];
+    cudaError_t    stat;
+    cu_atomdata_t *adat  = cu_nb->atdat;
+    cudaStream_t   ls    = cu_nb->stream[eintLocal];
  
      stat = cudaMemsetAsync(adat->f, 0, natoms_clear * sizeof(*adat->f), ls);
      CU_RET_ERR(stat, "cudaMemsetAsync on f falied");
@@ -735,9 +738,9 @@ static void nbnxn_cuda_clear_f(nbnxn_cuda_ptr_t cu_nb, int natoms_clear)
  /*! Clears nonbonded shift force output array and energy outputs on the GPU. */
  static void nbnxn_cuda_clear_e_fshift(nbnxn_cuda_ptr_t cu_nb)
  {
-    cudaError_t   stat;
-    cu_atomdata_t *adat = cu_nb->atdat;
-    cudaStream_t  ls    = cu_nb->stream[eintLocal];
+    cudaError_t    stat;
+    cu_atomdata_t *adat  = cu_nb->atdat;
+    cudaStream_t   ls    = cu_nb->stream[eintLocal];
  
      stat = cudaMemsetAsync(adat->fshift, 0, SHIFTS * sizeof(*adat->fshift), ls);
      CU_RET_ERR(stat, "cudaMemsetAsync on fshift falied");
@@ -750,7 +753,7 @@ static void nbnxn_cuda_clear_e_fshift(nbnxn_cuda_ptr_t cu_nb)
  void nbnxn_cuda_clear_outputs(nbnxn_cuda_ptr_t cu_nb, int flags)
  {
      nbnxn_cuda_clear_f(cu_nb, cu_nb->atdat->natoms);
-    /* clear shift force array and energies if the outputs were 
+    /* clear shift force array and energies if the outputs were
         used in the current step */
      if (flags & GMX_FORCE_VIRIAL)
      {
@@ -758,18 +761,18 @@ void nbnxn_cuda_clear_outputs(nbnxn_cuda_ptr_t cu_nb, int flags)
      }
  }
  
-void nbnxn_cuda_init_atomdata(nbnxn_cuda_ptr_t cu_nb,
+void nbnxn_cuda_init_atomdata(nbnxn_cuda_ptr_t        cu_nb,
                                const nbnxn_atomdata_t *nbat)
  {
-    cudaError_t   stat;
-    int           nalloc, natoms;
-    bool          realloced;
-    bool          bDoTime   = cu_nb->bDoTime;
-    cu_timers_t   *timers   = cu_nb->timers;
-    cu_atomdata_t *d_atdat  = cu_nb->atdat;
-    cudaStream_t  ls        = cu_nb->stream[eintLocal];
-
-    natoms = nbat->natoms;
+    cudaError_t    stat;
+    int            nalloc, natoms;
+    bool           realloced;
+    bool           bDoTime   = cu_nb->bDoTime;
+    cu_timers_t   *timers    = cu_nb->timers;
+    cu_atomdata_t *d_atdat   = cu_nb->atdat;
+    cudaStream_t   ls        = cu_nb->stream[eintLocal];
+
+    natoms    = nbat->natoms;
      realloced = false;
  
      if (bDoTime)
@@ -802,10 +805,10 @@ void nbnxn_cuda_init_atomdata(nbnxn_cuda_ptr_t cu_nb,
          CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->atom_types");
  
          d_atdat->nalloc = nalloc;
-        realloced = true;
+        realloced       = true;
      }
  
-    d_atdat->natoms = natoms;
+    d_atdat->natoms       = natoms;
      d_atdat->natoms_local = nbat->natoms_local;
  
      /* need to clear GPU f output if realloc happened */
@@ -826,13 +829,16 @@ void nbnxn_cuda_init_atomdata(nbnxn_cuda_ptr_t cu_nb,
  
  void nbnxn_cuda_free(nbnxn_cuda_ptr_t cu_nb)
  {
-    cudaError_t     stat;
+    cudaError_t      stat;
      cu_atomdata_t   *atdat;
      cu_nbparam_t    *nbparam;
      cu_plist_t      *plist, *plist_nl;
      cu_timers_t     *timers;
  
-    if (cu_nb == NULL) return;
+    if (cu_nb == NULL)
+    {
+        return;
+    }
  
      atdat       = cu_nb->atdat;
      nbparam     = cu_nb->nbparam;
@@ -957,7 +963,7 @@ void nbnxn_cuda_free(nbnxn_cuda_ptr_t cu_nb)
  
  void cu_synchstream_atdat(nbnxn_cuda_ptr_t cu_nb, int iloc)
  {
-    cudaError_t stat;
+    cudaError_t  stat;
      cudaStream_t stream = cu_nb->stream[iloc];
  
      stat = cudaStreamWaitEvent(stream, cu_nb->timers->stop_atdat, 0);
@@ -980,7 +986,7 @@ void nbnxn_cuda_reset_timings(nbnxn_cuda_ptr_t cu_nb)
  int nbnxn_cuda_min_ci_balanced(nbnxn_cuda_ptr_t cu_nb)
  {
      return cu_nb != NULL ?
-        gpu_min_ci_balanced_factor*cu_nb->dev_info->prop.multiProcessorCount : 0;
+           gpu_min_ci_balanced_factor*cu_nb->dev_info->prop.multiProcessorCount : 0;
  
  }
  
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh

index 366ae8c1880e3ecfd856d234ac3c5e070f3512d7..33214acd45380c919da473f7eb266d8698b63896 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
@@ -60,7 +60,7 @@
      Each thread calculates an i force-component taking one pair of i-j atoms.
   */
  #if __CUDA_ARCH__ >= 350
-__launch_bounds__(64,16)
+__launch_bounds__(64, 16)
  #endif
  #ifdef PRUNE_NBL
  #ifdef CALC_ENERGIES
@@ -75,27 +75,27 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _ener)
  __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
  #endif
  #endif
-            (const cu_atomdata_t atdat,
-             const cu_nbparam_t nbparam,
-             const cu_plist_t plist,
-             bool bCalcFshift)
+(const cu_atomdata_t atdat,
+ const cu_nbparam_t nbparam,
+ const cu_plist_t plist,
+ bool bCalcFshift)
  {
      /* convenience variables */
-    const nbnxn_sci_t *pl_sci   = plist.sci;
+    const nbnxn_sci_t *pl_sci       = plist.sci;
  #ifndef PRUNE_NBL
      const
  #endif
-    nbnxn_cj4_t *pl_cj4         = plist.cj4;
-    const nbnxn_excl_t *excl    = plist.excl;
-    const int *atom_types       = atdat.atom_types;
-    int ntypes                  = atdat.ntypes;
-    const float4 *xq            = atdat.xq;
-    float3 *f                   = atdat.f;
-    const float3 *shift_vec     = atdat.shift_vec;
-    float rcoulomb_sq           = nbparam.rcoulomb_sq;
+    nbnxn_cj4_t        *pl_cj4      = plist.cj4;
+    const nbnxn_excl_t *excl        = plist.excl;
+    const int          *atom_types  = atdat.atom_types;
+    int                 ntypes      = atdat.ntypes;
+    const float4       *xq          = atdat.xq;
+    float3             *f           = atdat.f;
+    const float3       *shift_vec   = atdat.shift_vec;
+    float               rcoulomb_sq = nbparam.rcoulomb_sq;
  #ifdef VDW_CUTOFF_CHECK
-    float rvdw_sq               = nbparam.rvdw_sq;
-    float vdw_in_range;
+    float               rvdw_sq     = nbparam.rvdw_sq;
+    float               vdw_in_range;
  #endif
  #ifdef EL_RF
      float two_k_rf              = nbparam.two_k_rf;
@@ -112,12 +112,12 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
  #endif
  
  #ifdef CALC_ENERGIES
-    float lj_shift    = nbparam.sh_invrc6;
+    float  lj_shift    = nbparam.sh_invrc6;
  #ifdef EL_EWALD_ANY
-    float beta        = nbparam.ewald_beta;
-    float ewald_shift = nbparam.sh_ewald;
+    float  beta        = nbparam.ewald_beta;
+    float  ewald_shift = nbparam.sh_ewald;
  #else
-    float c_rf        = nbparam.c_rf;
+    float  c_rf        = nbparam.c_rf;
  #endif
      float *e_lj       = atdat.e_lj;
      float *e_el       = atdat.e_el;
@@ -130,24 +130,24 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
      unsigned int bidx   = blockIdx.x;
      unsigned int widx   = tidx / WARP_SIZE; /* warp index */
  
-    int sci, ci, cj, ci_offset,
-        ai, aj,
-        cij4_start, cij4_end,
-        typei, typej,
-        i, jm, j4, wexcl_idx;
-    float qi, qj_f,
-          r2, inv_r, inv_r2, inv_r6,
-          c6, c12,
-          int_bit,
+    int          sci, ci, cj, ci_offset,
+                 ai, aj,
+                 cij4_start, cij4_end,
+                 typei, typej,
+                 i, jm, j4, wexcl_idx;
+    float        qi, qj_f,
+                 r2, inv_r, inv_r2, inv_r6,
+                 c6, c12,
+                 int_bit,
+                 F_invr;
  #ifdef CALC_ENERGIES
-          E_lj, E_el, E_lj_p,
+    float        E_lj, E_el, E_lj_p;
  #endif
-          F_invr;
      unsigned int wexcl, imask, mask_ji;
-    float4 xqbuf;
-    float3 xi, xj, rv, f_ij, fcj_buf, fshift_buf;
-    float3 fci_buf[NCL_PER_SUPERCL];    /* i force buffer */
-    nbnxn_sci_t nb_sci;
+    float4       xqbuf;
+    float3       xi, xj, rv, f_ij, fcj_buf, fshift_buf;
+    float3       fci_buf[NCL_PER_SUPERCL]; /* i force buffer */
+    nbnxn_sci_t  nb_sci;
  
      /* shmem buffer for i x+q pre-loading */
      extern __shared__  float4 xqib[];
@@ -185,7 +185,7 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
  #endif
      __syncthreads();
  
-    for(ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
+    for (ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
      {
          fci_buf[ci_offset] = make_float3(0.0f);
      }
@@ -268,11 +268,11 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
  #if !defined PRUNE_NBL && !(CUDA_VERSION < 4010 && (defined EL_EWALD_ANY || defined EL_RF))
  #pragma unroll 8
  #endif
-                    for(i = 0; i < NCL_PER_SUPERCL; i++)
+                    for (i = 0; i < NCL_PER_SUPERCL; i++)
                      {
                          if (imask & mask_ji)
                          {
-                            ci_offset   = i;    /* i force buffer offset */
+                            ci_offset   = i;                     /* i force buffer offset */
  
                              ci      = sci * NCL_PER_SUPERCL + i; /* i cluster index */
                              ai      = ci * CL_SIZE + tidxi;      /* i atom index */
@@ -320,7 +320,7 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
  #else
                                  c6      = tex1Dfetch(nbfp_texref, 2 * (ntypes * typei + typej));
                                  c12     = tex1Dfetch(nbfp_texref, 2 * (ntypes * typei + typej) + 1);
-#endif /* USE_TEXOBJ */
+#endif                          /* USE_TEXOBJ */
  
  
                                  /* avoid NaN for excluded pairs at r=0 */
@@ -344,7 +344,7 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
  #ifdef VDW_CUTOFF_CHECK
                                  /* this enables twin-range cut-offs (rvdw < rcoulomb <= rlist) */
                                  vdw_in_range = (r2 < rvdw_sq) ? 1.0f : 0.0f;
-                                F_invr  *= vdw_in_range;
+                                F_invr      *= vdw_in_range;
  #ifdef CALC_ENERGIES
                                  E_lj_p  *= vdw_in_range;
  #endif
@@ -382,7 +382,7 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
  #ifdef EL_EWALD_ANY
                                  /* 1.0f - erff is faster than erfcf */
                                  E_el    += qi * qj_f * (inv_r * (int_bit - erff(r2 * inv_r * beta)) - int_bit * ewald_shift);
-#endif /* EL_EWALD_ANY */
+#endif                          /* EL_EWALD_ANY */
  #endif
                                  f_ij    = rv * F_invr;
  
@@ -420,7 +420,7 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
      }
  
      /* reduce i forces */
-    for(ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
+    for (ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
      {
          ai  = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
  #ifdef REDUCE_SHUFFLE
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh

index 66f16c430e1ef74bbea980c20efd779a50aa428b..bf7c8cfacfc8393cfb57618b54abd904b00129e9 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
@@ -58,12 +58,12 @@ static inline __device__
  float interpolate_coulomb_force_r(float r, float scale)
  {
      float   normalized = scale * r;
-    int     index = (int) normalized;
-    float   fract2 = normalized - index;
-    float   fract1 = 1.0f - fract2;
+    int     index      = (int) normalized;
+    float   fract2     = normalized - index;
+    float   fract1     = 1.0f - fract2;
  
-    return  fract1 * tex1Dfetch(coulomb_tab_texref, index)
-            + fract2 * tex1Dfetch(coulomb_tab_texref, index + 1);
+    return fract1 * tex1Dfetch(coulomb_tab_texref, index)
+           + fract2 * tex1Dfetch(coulomb_tab_texref, index + 1);
  }
  
  #ifdef TEXOBJ_SUPPORTED
@@ -72,12 +72,12 @@ float interpolate_coulomb_force_r(cudaTextureObject_t texobj_coulomb_tab,
                                    float r, float scale)
  {
      float   normalized = scale * r;
-    int     index = (int) normalized;
-    float   fract2 = normalized - index;
-    float   fract1 = 1.0f - fract2;
+    int     index      = (int) normalized;
+    float   fract2     = normalized - index;
+    float   fract1     = 1.0f - fract2;
  
-    return  fract1 * tex1Dfetch<float>(texobj_coulomb_tab, index) +
-            fract2 * tex1Dfetch<float>(texobj_coulomb_tab, index + 1);
+    return fract1 * tex1Dfetch<float>(texobj_coulomb_tab, index) +
+           fract2 * tex1Dfetch<float>(texobj_coulomb_tab, index + 1);
  }
  #endif
  
@@ -101,7 +101,7 @@ float pmecorrF(float z2)
      const float FD0 = 1.0f;
  
      float       z4;
-    float       polyFN0,polyFN1,polyFD0,polyFD1;
+    float       polyFN0, polyFN1, polyFD0, polyFD1;
  
      z4          = z2*z2;
  
@@ -344,16 +344,16 @@ void reduce_energy_warp_shfl(float E_lj, float E_el,
  #pragma unroll 5
      for (i = 0; i < 5; i++)
      {
-        E_lj += __shfl_down(E_lj,sh);
-        E_el += __shfl_down(E_el,sh);
-        sh += sh;
+        E_lj += __shfl_down(E_lj, sh);
+        E_el += __shfl_down(E_el, sh);
+        sh   += sh;
      }
  
      /* The first thread in the warp writes the reduced energies */
      if (tidx == 0 || tidx == WARP_SIZE)
      {
-        atomicAdd(e_lj,E_lj);
-        atomicAdd(e_el,E_el);
+        atomicAdd(e_lj, E_lj);
+        atomicAdd(e_el, E_el);
      }
  }
  #endif /* __CUDA_ARCH__ */
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh

index 608f826b77da2699e9820da4d29f430d80ee8b19..45640b17c1d7d8a32d184b0339733622d182c7e1 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh
@@ -46,14 +46,14 @@
  
  /* Analytical plain cut-off kernels */
  #define EL_CUTOFF
-#define NB_KERNEL_FUNC_NAME(x,...) x##_cutoff##__VA_ARGS__
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _cutoff ## __VA_ARGS__
  #include "nbnxn_cuda_kernel.cuh"
  #undef EL_CUTOFF
  #undef NB_KERNEL_FUNC_NAME
  
  /* Analytical reaction-field kernels */
  #define EL_RF
-#define NB_KERNEL_FUNC_NAME(x,...) x##_rf##__VA_ARGS__
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _rf ## __VA_ARGS__
  #include "nbnxn_cuda_kernel.cuh"
  #undef EL_RF
  #undef NB_KERNEL_FUNC_NAME
@@ -61,7 +61,7 @@
  /* Analytical Ewald interaction kernels
   */
  #define EL_EWALD_ANA
-#define NB_KERNEL_FUNC_NAME(x,...) x##_ewald##__VA_ARGS__
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ewald ## __VA_ARGS__
  #include "nbnxn_cuda_kernel.cuh"
  #undef EL_EWALD_ANA
  #undef NB_KERNEL_FUNC_NAME
@@ -70,7 +70,7 @@
   */
  #define EL_EWALD_ANA
  #define VDW_CUTOFF_CHECK
-#define NB_KERNEL_FUNC_NAME(x,...) x##_ewald_twin##__VA_ARGS__
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ewald_twin ## __VA_ARGS__
  #include "nbnxn_cuda_kernel.cuh"
  #undef EL_EWALD_ANA
  #undef VDW_CUTOFF_CHECK
@@ -78,7 +78,7 @@
  
  /* Tabulated Ewald interaction kernels */
  #define EL_EWALD_TAB
-#define NB_KERNEL_FUNC_NAME(x,...) x##_ewald_tab##__VA_ARGS__
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ewald_tab ## __VA_ARGS__
  #include "nbnxn_cuda_kernel.cuh"
  #undef EL_EWALD_TAB
  #undef NB_KERNEL_FUNC_NAME
@@ -86,7 +86,7 @@
  /* Tabulated Ewald interaction kernels with twin-range cut-off */
  #define EL_EWALD_TAB
  #define VDW_CUTOFF_CHECK
-#define NB_KERNEL_FUNC_NAME(x,...) x##_ewald_tab_twin##__VA_ARGS__
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ewald_tab_twin ## __VA_ARGS__
  #include "nbnxn_cuda_kernel.cuh"
  #undef EL_EWALD_TAB
  #undef VDW_CUTOFF_CHECK
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h

index ac9bea9dd2a9f7b14892dd2000b2540a0782ec42..f8d7a8d076195210c35b30c408bcfd22a80ae972 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
@@ -126,13 +126,13 @@ struct cu_nbparam
      float    sh_invrc6;      /**< LJ potential correction term                      */
  
      /* Non-bonded parameters - accessed through texture memory */
-    float            *nbfp;          /**< nonbonded parameter table with C6/C12 pairs  */
+    float              *nbfp;        /**< nonbonded parameter table with C6/C12 pairs  */
      cudaTextureObject_t nbfp_texobj; /**< texture object bound to nbfp                 */
  
      /* Ewald Coulomb force table data - accessed through texture memory */
-    int               coulomb_tab_size;      /**< table size (s.t. it fits in texture cache) */
-    float             coulomb_tab_scale;     /**< table scale/spacing                        */
-    float            *coulomb_tab;           /**< pointer to the table in the device memory  */
+    int                  coulomb_tab_size;   /**< table size (s.t. it fits in texture cache) */
+    float                coulomb_tab_scale;  /**< table scale/spacing                        */
+    float               *coulomb_tab;        /**< pointer to the table in the device memory  */
      cudaTextureObject_t  coulomb_tab_texobj; /**< texture object bound to coulomb_tab        */
  };
  
diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h

index 1740ccac8b7787c174410965b67f69c988bd228c..8ba1726f2e44769901b907183600f0445b864d5f 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h
@@ -159,7 +159,7 @@ gmx_sum_simd2(gmx_mm_pr x, real* b)
      return b[0]+b[1];
  }
  
-#if GMX_SIMD_WIDTH_HERE>=4
+#if GMX_SIMD_WIDTH_HERE >= 4
  static gmx_inline real
  gmx_sum_simd4(gmx_mm_pr4 x, real* b)
  {
diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_mic.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_mic.h

index 0c02fe72a8dc537e953c28ea75b371e4244a13e7..c54c6ae79a819309931060f52ba327dac74fa1bf 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_mic.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_mic.h
@@ -176,7 +176,7 @@ prepare_table_load_buffer(const int *array)
     and low/high half after each other, then simply doing a gather for tab_coul_F and tab_coul_F+1.
     The ording of the 16 elements doesn't matter, so it doesn't help to get FD sorted as odd/even
     instead of low/high.
-*/
+ */
  static gmx_inline void
  load_table_f(const real *tab_coul_F, gmx_epi32 ti_S, int *ti,
               gmx_mm_ps *ctab0_S, gmx_mm_ps *ctab1_S)
diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_inner.h b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_inner.h

index 1422295d5e6ea1304156783ae9dfff3d99f6d77a..bbff9ed3681729a90ff1267606618a6f8ab76470 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_inner.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_inner.h
@@ -701,7 +701,7 @@
  #else
      fscal_S0    = gmx_mul_pr(rinvsq_S0,
                               (
-                                        gmx_sub_pr(FrLJ12_S0, FrLJ6_S0)));
+                                 gmx_sub_pr(FrLJ12_S0, FrLJ6_S0)));
  #endif
  #else
      fscal_S0    = gmx_mul_pr(rinvsq_S0, frcoul_S0);
@@ -714,7 +714,7 @@
  #else
      fscal_S2    = gmx_mul_pr(rinvsq_S2,
                               (
-                                        gmx_sub_pr(FrLJ12_S2, FrLJ6_S2)));
+                                 gmx_sub_pr(FrLJ12_S2, FrLJ6_S2)));
  #endif
  #else
      /* Atom 2 and 3 don't have LJ, so only add Coulomb forces */
diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h

index 6da784bc8cc3f72db0c05723d3d4a643b609ae59..0817d76793ce90f4c4ddf8891208bd5ae590af45 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h
@@ -374,7 +374,7 @@
  #endif
  #endif
  #else /* EXCL_FORCES */
-    /* No exclusion forces: remove all excluded atom pairs from the list */
+      /* No exclusion forces: remove all excluded atom pairs from the list */
      wco_S0      = gmx_and_pb(wco_S0, interact_S0);
      wco_S1      = gmx_and_pb(wco_S1, interact_S1);
      wco_S2      = gmx_and_pb(wco_S2, interact_S2);
@@ -906,7 +906,7 @@
  #else
      fscal_S0    = gmx_mul_pr(rinvsq_S0,
                               (
-                                        gmx_sub_pr(FrLJ12_S0, FrLJ6_S0)));
+                                 gmx_sub_pr(FrLJ12_S0, FrLJ6_S0)));
  #endif
  #ifdef CALC_COULOMB
      fscal_S1    = gmx_mul_pr(rinvsq_S1,
@@ -915,7 +915,7 @@
  #else
      fscal_S1    = gmx_mul_pr(rinvsq_S1,
                               (
-                                        gmx_sub_pr(FrLJ12_S1, FrLJ6_S1)));
+                                 gmx_sub_pr(FrLJ12_S1, FrLJ6_S1)));
  #endif
  #else
      fscal_S0    = gmx_mul_pr(rinvsq_S0, frcoul_S0);
@@ -929,7 +929,7 @@
  #else
      fscal_S2    = gmx_mul_pr(rinvsq_S2,
                               (
-                                        gmx_sub_pr(FrLJ12_S2, FrLJ6_S2)));
+                                 gmx_sub_pr(FrLJ12_S2, FrLJ6_S2)));
  #endif
  #ifdef CALC_COULOMB
      fscal_S3    = gmx_mul_pr(rinvsq_S3,
@@ -938,7 +938,7 @@
  #else
      fscal_S3    = gmx_mul_pr(rinvsq_S3,
                               (
-                                        gmx_sub_pr(FrLJ12_S3, FrLJ6_S3)));
+                                 gmx_sub_pr(FrLJ12_S3, FrLJ6_S3)));
  #endif
  #else
      /* Atom 2 and 3 don't have LJ, so only add Coulomb forces */
diff --git a/src/gromacs/mdlib/ns.c b/src/gromacs/mdlib/ns.c

index 5d0e997afe1c95e7773a6078bb209bcda1f1f817..f776001169b312ecdd338545bc1be4dafe995bdc 100644 (file)
--- a/src/gromacs/mdlib/ns.c
+++ b/src/gromacs/mdlib/ns.c
@@ -405,9 +405,9 @@ static inline void new_i_nblist(t_nblist *nlist, atom_id i_atom, int shift, int
      else
      {
          /* Adding to previous list. First remove possible previous padding */
-        if(nlist->simd_padding_width>1)
+        if (nlist->simd_padding_width > 1)
          {
-            while(nlist->nrj>0 && nlist->jjnr[nlist->nrj-1]<0)
+            while (nlist->nrj > 0 && nlist->jjnr[nlist->nrj-1] < 0)
              {
                  nlist->nrj--;
              }
diff --git a/src/gromacs/mdlib/pme_simd4.h b/src/gromacs/mdlib/pme_simd4.h

index d432a82a5b9cca54d3e3b3e5a91c2da8a62a36b7..1b6f0b0314ef5c3cd746d241a027635ae5c1e054 100644 (file)
--- a/src/gromacs/mdlib/pme_simd4.h
+++ b/src/gromacs/mdlib/pme_simd4.h
@@ -117,14 +117,14 @@
      for (ithx = 0; (ithx < 4); ithx++)
      {
          index_x  = (i0+ithx)*pny*pnz;
-        tx_S   = gmx_simd4_set1_pr(thx[ithx]);
-        dx_S   = gmx_simd4_set1_pr(dthx[ithx]);
+        tx_S     = gmx_simd4_set1_pr(thx[ithx]);
+        dx_S     = gmx_simd4_set1_pr(dthx[ithx]);
  
          for (ithy = 0; (ithy < 4); ithy++)
          {
              index_xy = index_x+(j0+ithy)*pnz;
-            ty_S   = gmx_simd4_set1_pr(thy[ithy]);
-            dy_S   = gmx_simd4_set1_pr(dthy[ithy]);
+            ty_S     = gmx_simd4_set1_pr(thy[ithy]);
+            dy_S     = gmx_simd4_set1_pr(dthy[ithy]);
  
              gval_S = gmx_simd4_loadu_pr(grid+index_xy+k0);
  
@@ -155,8 +155,8 @@
   * This code supports pme_order <= 5.
   */
  {
-    int    offset;
-    int    index;
+    int          offset;
+    int          index;
      gmx_simd4_pr ty_S0, ty_S1, ty_S2, ty_S3, ty_S4;
      gmx_simd4_pr tz_S0;
      gmx_simd4_pr tz_S1;
@@ -263,7 +263,7 @@
   * This code supports pme_order <= 5.
   */
  {
-    int    offset;
+    int          offset;
  
      real         fx_tmp[4], fy_tmp[4], fz_tmp[4];
  
@@ -316,14 +316,14 @@
      for (ithx = 0; (ithx < PME_ORDER); ithx++)
      {
          index_x  = (i0+ithx)*pny*pnz;
-        tx_S   = gmx_simd4_set1_pr(thx[ithx]);
-        dx_S   = gmx_simd4_set1_pr(dthx[ithx]);
+        tx_S     = gmx_simd4_set1_pr(thx[ithx]);
+        dx_S     = gmx_simd4_set1_pr(dthx[ithx]);
  
          for (ithy = 0; (ithy < PME_ORDER); ithy++)
          {
              index_xy = index_x+(j0+ithy)*pnz;
-            ty_S   = gmx_simd4_set1_pr(thy[ithy]);
-            dy_S   = gmx_simd4_set1_pr(dthy[ithy]);
+            ty_S     = gmx_simd4_set1_pr(thy[ithy]);
+            dy_S     = gmx_simd4_set1_pr(dthy[ithy]);
  
              gval_S0 = gmx_simd4_load_pr(grid+index_xy+k0-offset);
              gval_S1 = gmx_simd4_load_pr(grid+index_xy+k0-offset+4);
diff --git a/src/gromacs/mdlib/qm_orca.c b/src/gromacs/mdlib/qm_orca.c

index 0d5535a2d19a522876aa490d6e8e00c5a0c28f5d..f9f87ce69c01aeb3f16fc85d77af5e50a3d0f9b2 100644 (file)
--- a/src/gromacs/mdlib/qm_orca.c
+++ b/src/gromacs/mdlib/qm_orca.c
@@ -108,10 +108,10 @@ void init_orca(t_QMrec *qm)
  
  void write_orca_input(t_forcerec *fr, t_QMrec *qm, t_MMrec *mm)
  {
-    int i;
+    int        i;
      t_QMMMrec *QMMMrec;
-    FILE *out, *pcFile, *addInputFile, *LJCoeff;
-    char *buf, *orcaInput, *addInputFilename, *LJCoeffFilename, *pcFilename, *exclInName, *exclOutName;
+    FILE      *out, *pcFile, *addInputFile, *LJCoeff;
+    char      *buf, *orcaInput, *addInputFilename, *LJCoeffFilename, *pcFilename, *exclInName, *exclOutName;
  
      QMMMrec = fr->qr;
  
diff --git a/src/gromacs/simd/macros.h b/src/gromacs/simd/macros.h

index f1f848ba15a1ad8e7e54c447ae269bd24ac81a03..ca1e499a6142f2737c7e8f612d1d0929a9cc849a 100644 (file)
--- a/src/gromacs/simd/macros.h
+++ b/src/gromacs/simd/macros.h
@@ -757,10 +757,10 @@ static gmx_inline gmx_mm_pr gmx_always_inline gmx_atan2_pr(gmx_mm_pr a, gmx_mm_p
  
  static gmx_inline gmx_mm_pr gmx_always_inline gmx_erfc_pr(gmx_mm_pr a)
  {
-  /* The BG/Q qpxmath.h vector math library intended for use with
-     bgclang does not have erfc, so we need to use a function from
-     mass_simd.h. If this changes, then the #include <mass_simd.h> can
-     become conditional. */ 
+    /* The BG/Q qpxmath.h vector math library intended for use with
+       bgclang does not have erfc, so we need to use a function from
+       mass_simd.h. If this changes, then the #include <mass_simd.h> can
+       become conditional. */
  #ifndef GMX_DOUBLE
      return erfcf4(a);
  #else
diff --git a/src/gromacs/simd/math_single.h b/src/gromacs/simd/math_single.h

index dc42e0ca0c18e2efd9ed5c0cc19d7d195664e33f..377855c549b9bbca892a5399539a2575ef52c427 100644 (file)
--- a/src/gromacs/simd/math_single.h
+++ b/src/gromacs/simd/math_single.h
@@ -56,7 +56,7 @@ gmx_invsqrt_pr(gmx_mm_pr x)
      const gmx_mm_pr three = gmx_set1_pr(3.0);
  
      gmx_mm_pr       lu = gmx_rsqrt_pr(x);
-    
+
      return gmx_mul_pr(half, gmx_mul_pr(gmx_sub_pr(three, gmx_mul_pr(gmx_mul_pr(lu, lu), x)), lu));
  #endif
  }
diff --git a/src/gromacs/tools/dump.c b/src/gromacs/tools/dump.c

index 17aee79cf01f72c0ae2bf5a96c7629186c5c0409..452f2c5e3995d52414b4379b52a333b53b4eebc8 100644 (file)
--- a/src/gromacs/tools/dump.c
+++ b/src/gromacs/tools/dump.c
@@ -319,15 +319,15 @@ void list_xtc(const char *fn)
  
  /*! \brief Callback used by list_tng_for_gmx_dump. */
  static void list_tng_inner(const char *fn,
-                    gmx_bool bFirstFrame,
-                    real *values,
-                    gmx_int64_t step, 
-                    double frame_time,
-                    gmx_int64_t n_values_per_frame,
-                    gmx_int64_t n_atoms,
-                    real prec,
-                    gmx_int64_t nframe,
-                    char *block_name)
+                           gmx_bool    bFirstFrame,
+                           real       *values,
+                           gmx_int64_t step,
+                           double      frame_time,
+                           gmx_int64_t n_values_per_frame,
+                           gmx_int64_t n_atoms,
+                           real        prec,
+                           gmx_int64_t nframe,
+                           char       *block_name)
  {
      char                 buf[256];
      int                  indent = 0;
diff --git a/src/gromacs/utility/init.cpp b/src/gromacs/utility/init.cpp

index 03569a6d99282813f1fe6d862187c3f8726bcc9d..2413ca5b7d98b9051df62cce8293837a0bc581be 100644 (file)
--- a/src/gromacs/utility/init.cpp
+++ b/src/gromacs/utility/init.cpp
@@ -63,8 +63,8 @@ namespace gmx
  namespace
  {
  #ifdef GMX_LIB_MPI
-    //! Maintains global counter of attempts to initialize MPI
-    int g_initializationCounter = 0;
+//! Maintains global counter of attempts to initialize MPI
+int g_initializationCounter = 0;
  #endif
  }
  
diff --git a/src/programs/mdrun/tests/compressed_x_output.cpp b/src/programs/mdrun/tests/compressed_x_output.cpp

index 2b1204996845862a6a7423e747b6511359a66583..bfaf6b8cf5b82084f422f1074db1c9e5adb09c7e 100644 (file)
--- a/src/programs/mdrun/tests/compressed_x_output.cpp
+++ b/src/programs/mdrun/tests/compressed_x_output.cpp
@@ -52,7 +52,7 @@ namespace
  
  //! Test fixture for mdrun -x
  class CompressedXOutputTest : public gmx::test::MdrunTestFixture,
-                      public testing::WithParamInterface<const char*>
+                              public testing::WithParamInterface<const char*>
  {
  };
  
diff --git a/src/programs/mdrun/tests/rerun.cpp b/src/programs/mdrun/tests/rerun.cpp

index 8ba5d6b559fbc47cd72165e797abaa995a28b4ac..d1df11f8a38d85a27e9302288b5677be96990095 100644 (file)
--- a/src/programs/mdrun/tests/rerun.cpp
+++ b/src/programs/mdrun/tests/rerun.cpp
@@ -92,7 +92,7 @@ const char *trajectoryFileNames[] = {
  
  INSTANTIATE_TEST_CASE_P(NoFatalErrorFrom,
                          MdrunRerun,
-                        ::testing::ValuesIn(gmx::ArrayRef<const char*>(trajectoryFileNames)));
+                            ::testing::ValuesIn(gmx::ArrayRef<const char*>(trajectoryFileNames)));
  
  /*! \todo Add other tests for mdrun -rerun, e.g.
   *
author	Roland Schulz <roland@utk.edu>
	Sat, 18 Jan 2014 04:47:30 +0000 (23:47 -0500)
committer	Gerrit Code Review <gerrit@gerrit.gromacs.org>
	Wed, 29 Jan 2014 19:19:43 +0000 (20:19 +0100)
.gitattributes		patch \| blob \| history
admin/reformat_all.sh	[new file with mode: 0755]	patch \| blob
src/gromacs/fileio/gmxfio_int.h		patch \| blob \| history
src/gromacs/fileio/mdoutf.c		patch \| blob \| history
src/gromacs/fileio/timecontrol.c		patch \| blob \| history
src/gromacs/fileio/tngio.cpp		patch \| blob \| history
src/gromacs/fileio/tngio_for_tools.cpp		patch \| blob \| history
src/gromacs/gmxana/cmat.c		patch \| blob \| history
src/gromacs/gmxlib/cuda_tools/copyrite_gpu.cu		patch \| blob \| history
src/gromacs/gmxlib/cuda_tools/cudautils.cu		patch \| blob \| history
src/gromacs/gmxlib/cuda_tools/cudautils.cuh		patch \| blob \| history
src/gromacs/gmxlib/cuda_tools/pmalloc_cuda.cu		patch \| blob \| history
src/gromacs/gmxlib/cuda_tools/vectype_ops.cuh		patch \| blob \| history
src/gromacs/gmxlib/gmx_detect_hardware.c		patch \| blob \| history
src/gromacs/gmxlib/gmx_fatal.c		patch \| blob \| history
src/gromacs/gmxlib/gpu_utils/gpu_utils.cu		patch \| blob \| history
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/kernelutil_sparc64_hpc_ace_double.h		patch \| blob \| history
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/make_nb_kernel_sparc64_hpc_ace_double.py		patch \| blob \| history
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.c		patch \| blob \| history
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.h		patch \| blob \| history
src/gromacs/gmxpreprocess/gen_vsite.c		patch \| blob \| history
src/gromacs/gmxpreprocess/readpull.c		patch \| blob \| history
src/gromacs/gmxpreprocess/toppush.c		patch \| blob \| history
src/gromacs/legacyheaders/thread_mpi/atomic/gcc_x86.h		patch \| blob \| history
src/gromacs/legacyheaders/thread_mpi/atomic/xlc_ppc.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu		patch \| blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu		patch \| blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh		patch \| blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh		patch \| blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh		patch \| blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_mic.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_inner.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h		patch \| blob \| history
src/gromacs/mdlib/ns.c		patch \| blob \| history
src/gromacs/mdlib/pme_simd4.h		patch \| blob \| history
src/gromacs/mdlib/qm_orca.c		patch \| blob \| history
src/gromacs/simd/macros.h		patch \| blob \| history
src/gromacs/simd/math_single.h		patch \| blob \| history
src/gromacs/tools/dump.c		patch \| blob \| history
src/gromacs/utility/init.cpp		patch \| blob \| history
src/programs/mdrun/tests/compressed_x_output.cpp		patch \| blob \| history
src/programs/mdrun/tests/rerun.cpp		patch \| blob \| history