SIMD acceleration for RB dihedrals

[alexxy/gromacs.git] / src / gromacs / gmxlib / bondfree.c
diff --git a/src/gromacs/gmxlib/bondfree.c b/src/gromacs/gmxlib/bondfree.c

index 1aa794714c8c429139034e29112875be291bc72c..0eb8d55791b983c0b5909786b7dffdb1c5ef462a 100644 (file)
--- a/src/gromacs/gmxlib/bondfree.c
+++ b/src/gromacs/gmxlib/bondfree.c
@@ -1,49 +1,51 @@
-/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
- *
- *
- *                This source code is part of
- *
- *                 G   R   O   M   A   C   S
- *
- *          GROningen MAchine for Chemical Simulations
+/*
+ * This file is part of the GROMACS molecular simulation package.
   *
- *                        VERSION 3.2.0
- * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2004, The GROMACS development team,
- * check out http://www.gromacs.org for more information.
-
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
   * of the License, or (at your option) any later version.
   *
- * If you want to redistribute modifications, please consider that
- * scientific software is very special. Version control is crucial -
- * bugs must be traceable. We will be happy to consider code for
- * inclusion in the official distribution, but derived work must not
- * be called official GROMACS. Details are found in the README & COPYING
- * files - if they are missing, get the official version at www.gromacs.org.
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
   *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the papers on the package - you can find them in the top README file.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
   *
- * For more info, check our website at http://www.gromacs.org
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
   *
- * And Hey:
- * GROningen Mixture of Alchemy and Childrens' Stories
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
   */
  #ifdef HAVE_CONFIG_H
  #include <config.h>
  #endif
  
  #include <math.h>
+#include <assert.h>
  #include "physics.h"
  #include "vec.h"
-#include "maths.h"
+#include "gromacs/math/utilities.h"
  #include "txtdump.h"
  #include "bondf.h"
-#include "smalloc.h"
+#include "gromacs/utility/smalloc.h"
  #include "pbc.h"
  #include "ns.h"
  #include "macros.h"
@@ -55,12 +57,11 @@
  #include "orires.h"
  #include "force.h"
  #include "nonbonded.h"
+#include "restcbt.h"
  
-#ifdef GMX_X86_SSE2
-#define SIMD_BONDEDS
-
-#include "gmx_simd_macros.h"
-#endif
+#include "gromacs/simd/simd.h"
+#include "gromacs/simd/simd_math.h"
+#include "gromacs/simd/vector_operations.h"
  
  /* Find a better place for this? */
  const int cmap_coeff_matrix[] = {
@@ -113,67 +114,19 @@ static int pbc_rvec_sub(const t_pbc *pbc, const rvec xi, const rvec xj, rvec dx)
      }
  }
  
-#ifdef SIMD_BONDEDS
-
-/* Below are 3 SIMD vector operations.
- * Currently these are only used here, but they should be moved to
- * a general SIMD include file when used elsewhere.
- */
-
-/* SIMD inner-product of multiple vectors */
-static gmx_inline gmx_mm_pr
-gmx_iprod_pr(gmx_mm_pr ax, gmx_mm_pr ay, gmx_mm_pr az,
-             gmx_mm_pr bx, gmx_mm_pr by, gmx_mm_pr bz)
-{
-    gmx_mm_pr ret;
-
-    ret = gmx_mul_pr(ax, bx);
-    ret = gmx_madd_pr(ay, by, ret);
-    ret = gmx_madd_pr(az, bz, ret);
-
-    return ret;
-}
-
-/* SIMD norm squared of multiple vectors */
-static gmx_inline gmx_mm_pr
-gmx_norm2_pr(gmx_mm_pr ax, gmx_mm_pr ay, gmx_mm_pr az)
-{
-    gmx_mm_pr ret;
-
-    ret = gmx_mul_pr(ax, ax);
-    ret = gmx_madd_pr(ay, ay, ret);
-    ret = gmx_madd_pr(az, az, ret);
-
-    return ret;
-}
-
-/* SIMD cross-product of multiple vectors */
-static gmx_inline void
-gmx_cprod_pr(gmx_mm_pr ax, gmx_mm_pr ay, gmx_mm_pr az,
-             gmx_mm_pr bx, gmx_mm_pr by, gmx_mm_pr bz,
-             gmx_mm_pr *cx, gmx_mm_pr *cy, gmx_mm_pr *cz)
-{
-    *cx = gmx_mul_pr(ay, bz);
-    *cx = gmx_nmsub_pr(az, by, *cx);
-
-    *cy = gmx_mul_pr(az, bx);
-    *cy = gmx_nmsub_pr(ax, bz, *cy);
-
-    *cz = gmx_mul_pr(ax, by);
-    *cz = gmx_nmsub_pr(ay, bx, *cz);
-}
+#ifdef GMX_SIMD_HAVE_REAL
  
  /* SIMD PBC data structure, containing 1/boxdiag and the box vectors */
  typedef struct {
-    gmx_mm_pr inv_bzz;
-    gmx_mm_pr inv_byy;
-    gmx_mm_pr inv_bxx;
-    gmx_mm_pr bzx;
-    gmx_mm_pr bzy;
-    gmx_mm_pr bzz;
-    gmx_mm_pr byx;
-    gmx_mm_pr byy;
-    gmx_mm_pr bxx;
+    gmx_simd_real_t inv_bzz;
+    gmx_simd_real_t inv_byy;
+    gmx_simd_real_t inv_bxx;
+    gmx_simd_real_t bzx;
+    gmx_simd_real_t bzy;
+    gmx_simd_real_t bzz;
+    gmx_simd_real_t byx;
+    gmx_simd_real_t byy;
+    gmx_simd_real_t bxx;
  } pbc_simd_t;
  
  /* Set the SIMD pbc data from a normal t_pbc struct */
@@ -192,51 +145,51 @@ static void set_pbc_simd(const t_pbc *pbc, pbc_simd_t *pbc_simd)
          }
      }
  
-    pbc_simd->inv_bzz = gmx_set1_pr(inv_bdiag[ZZ]);
-    pbc_simd->inv_byy = gmx_set1_pr(inv_bdiag[YY]);
-    pbc_simd->inv_bxx = gmx_set1_pr(inv_bdiag[XX]);
+    pbc_simd->inv_bzz = gmx_simd_set1_r(inv_bdiag[ZZ]);
+    pbc_simd->inv_byy = gmx_simd_set1_r(inv_bdiag[YY]);
+    pbc_simd->inv_bxx = gmx_simd_set1_r(inv_bdiag[XX]);
  
      if (pbc != NULL)
      {
-        pbc_simd->bzx = gmx_set1_pr(pbc->box[ZZ][XX]);
-        pbc_simd->bzy = gmx_set1_pr(pbc->box[ZZ][YY]);
-        pbc_simd->bzz = gmx_set1_pr(pbc->box[ZZ][ZZ]);
-        pbc_simd->byx = gmx_set1_pr(pbc->box[YY][XX]);
-        pbc_simd->byy = gmx_set1_pr(pbc->box[YY][YY]);
-        pbc_simd->bxx = gmx_set1_pr(pbc->box[XX][XX]);
+        pbc_simd->bzx = gmx_simd_set1_r(pbc->box[ZZ][XX]);
+        pbc_simd->bzy = gmx_simd_set1_r(pbc->box[ZZ][YY]);
+        pbc_simd->bzz = gmx_simd_set1_r(pbc->box[ZZ][ZZ]);
+        pbc_simd->byx = gmx_simd_set1_r(pbc->box[YY][XX]);
+        pbc_simd->byy = gmx_simd_set1_r(pbc->box[YY][YY]);
+        pbc_simd->bxx = gmx_simd_set1_r(pbc->box[XX][XX]);
      }
      else
      {
-        pbc_simd->bzx = gmx_setzero_pr();
-        pbc_simd->bzy = gmx_setzero_pr();
-        pbc_simd->bzz = gmx_setzero_pr();
-        pbc_simd->byx = gmx_setzero_pr();
-        pbc_simd->byy = gmx_setzero_pr();
-        pbc_simd->bxx = gmx_setzero_pr();
+        pbc_simd->bzx = gmx_simd_setzero_r();
+        pbc_simd->bzy = gmx_simd_setzero_r();
+        pbc_simd->bzz = gmx_simd_setzero_r();
+        pbc_simd->byx = gmx_simd_setzero_r();
+        pbc_simd->byy = gmx_simd_setzero_r();
+        pbc_simd->bxx = gmx_simd_setzero_r();
      }
  }
  
  /* Correct distance vector *dx,*dy,*dz for PBC using SIMD */
  static gmx_inline void
-pbc_dx_simd(gmx_mm_pr *dx, gmx_mm_pr *dy, gmx_mm_pr *dz,
+pbc_dx_simd(gmx_simd_real_t *dx, gmx_simd_real_t *dy, gmx_simd_real_t *dz,
              const pbc_simd_t *pbc)
  {
-    gmx_mm_pr sh;
+    gmx_simd_real_t sh;
  
-    sh  = gmx_round_pr(gmx_mul_pr(*dz, pbc->inv_bzz));
-    *dx = gmx_nmsub_pr(sh, pbc->bzx, *dx);
-    *dy = gmx_nmsub_pr(sh, pbc->bzy, *dy);
-    *dz = gmx_nmsub_pr(sh, pbc->bzz, *dz);
+    sh  = gmx_simd_round_r(gmx_simd_mul_r(*dz, pbc->inv_bzz));
+    *dx = gmx_simd_fnmadd_r(sh, pbc->bzx, *dx);
+    *dy = gmx_simd_fnmadd_r(sh, pbc->bzy, *dy);
+    *dz = gmx_simd_fnmadd_r(sh, pbc->bzz, *dz);
  
-    sh  = gmx_round_pr(gmx_mul_pr(*dy, pbc->inv_byy));
-    *dx = gmx_nmsub_pr(sh, pbc->byx, *dx);
-    *dy = gmx_nmsub_pr(sh, pbc->byy, *dy);
+    sh  = gmx_simd_round_r(gmx_simd_mul_r(*dy, pbc->inv_byy));
+    *dx = gmx_simd_fnmadd_r(sh, pbc->byx, *dx);
+    *dy = gmx_simd_fnmadd_r(sh, pbc->byy, *dy);
  
-    sh  = gmx_round_pr(gmx_mul_pr(*dx, pbc->inv_bxx));
-    *dx = gmx_nmsub_pr(sh, pbc->bxx, *dx);
+    sh  = gmx_simd_round_r(gmx_simd_mul_r(*dx, pbc->inv_bxx));
+    *dx = gmx_simd_fnmadd_r(sh, pbc->bxx, *dx);
  }
  
-#endif /* SIMD_BONDEDS */
+#endif /* GMX_SIMD_HAVE_REAL */
  
  /*
   * Morse potential bond by Frank Everdij
@@ -256,8 +209,8 @@ real morse_bonds(int nbonds,
                   const rvec x[], rvec f[], rvec fshift[],
                   const t_pbc *pbc, const t_graph *g,
                   real lambda, real *dvdlambda,
-                 const t_mdatoms *md, t_fcdata *fcd,
-                 int *global_atom_index)
+                 const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+                 int gmx_unused *global_atom_index)
  {
      const real one = 1.0;
      const real two = 2.0;
@@ -329,9 +282,9 @@ real cubic_bonds(int nbonds,
                   const t_iatom forceatoms[], const t_iparams forceparams[],
                   const rvec x[], rvec f[], rvec fshift[],
                   const t_pbc *pbc, const t_graph *g,
-                 real lambda, real *dvdlambda,
-                 const t_mdatoms *md, t_fcdata *fcd,
-                 int *global_atom_index)
+                 real gmx_unused lambda, real gmx_unused *dvdlambda,
+                 const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+                 int gmx_unused *global_atom_index)
  {
      const real three = 3.0;
      const real two   = 2.0;
@@ -391,8 +344,8 @@ real FENE_bonds(int nbonds,
                  const t_iatom forceatoms[], const t_iparams forceparams[],
                  const rvec x[], rvec f[], rvec fshift[],
                  const t_pbc *pbc, const t_graph *g,
-                real lambda, real *dvdlambda,
-                const t_mdatoms *md, t_fcdata *fcd,
+                real gmx_unused lambda, real gmx_unused *dvdlambda,
+                const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
                  int *global_atom_index)
  {
      const real half = 0.5;
@@ -488,8 +441,8 @@ real bonds(int nbonds,
             const rvec x[], rvec f[], rvec fshift[],
             const t_pbc *pbc, const t_graph *g,
             real lambda, real *dvdlambda,
-           const t_mdatoms *md, t_fcdata *fcd,
-           int *global_atom_index)
+           const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+           int gmx_unused *global_atom_index)
  {
      int  i, m, ki, ai, aj, type;
      real dr, dr2, fbond, vbond, fij, vtot;
@@ -550,8 +503,8 @@ real restraint_bonds(int nbonds,
                       const rvec x[], rvec f[], rvec fshift[],
                       const t_pbc *pbc, const t_graph *g,
                       real lambda, real *dvdlambda,
-                     const t_mdatoms *md, t_fcdata *fcd,
-                     int *global_atom_index)
+                     const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+                     int gmx_unused *global_atom_index)
  {
      int  i, m, ki, ai, aj, type;
      real dr, dr2, fbond, vbond, fij, vtot;
@@ -652,8 +605,8 @@ real polarize(int nbonds,
                const rvec x[], rvec f[], rvec fshift[],
                const t_pbc *pbc, const t_graph *g,
                real lambda, real *dvdlambda,
-              const t_mdatoms *md, t_fcdata *fcd,
-              int *global_atom_index)
+              const t_mdatoms *md, t_fcdata gmx_unused *fcd,
+              int gmx_unused *global_atom_index)
  {
      int  i, m, ki, ai, aj, type;
      real dr, dr2, fbond, vbond, fij, vtot, ksh;
@@ -708,8 +661,8 @@ real anharm_polarize(int nbonds,
                       const rvec x[], rvec f[], rvec fshift[],
                       const t_pbc *pbc, const t_graph *g,
                       real lambda, real *dvdlambda,
-                     const t_mdatoms *md, t_fcdata *fcd,
-                     int *global_atom_index)
+                     const t_mdatoms *md, t_fcdata gmx_unused *fcd,
+                     int gmx_unused *global_atom_index)
  {
      int  i, m, ki, ai, aj, type;
      real dr, dr2, fbond, vbond, fij, vtot, ksh, khyp, drcut, ddr, ddr3;
@@ -770,17 +723,18 @@ real anharm_polarize(int nbonds,
  
  real water_pol(int nbonds,
                 const t_iatom forceatoms[], const t_iparams forceparams[],
-               const rvec x[], rvec f[], rvec fshift[],
-               const t_pbc *pbc, const t_graph *g,
-               real lambda, real *dvdlambda,
-               const t_mdatoms *md, t_fcdata *fcd,
-               int *global_atom_index)
+               const rvec x[], rvec f[], rvec gmx_unused fshift[],
+               const t_pbc gmx_unused *pbc, const t_graph gmx_unused *g,
+               real gmx_unused lambda, real gmx_unused *dvdlambda,
+               const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+               int gmx_unused *global_atom_index)
  {
      /* This routine implements anisotropic polarizibility for water, through
       * a shell connected to a dummy with spring constant that differ in the
       * three spatial dimensions in the molecular frame.
       */
-    int  i, m, aO, aH1, aH2, aD, aS, type, type0;
+    int  i, m, aO, aH1, aH2, aD, aS, type, type0, ki;
+    ivec dt;
      rvec dOH1, dOH2, dHH, dOD, dDS, nW, kk, dx, kdx, proj;
  #ifdef DEBUG
      rvec df;
@@ -823,11 +777,11 @@ real water_pol(int nbonds,
              aS   = forceatoms[i+5];
  
              /* Compute vectors describing the water frame */
-            rvec_sub(x[aH1], x[aO], dOH1);
-            rvec_sub(x[aH2], x[aO], dOH2);
-            rvec_sub(x[aH2], x[aH1], dHH);
-            rvec_sub(x[aD], x[aO], dOD);
-            rvec_sub(x[aS], x[aD], dDS);
+            pbc_rvec_sub(pbc, x[aH1], x[aO], dOH1);
+            pbc_rvec_sub(pbc, x[aH2], x[aO], dOH2);
+            pbc_rvec_sub(pbc, x[aH2], x[aH1], dHH);
+            pbc_rvec_sub(pbc, x[aD], x[aO], dOD);
+            ki = pbc_rvec_sub(pbc, x[aS], x[aD], dDS);
              cprod(dOH1, dOH2, nW);
  
              /* Compute inverse length of normal vector
@@ -882,6 +836,13 @@ real water_pol(int nbonds,
              kdx[YY] = kk[YY]*dx[YY];
              kdx[ZZ] = kk[ZZ]*dx[ZZ];
              vtot   += iprod(dx, kdx);
+
+            if (g)
+            {
+                ivec_sub(SHIFT_IVEC(g, aS), SHIFT_IVEC(g, aD), dt);
+                ki = IVEC2IS(dt);
+            }
+
              for (m = 0; (m < DIM); m++)
              {
                  /* This is a tensor operation but written out for speed */
@@ -892,8 +853,10 @@ real water_pol(int nbonds,
  #ifdef DEBUG
                  df[m] = fij;
  #endif
-                f[aS][m] += fij;
-                f[aD][m] -= fij;
+                f[aS][m]           += fij;
+                f[aD][m]           -= fij;
+                fshift[ki][m]      += fij;
+                fshift[CENTRAL][m] -= fij;
              }
  #ifdef DEBUG
              if (debug)
@@ -945,10 +908,10 @@ static real do_1_thole(const rvec xi, const rvec xj, rvec fi, rvec fj,
  real thole_pol(int nbonds,
                 const t_iatom forceatoms[], const t_iparams forceparams[],
                 const rvec x[], rvec f[], rvec fshift[],
-               const t_pbc *pbc, const t_graph *g,
-               real lambda, real *dvdlambda,
-               const t_mdatoms *md, t_fcdata *fcd,
-               int *global_atom_index)
+               const t_pbc *pbc, const t_graph gmx_unused *g,
+               real gmx_unused lambda, real gmx_unused *dvdlambda,
+               const t_mdatoms *md, t_fcdata gmx_unused *fcd,
+               int gmx_unused *global_atom_index)
  {
      /* Interaction between two pairs of particles with opposite charge */
      int  i, type, a1, da1, a2, da2;
@@ -1000,8 +963,8 @@ real angles(int nbonds,
              const rvec x[], rvec f[], rvec fshift[],
              const t_pbc *pbc, const t_graph *g,
              real lambda, real *dvdlambda,
-            const t_mdatoms *md, t_fcdata *fcd,
-            int *global_atom_index)
+            const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+            int gmx_unused *global_atom_index)
  {
      int  i, ai, aj, ak, t1, t2, type;
      rvec r_ij, r_kj;
@@ -1082,7 +1045,7 @@ real angles(int nbonds,
      return vtot;
  }
  
-#ifdef SIMD_BONDEDS
+#ifdef GMX_SIMD_HAVE_REAL
  
  /* As angles, but using SIMD to calculate many dihedrals at once.
   * This routines does not calculate energies and shift forces.
@@ -1091,58 +1054,62 @@ static gmx_inline void
  angles_noener_simd(int nbonds,
                     const t_iatom forceatoms[], const t_iparams forceparams[],
                     const rvec x[], rvec f[],
-                   const t_pbc *pbc, const t_graph *g,
-                   real lambda,
-                   const t_mdatoms *md, t_fcdata *fcd,
-                   int *global_atom_index)
+                   const t_pbc *pbc, const t_graph gmx_unused *g,
+                   real gmx_unused lambda,
+                   const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+                   int gmx_unused *global_atom_index)
  {
-#define UNROLL GMX_SIMD_WIDTH_HERE
-    const int      nfa1 = 4;
-    int            i, iu, s, m;
-    int            type, ai[UNROLL], aj[UNROLL], ak[UNROLL];
-    real           coeff_array[2*UNROLL+UNROLL], *coeff;
-    real           dr_array[2*DIM*UNROLL+UNROLL], *dr;
-    real           f_buf_array[6*UNROLL+UNROLL], *f_buf;
-    gmx_mm_pr      k_S, theta0_S;
-    gmx_mm_pr      rijx_S, rijy_S, rijz_S;
-    gmx_mm_pr      rkjx_S, rkjy_S, rkjz_S;
-    gmx_mm_pr      one_S;
-    gmx_mm_pr      rij_rkj_S;
-    gmx_mm_pr      nrij2_S, nrij_1_S;
-    gmx_mm_pr      nrkj2_S, nrkj_1_S;
-    gmx_mm_pr      cos_S, sin_S;
-    gmx_mm_pr      theta_S;
-    gmx_mm_pr      st_S, sth_S;
-    gmx_mm_pr      cik_S, cii_S, ckk_S;
-    gmx_mm_pr      f_ix_S, f_iy_S, f_iz_S;
-    gmx_mm_pr      f_kx_S, f_ky_S, f_kz_S;
-    pbc_simd_t     pbc_simd;
+    const int            nfa1 = 4;
+    int                  i, iu, s, m;
+    int                  type, ai[GMX_SIMD_REAL_WIDTH], aj[GMX_SIMD_REAL_WIDTH];
+    int                  ak[GMX_SIMD_REAL_WIDTH];
+    real                 coeff_array[2*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *coeff;
+    real                 dr_array[2*DIM*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *dr;
+    real                 f_buf_array[6*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *f_buf;
+    gmx_simd_real_t      k_S, theta0_S;
+    gmx_simd_real_t      rijx_S, rijy_S, rijz_S;
+    gmx_simd_real_t      rkjx_S, rkjy_S, rkjz_S;
+    gmx_simd_real_t      one_S;
+    gmx_simd_real_t      min_one_plus_eps_S;
+    gmx_simd_real_t      rij_rkj_S;
+    gmx_simd_real_t      nrij2_S, nrij_1_S;
+    gmx_simd_real_t      nrkj2_S, nrkj_1_S;
+    gmx_simd_real_t      cos_S, invsin_S;
+    gmx_simd_real_t      theta_S;
+    gmx_simd_real_t      st_S, sth_S;
+    gmx_simd_real_t      cik_S, cii_S, ckk_S;
+    gmx_simd_real_t      f_ix_S, f_iy_S, f_iz_S;
+    gmx_simd_real_t      f_kx_S, f_ky_S, f_kz_S;
+    pbc_simd_t           pbc_simd;
  
      /* Ensure register memory alignment */
-    coeff = gmx_simd_align_real(coeff_array);
-    dr    = gmx_simd_align_real(dr_array);
-    f_buf = gmx_simd_align_real(f_buf_array);
+    coeff = gmx_simd_align_r(coeff_array);
+    dr    = gmx_simd_align_r(dr_array);
+    f_buf = gmx_simd_align_r(f_buf_array);
  
-    set_pbc_simd(pbc,&pbc_simd);
+    set_pbc_simd(pbc, &pbc_simd);
+
+    one_S = gmx_simd_set1_r(1.0);
  
-    one_S = gmx_set1_pr(1.0);
+    /* The smallest number > -1 */
+    min_one_plus_eps_S = gmx_simd_set1_r(-1.0 + 2*GMX_REAL_EPS);
  
-    /* nbonds is the number of angles times nfa1, here we step UNROLL angles */
-    for (i = 0; (i < nbonds); i += UNROLL*nfa1)
+    /* nbonds is the number of angles times nfa1, here we step GMX_SIMD_REAL_WIDTH angles */
+    for (i = 0; (i < nbonds); i += GMX_SIMD_REAL_WIDTH*nfa1)
      {
-        /* Collect atoms for UNROLL angles.
+        /* Collect atoms for GMX_SIMD_REAL_WIDTH angles.
           * iu indexes into forceatoms, we should not let iu go beyond nbonds.
           */
          iu = i;
-        for (s = 0; s < UNROLL; s++)
+        for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++)
          {
              type  = forceatoms[iu];
              ai[s] = forceatoms[iu+1];
              aj[s] = forceatoms[iu+2];
              ak[s] = forceatoms[iu+3];
  
-            coeff[s]        = forceparams[type].harmonic.krA;
-            coeff[UNROLL+s] = forceparams[type].harmonic.rA*DEG2RAD;
+            coeff[s]                     = forceparams[type].harmonic.krA;
+            coeff[GMX_SIMD_REAL_WIDTH+s] = forceparams[type].harmonic.rA*DEG2RAD;
  
              /* If you can't use pbc_dx_simd below for PBC, e.g. because
               * you can't round in SIMD, use pbc_rvec_sub here.
@@ -1150,8 +1117,8 @@ angles_noener_simd(int nbonds,
              /* Store the non PBC corrected distances packed and aligned */
              for (m = 0; m < DIM; m++)
              {
-                dr[s +      m *UNROLL] = x[ai[s]][m] - x[aj[s]][m];
-                dr[s + (DIM+m)*UNROLL] = x[ak[s]][m] - x[aj[s]][m];
+                dr[s +      m *GMX_SIMD_REAL_WIDTH] = x[ai[s]][m] - x[aj[s]][m];
+                dr[s + (DIM+m)*GMX_SIMD_REAL_WIDTH] = x[ak[s]][m] - x[aj[s]][m];
              }
  
              /* At the end fill the arrays with identical entries */
@@ -1161,61 +1128,70 @@ angles_noener_simd(int nbonds,
              }
          }
  
-        k_S       = gmx_load_pr(coeff);
-        theta0_S  = gmx_load_pr(coeff+UNROLL);
+        k_S       = gmx_simd_load_r(coeff);
+        theta0_S  = gmx_simd_load_r(coeff+GMX_SIMD_REAL_WIDTH);
  
-        rijx_S    = gmx_load_pr(dr + 0*UNROLL);
-        rijy_S    = gmx_load_pr(dr + 1*UNROLL);
-        rijz_S    = gmx_load_pr(dr + 2*UNROLL);
-        rkjx_S    = gmx_load_pr(dr + 3*UNROLL);
-        rkjy_S    = gmx_load_pr(dr + 4*UNROLL);
-        rkjz_S    = gmx_load_pr(dr + 5*UNROLL);
+        rijx_S    = gmx_simd_load_r(dr + 0*GMX_SIMD_REAL_WIDTH);
+        rijy_S    = gmx_simd_load_r(dr + 1*GMX_SIMD_REAL_WIDTH);
+        rijz_S    = gmx_simd_load_r(dr + 2*GMX_SIMD_REAL_WIDTH);
+        rkjx_S    = gmx_simd_load_r(dr + 3*GMX_SIMD_REAL_WIDTH);
+        rkjy_S    = gmx_simd_load_r(dr + 4*GMX_SIMD_REAL_WIDTH);
+        rkjz_S    = gmx_simd_load_r(dr + 5*GMX_SIMD_REAL_WIDTH);
  
          pbc_dx_simd(&rijx_S, &rijy_S, &rijz_S, &pbc_simd);
          pbc_dx_simd(&rkjx_S, &rkjy_S, &rkjz_S, &pbc_simd);
  
-        rij_rkj_S = gmx_iprod_pr(rijx_S, rijy_S, rijz_S,
-                                 rkjx_S, rkjy_S, rkjz_S);
-
-        nrij2_S   = gmx_norm2_pr(rijx_S, rijy_S, rijz_S);
-        nrkj2_S   = gmx_norm2_pr(rkjx_S, rkjy_S, rkjz_S);
-
-        nrij_1_S  = gmx_invsqrt_pr(nrij2_S);
-        nrkj_1_S  = gmx_invsqrt_pr(nrkj2_S);
-
-        cos_S     = gmx_mul_pr(rij_rkj_S, gmx_mul_pr(nrij_1_S, nrkj_1_S));
-
-        theta_S   = gmx_acos_pr(cos_S);
-
-        sin_S     = gmx_invsqrt_pr(gmx_max_pr(gmx_sub_pr(one_S, gmx_mul_pr(cos_S, cos_S)),
-                                              gmx_setzero_pr()));
-        st_S      = gmx_mul_pr(gmx_mul_pr(k_S, gmx_sub_pr(theta0_S, theta_S)),
-                               sin_S);
-        sth_S     = gmx_mul_pr(st_S, cos_S);
-
-        cik_S     = gmx_mul_pr(st_S,  gmx_mul_pr(nrij_1_S, nrkj_1_S));
-        cii_S     = gmx_mul_pr(sth_S, gmx_mul_pr(nrij_1_S, nrij_1_S));
-        ckk_S     = gmx_mul_pr(sth_S, gmx_mul_pr(nrkj_1_S, nrkj_1_S));
-
-        f_ix_S    = gmx_mul_pr(cii_S, rijx_S);
-        f_ix_S    = gmx_nmsub_pr(cik_S, rkjx_S, f_ix_S);
-        f_iy_S    = gmx_mul_pr(cii_S, rijy_S);
-        f_iy_S    = gmx_nmsub_pr(cik_S, rkjy_S, f_iy_S);
-        f_iz_S    = gmx_mul_pr(cii_S, rijz_S);
-        f_iz_S    = gmx_nmsub_pr(cik_S, rkjz_S, f_iz_S);
-        f_kx_S    = gmx_mul_pr(ckk_S, rkjx_S);
-        f_kx_S    = gmx_nmsub_pr(cik_S, rijx_S, f_kx_S);
-        f_ky_S    = gmx_mul_pr(ckk_S, rkjy_S);
-        f_ky_S    = gmx_nmsub_pr(cik_S, rijy_S, f_ky_S);
-        f_kz_S    = gmx_mul_pr(ckk_S, rkjz_S);
-        f_kz_S    = gmx_nmsub_pr(cik_S, rijz_S, f_kz_S);
-
-        gmx_store_pr(f_buf + 0*UNROLL, f_ix_S);
-        gmx_store_pr(f_buf + 1*UNROLL, f_iy_S);
-        gmx_store_pr(f_buf + 2*UNROLL, f_iz_S);
-        gmx_store_pr(f_buf + 3*UNROLL, f_kx_S);
-        gmx_store_pr(f_buf + 4*UNROLL, f_ky_S);
-        gmx_store_pr(f_buf + 5*UNROLL, f_kz_S);
+        rij_rkj_S = gmx_simd_iprod_r(rijx_S, rijy_S, rijz_S,
+                                     rkjx_S, rkjy_S, rkjz_S);
+
+        nrij2_S   = gmx_simd_norm2_r(rijx_S, rijy_S, rijz_S);
+        nrkj2_S   = gmx_simd_norm2_r(rkjx_S, rkjy_S, rkjz_S);
+
+        nrij_1_S  = gmx_simd_invsqrt_r(nrij2_S);
+        nrkj_1_S  = gmx_simd_invsqrt_r(nrkj2_S);
+
+        cos_S     = gmx_simd_mul_r(rij_rkj_S, gmx_simd_mul_r(nrij_1_S, nrkj_1_S));
+
+        /* To allow for 180 degrees, we take the max of cos and -1 + 1bit,
+         * so we can safely get the 1/sin from 1/sqrt(1 - cos^2).
+         * This also ensures that rounding errors would cause the argument
+         * of gmx_simd_acos_r to be < -1.
+         * Note that we do not take precautions for cos(0)=1, so the outer
+         * atoms in an angle should not be on top of each other.
+         */
+        cos_S     = gmx_simd_max_r(cos_S, min_one_plus_eps_S);
+
+        theta_S   = gmx_simd_acos_r(cos_S);
+
+        invsin_S  = gmx_simd_invsqrt_r(gmx_simd_sub_r(one_S, gmx_simd_mul_r(cos_S, cos_S)));
+
+        st_S      = gmx_simd_mul_r(gmx_simd_mul_r(k_S, gmx_simd_sub_r(theta0_S, theta_S)),
+                                   invsin_S);
+        sth_S     = gmx_simd_mul_r(st_S, cos_S);
+
+        cik_S     = gmx_simd_mul_r(st_S,  gmx_simd_mul_r(nrij_1_S, nrkj_1_S));
+        cii_S     = gmx_simd_mul_r(sth_S, gmx_simd_mul_r(nrij_1_S, nrij_1_S));
+        ckk_S     = gmx_simd_mul_r(sth_S, gmx_simd_mul_r(nrkj_1_S, nrkj_1_S));
+
+        f_ix_S    = gmx_simd_mul_r(cii_S, rijx_S);
+        f_ix_S    = gmx_simd_fnmadd_r(cik_S, rkjx_S, f_ix_S);
+        f_iy_S    = gmx_simd_mul_r(cii_S, rijy_S);
+        f_iy_S    = gmx_simd_fnmadd_r(cik_S, rkjy_S, f_iy_S);
+        f_iz_S    = gmx_simd_mul_r(cii_S, rijz_S);
+        f_iz_S    = gmx_simd_fnmadd_r(cik_S, rkjz_S, f_iz_S);
+        f_kx_S    = gmx_simd_mul_r(ckk_S, rkjx_S);
+        f_kx_S    = gmx_simd_fnmadd_r(cik_S, rijx_S, f_kx_S);
+        f_ky_S    = gmx_simd_mul_r(ckk_S, rkjy_S);
+        f_ky_S    = gmx_simd_fnmadd_r(cik_S, rijy_S, f_ky_S);
+        f_kz_S    = gmx_simd_mul_r(ckk_S, rkjz_S);
+        f_kz_S    = gmx_simd_fnmadd_r(cik_S, rijz_S, f_kz_S);
+
+        gmx_simd_store_r(f_buf + 0*GMX_SIMD_REAL_WIDTH, f_ix_S);
+        gmx_simd_store_r(f_buf + 1*GMX_SIMD_REAL_WIDTH, f_iy_S);
+        gmx_simd_store_r(f_buf + 2*GMX_SIMD_REAL_WIDTH, f_iz_S);
+        gmx_simd_store_r(f_buf + 3*GMX_SIMD_REAL_WIDTH, f_kx_S);
+        gmx_simd_store_r(f_buf + 4*GMX_SIMD_REAL_WIDTH, f_ky_S);
+        gmx_simd_store_r(f_buf + 5*GMX_SIMD_REAL_WIDTH, f_kz_S);
  
          iu = i;
          s  = 0;
@@ -1223,27 +1199,26 @@ angles_noener_simd(int nbonds,
          {
              for (m = 0; m < DIM; m++)
              {
-                f[ai[s]][m] += f_buf[s + m*UNROLL];
-                f[aj[s]][m] -= f_buf[s + m*UNROLL] + f_buf[s + (DIM+m)*UNROLL];
-                f[ak[s]][m] += f_buf[s + (DIM+m)*UNROLL];
+                f[ai[s]][m] += f_buf[s + m*GMX_SIMD_REAL_WIDTH];
+                f[aj[s]][m] -= f_buf[s + m*GMX_SIMD_REAL_WIDTH] + f_buf[s + (DIM+m)*GMX_SIMD_REAL_WIDTH];
+                f[ak[s]][m] += f_buf[s + (DIM+m)*GMX_SIMD_REAL_WIDTH];
              }
              s++;
              iu += nfa1;
          }
-        while (s < UNROLL && iu < nbonds);
+        while (s < GMX_SIMD_REAL_WIDTH && iu < nbonds);
      }
-#undef UNROLL
  }
  
-#endif /* SIMD_BONDEDS */
+#endif /* GMX_SIMD_HAVE_REAL */
  
  real linear_angles(int nbonds,
                     const t_iatom forceatoms[], const t_iparams forceparams[],
                     const rvec x[], rvec f[], rvec fshift[],
                     const t_pbc *pbc, const t_graph *g,
                     real lambda, real *dvdlambda,
-                   const t_mdatoms *md, t_fcdata *fcd,
-                   int *global_atom_index)
+                   const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+                   int gmx_unused *global_atom_index)
  {
      int  i, m, ai, aj, ak, t1, t2, type;
      rvec f_i, f_j, f_k;
@@ -1312,8 +1287,8 @@ real urey_bradley(int nbonds,
                    const rvec x[], rvec f[], rvec fshift[],
                    const t_pbc *pbc, const t_graph *g,
                    real lambda, real *dvdlambda,
-                  const t_mdatoms *md, t_fcdata *fcd,
-                  int *global_atom_index)
+                  const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+                  int gmx_unused *global_atom_index)
  {
      int  i, m, ai, aj, ak, t1, t2, type, ki;
      rvec r_ij, r_kj, r_ik;
@@ -1426,9 +1401,9 @@ real quartic_angles(int nbonds,
                      const t_iatom forceatoms[], const t_iparams forceparams[],
                      const rvec x[], rvec f[], rvec fshift[],
                      const t_pbc *pbc, const t_graph *g,
-                    real lambda, real *dvdlambda,
-                    const t_mdatoms *md, t_fcdata *fcd,
-                    int *global_atom_index)
+                    real gmx_unused lambda, real gmx_unused *dvdlambda,
+                    const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+                    int gmx_unused *global_atom_index)
  {
      int  i, j, ai, aj, ak, t1, t2, type;
      rvec r_ij, r_kj;
@@ -1535,7 +1510,7 @@ real dih_angle(const rvec xi, const rvec xj, const rvec xk, const rvec xl,
  }
  
  
-#ifdef SIMD_BONDEDS
+#ifdef GMX_SIMD_HAVE_REAL
  
  /* As dih_angle above, but calculates 4 dihedral angles at once using SIMD,
   * also calculates the pre-factor required for the dihedral force update.
@@ -1546,116 +1521,125 @@ dih_angle_simd(const rvec *x,
                 const int *ai, const int *aj, const int *ak, const int *al,
                 const pbc_simd_t *pbc,
                 real *dr,
-               gmx_mm_pr *phi_S,
-               gmx_mm_pr *mx_S, gmx_mm_pr *my_S, gmx_mm_pr *mz_S,
-               gmx_mm_pr *nx_S, gmx_mm_pr *ny_S, gmx_mm_pr *nz_S,
-               gmx_mm_pr *nrkj_m2_S,
-               gmx_mm_pr *nrkj_n2_S,
+               gmx_simd_real_t *phi_S,
+               gmx_simd_real_t *mx_S, gmx_simd_real_t *my_S, gmx_simd_real_t *mz_S,
+               gmx_simd_real_t *nx_S, gmx_simd_real_t *ny_S, gmx_simd_real_t *nz_S,
+               gmx_simd_real_t *nrkj_m2_S,
+               gmx_simd_real_t *nrkj_n2_S,
                 real *p,
                 real *q)
  {
-#define UNROLL GMX_SIMD_WIDTH_HERE
-    int       s, m;
-    gmx_mm_pr rijx_S, rijy_S, rijz_S;
-    gmx_mm_pr rkjx_S, rkjy_S, rkjz_S;
-    gmx_mm_pr rklx_S, rkly_S, rklz_S;
-    gmx_mm_pr cx_S, cy_S, cz_S;
-    gmx_mm_pr cn_S;
-    gmx_mm_pr s_S;
-    gmx_mm_pr ipr_S;
-    gmx_mm_pr iprm_S, iprn_S;
-    gmx_mm_pr nrkj2_S, nrkj_1_S, nrkj_2_S, nrkj_S;
-    gmx_mm_pr p_S, q_S;
-    gmx_mm_pr fmin_S = gmx_set1_pr(GMX_FLOAT_MIN);
-    /* Using -0.0 should lead to only the sign bit being set */
-    gmx_mm_pr sign_mask_S = gmx_set1_pr(-0.0);
-
-    for (s = 0; s < UNROLL; s++)
+    int             s, m;
+    gmx_simd_real_t rijx_S, rijy_S, rijz_S;
+    gmx_simd_real_t rkjx_S, rkjy_S, rkjz_S;
+    gmx_simd_real_t rklx_S, rkly_S, rklz_S;
+    gmx_simd_real_t cx_S, cy_S, cz_S;
+    gmx_simd_real_t cn_S;
+    gmx_simd_real_t s_S;
+    gmx_simd_real_t ipr_S;
+    gmx_simd_real_t iprm_S, iprn_S;
+    gmx_simd_real_t nrkj2_S, nrkj_1_S, nrkj_2_S, nrkj_S;
+    gmx_simd_real_t toler_S;
+    gmx_simd_real_t p_S, q_S;
+    gmx_simd_real_t nrkj2_min_S;
+    gmx_simd_real_t real_eps_S;
+
+    /* Used to avoid division by zero.
+     * We take into acount that we multiply the result by real_eps_S.
+     */
+    nrkj2_min_S = gmx_simd_set1_r(GMX_REAL_MIN/(2*GMX_REAL_EPS));
+
+    /* The value of the last significant bit (GMX_REAL_EPS is half of that) */
+    real_eps_S  = gmx_simd_set1_r(2*GMX_REAL_EPS);
+
+    for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++)
      {
          /* If you can't use pbc_dx_simd below for PBC, e.g. because
           * you can't round in SIMD, use pbc_rvec_sub here.
           */
          for (m = 0; m < DIM; m++)
          {
-            dr[s + (0*DIM + m)*UNROLL] = x[ai[s]][m] - x[aj[s]][m];
-            dr[s + (1*DIM + m)*UNROLL] = x[ak[s]][m] - x[aj[s]][m];
-            dr[s + (2*DIM + m)*UNROLL] = x[ak[s]][m] - x[al[s]][m];
+            dr[s + (0*DIM + m)*GMX_SIMD_REAL_WIDTH] = x[ai[s]][m] - x[aj[s]][m];
+            dr[s + (1*DIM + m)*GMX_SIMD_REAL_WIDTH] = x[ak[s]][m] - x[aj[s]][m];
+            dr[s + (2*DIM + m)*GMX_SIMD_REAL_WIDTH] = x[ak[s]][m] - x[al[s]][m];
          }
      }
  
-    rijx_S = gmx_load_pr(dr + 0*UNROLL);
-    rijy_S = gmx_load_pr(dr + 1*UNROLL);
-    rijz_S = gmx_load_pr(dr + 2*UNROLL);
-    rkjx_S = gmx_load_pr(dr + 3*UNROLL);
-    rkjy_S = gmx_load_pr(dr + 4*UNROLL);
-    rkjz_S = gmx_load_pr(dr + 5*UNROLL);
-    rklx_S = gmx_load_pr(dr + 6*UNROLL);
-    rkly_S = gmx_load_pr(dr + 7*UNROLL);
-    rklz_S = gmx_load_pr(dr + 8*UNROLL);
+    rijx_S = gmx_simd_load_r(dr + 0*GMX_SIMD_REAL_WIDTH);
+    rijy_S = gmx_simd_load_r(dr + 1*GMX_SIMD_REAL_WIDTH);
+    rijz_S = gmx_simd_load_r(dr + 2*GMX_SIMD_REAL_WIDTH);
+    rkjx_S = gmx_simd_load_r(dr + 3*GMX_SIMD_REAL_WIDTH);
+    rkjy_S = gmx_simd_load_r(dr + 4*GMX_SIMD_REAL_WIDTH);
+    rkjz_S = gmx_simd_load_r(dr + 5*GMX_SIMD_REAL_WIDTH);
+    rklx_S = gmx_simd_load_r(dr + 6*GMX_SIMD_REAL_WIDTH);
+    rkly_S = gmx_simd_load_r(dr + 7*GMX_SIMD_REAL_WIDTH);
+    rklz_S = gmx_simd_load_r(dr + 8*GMX_SIMD_REAL_WIDTH);
  
      pbc_dx_simd(&rijx_S, &rijy_S, &rijz_S, pbc);
      pbc_dx_simd(&rkjx_S, &rkjy_S, &rkjz_S, pbc);
      pbc_dx_simd(&rklx_S, &rkly_S, &rklz_S, pbc);
  
-    gmx_cprod_pr(rijx_S, rijy_S, rijz_S,
-                 rkjx_S, rkjy_S, rkjz_S,
-                 mx_S, my_S, mz_S);
+    gmx_simd_cprod_r(rijx_S, rijy_S, rijz_S,
+                     rkjx_S, rkjy_S, rkjz_S,
+                     mx_S, my_S, mz_S);
  
-    gmx_cprod_pr(rkjx_S, rkjy_S, rkjz_S,
-                 rklx_S, rkly_S, rklz_S,
-                 nx_S, ny_S, nz_S);
+    gmx_simd_cprod_r(rkjx_S, rkjy_S, rkjz_S,
+                     rklx_S, rkly_S, rklz_S,
+                     nx_S, ny_S, nz_S);
  
-    gmx_cprod_pr(*mx_S, *my_S, *mz_S,
-                 *nx_S, *ny_S, *nz_S,
-                 &cx_S, &cy_S, &cz_S);
+    gmx_simd_cprod_r(*mx_S, *my_S, *mz_S,
+                     *nx_S, *ny_S, *nz_S,
+                     &cx_S, &cy_S, &cz_S);
  
-    cn_S       = gmx_sqrt_pr(gmx_norm2_pr(cx_S, cy_S, cz_S));
+    cn_S       = gmx_simd_sqrt_r(gmx_simd_norm2_r(cx_S, cy_S, cz_S));
  
-    s_S        = gmx_iprod_pr(*mx_S, *my_S, *mz_S, *nx_S, *ny_S, *nz_S);
+    s_S        = gmx_simd_iprod_r(*mx_S, *my_S, *mz_S, *nx_S, *ny_S, *nz_S);
  
      /* Determine the dihedral angle, the sign might need correction */
-    *phi_S     = gmx_atan2_pr(cn_S, s_S);
+    *phi_S     = gmx_simd_atan2_r(cn_S, s_S);
  
-    ipr_S      = gmx_iprod_pr(rijx_S, rijy_S, rijz_S,
-                              *nx_S, *ny_S, *nz_S);
+    ipr_S      = gmx_simd_iprod_r(rijx_S, rijy_S, rijz_S,
+                                  *nx_S, *ny_S, *nz_S);
  
-    iprm_S     = gmx_norm2_pr(*mx_S, *my_S, *mz_S);
-    iprn_S     = gmx_norm2_pr(*nx_S, *ny_S, *nz_S);
+    iprm_S     = gmx_simd_norm2_r(*mx_S, *my_S, *mz_S);
+    iprn_S     = gmx_simd_norm2_r(*nx_S, *ny_S, *nz_S);
  
-    nrkj2_S    = gmx_norm2_pr(rkjx_S, rkjy_S, rkjz_S);
+    nrkj2_S    = gmx_simd_norm2_r(rkjx_S, rkjy_S, rkjz_S);
  
      /* Avoid division by zero. When zero, the result is multiplied by 0
       * anyhow, so the 3 max below do not affect the final result.
       */
-    nrkj2_S    = gmx_max_pr(nrkj2_S, fmin_S);
-    nrkj_1_S   = gmx_invsqrt_pr(nrkj2_S);
-    nrkj_2_S   = gmx_mul_pr(nrkj_1_S, nrkj_1_S);
-    nrkj_S     = gmx_mul_pr(nrkj2_S, nrkj_1_S);
-
-    iprm_S     = gmx_max_pr(iprm_S, fmin_S);
-    iprn_S     = gmx_max_pr(iprn_S, fmin_S);
-    *nrkj_m2_S = gmx_mul_pr(nrkj_S, gmx_inv_pr(iprm_S));
-    *nrkj_n2_S = gmx_mul_pr(nrkj_S, gmx_inv_pr(iprn_S));
-
-    /* Set sign of the angle with the sign of ipr_S.
-     * Since phi is currently positive, we can use OR instead of XOR.
-     */
-    *phi_S     = gmx_or_pr(*phi_S, gmx_and_pr(ipr_S, sign_mask_S));
-
-    p_S        = gmx_iprod_pr(rijx_S, rijy_S, rijz_S,
-                              rkjx_S, rkjy_S, rkjz_S);
-    p_S        = gmx_mul_pr(p_S, nrkj_2_S);
+    nrkj2_S    = gmx_simd_max_r(nrkj2_S, nrkj2_min_S);
+    nrkj_1_S   = gmx_simd_invsqrt_r(nrkj2_S);
+    nrkj_2_S   = gmx_simd_mul_r(nrkj_1_S, nrkj_1_S);
+    nrkj_S     = gmx_simd_mul_r(nrkj2_S, nrkj_1_S);
  
-    q_S        = gmx_iprod_pr(rklx_S, rkly_S, rklz_S,
-                              rkjx_S, rkjy_S, rkjz_S);
-    q_S        = gmx_mul_pr(q_S, nrkj_2_S);
+    toler_S    = gmx_simd_mul_r(nrkj2_S, real_eps_S);
  
-    gmx_store_pr(p, p_S);
-    gmx_store_pr(q, q_S);
-#undef UNROLL
+    /* Here the plain-C code uses a conditional, but we can't do that in SIMD.
+     * So we take a max with the tolerance instead. Since we multiply with
+     * m or n later, the max does not affect the results.
+     */
+    iprm_S     = gmx_simd_max_r(iprm_S, toler_S);
+    iprn_S     = gmx_simd_max_r(iprn_S, toler_S);
+    *nrkj_m2_S = gmx_simd_mul_r(nrkj_S, gmx_simd_inv_r(iprm_S));
+    *nrkj_n2_S = gmx_simd_mul_r(nrkj_S, gmx_simd_inv_r(iprn_S));
+
+    /* Set sign of phi_S with the sign of ipr_S; phi_S is currently positive */
+    *phi_S     = gmx_simd_xor_sign_r(*phi_S, ipr_S);
+    p_S        = gmx_simd_iprod_r(rijx_S, rijy_S, rijz_S,
+                                  rkjx_S, rkjy_S, rkjz_S);
+    p_S        = gmx_simd_mul_r(p_S, nrkj_2_S);
+
+    q_S        = gmx_simd_iprod_r(rklx_S, rkly_S, rklz_S,
+                                  rkjx_S, rkjy_S, rkjz_S);
+    q_S        = gmx_simd_mul_r(q_S, nrkj_2_S);
+
+    gmx_simd_store_r(p, p_S);
+    gmx_simd_store_r(q, q_S);
  }
  
-#endif /* SIMD_BONDEDS */
+#endif /* GMX_SIMD_HAVE_REAL */
  
  
  void do_dih_fup(int i, int j, int k, int l, real ddphi,
@@ -1882,8 +1866,8 @@ real pdihs(int nbonds,
             const rvec x[], rvec f[], rvec fshift[],
             const t_pbc *pbc, const t_graph *g,
             real lambda, real *dvdlambda,
-           const t_mdatoms *md, t_fcdata *fcd,
-           int *global_atom_index)
+           const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+           int gmx_unused *global_atom_index)
  {
      int  i, type, ai, aj, ak, al;
      int  t1, t2, t3;
@@ -1941,10 +1925,10 @@ static void
  pdihs_noener(int nbonds,
               const t_iatom forceatoms[], const t_iparams forceparams[],
               const rvec x[], rvec f[],
-             const t_pbc *pbc, const t_graph *g,
+             const t_pbc gmx_unused *pbc, const t_graph gmx_unused *g,
               real lambda,
-             const t_mdatoms *md, t_fcdata *fcd,
-             int *global_atom_index)
+             const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+             int gmx_unused *global_atom_index)
  {
      int  i, type, ai, aj, ak, al;
      int  t1, t2, t3;
@@ -1990,60 +1974,58 @@ pdihs_noener(int nbonds,
  }
  
  
-#ifdef SIMD_BONDEDS
+#ifdef GMX_SIMD_HAVE_REAL
  
  /* As pdihs_noner above, but using SIMD to calculate many dihedrals at once */
  static void
  pdihs_noener_simd(int nbonds,
                    const t_iatom forceatoms[], const t_iparams forceparams[],
                    const rvec x[], rvec f[],
-                  const t_pbc *pbc, const t_graph *g,
-                  real lambda,
-                  const t_mdatoms *md, t_fcdata *fcd,
-                  int *global_atom_index)
+                  const t_pbc *pbc, const t_graph gmx_unused *g,
+                  real gmx_unused lambda,
+                  const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+                  int gmx_unused *global_atom_index)
  {
-#define UNROLL GMX_SIMD_WIDTH_HERE
-    const int      nfa1 = 5;
-    int            i, iu, s;
-    int            type, ai[UNROLL], aj[UNROLL], ak[UNROLL], al[UNROLL];
-    int            t1[UNROLL], t2[UNROLL], t3[UNROLL];
-    real           ddphi;
-    real           dr_array[3*DIM*UNROLL+UNROLL], *dr;
-    real           buf_array[7*UNROLL+UNROLL], *buf;
-    real           *cp, *phi0, *mult, *phi, *p, *q, *sf_i, *msf_l;
-    gmx_mm_pr      phi0_S, phi_S;
-    gmx_mm_pr      mx_S, my_S, mz_S;
-    gmx_mm_pr      nx_S, ny_S, nz_S;
-    gmx_mm_pr      nrkj_m2_S, nrkj_n2_S;
-    gmx_mm_pr      cp_S, mdphi_S, mult_S;
-    gmx_mm_pr      sin_S, cos_S;
-    gmx_mm_pr      mddphi_S;
-    gmx_mm_pr      sf_i_S, msf_l_S;
-    pbc_simd_t     pbc_simd;
+    const int             nfa1 = 5;
+    int                   i, iu, s;
+    int                   type, ai[GMX_SIMD_REAL_WIDTH], aj[GMX_SIMD_REAL_WIDTH], ak[GMX_SIMD_REAL_WIDTH], al[GMX_SIMD_REAL_WIDTH];
+    real                  ddphi;
+    real                  dr_array[3*DIM*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *dr;
+    real                  buf_array[7*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *buf;
+    real                 *cp, *phi0, *mult, *phi, *p, *q, *sf_i, *msf_l;
+    gmx_simd_real_t       phi0_S, phi_S;
+    gmx_simd_real_t       mx_S, my_S, mz_S;
+    gmx_simd_real_t       nx_S, ny_S, nz_S;
+    gmx_simd_real_t       nrkj_m2_S, nrkj_n2_S;
+    gmx_simd_real_t       cp_S, mdphi_S, mult_S;
+    gmx_simd_real_t       sin_S, cos_S;
+    gmx_simd_real_t       mddphi_S;
+    gmx_simd_real_t       sf_i_S, msf_l_S;
+    pbc_simd_t            pbc_simd;
  
      /* Ensure SIMD register alignment */
-    dr  = gmx_simd_align_real(dr_array);
-    buf = gmx_simd_align_real(buf_array);
+    dr  = gmx_simd_align_r(dr_array);
+    buf = gmx_simd_align_r(buf_array);
  
      /* Extract aligned pointer for parameters and variables */
-    cp    = buf + 0*UNROLL;
-    phi0  = buf + 1*UNROLL;
-    mult  = buf + 2*UNROLL;
-    p     = buf + 3*UNROLL;
-    q     = buf + 4*UNROLL;
-    sf_i  = buf + 5*UNROLL;
-    msf_l = buf + 6*UNROLL;
+    cp    = buf + 0*GMX_SIMD_REAL_WIDTH;
+    phi0  = buf + 1*GMX_SIMD_REAL_WIDTH;
+    mult  = buf + 2*GMX_SIMD_REAL_WIDTH;
+    p     = buf + 3*GMX_SIMD_REAL_WIDTH;
+    q     = buf + 4*GMX_SIMD_REAL_WIDTH;
+    sf_i  = buf + 5*GMX_SIMD_REAL_WIDTH;
+    msf_l = buf + 6*GMX_SIMD_REAL_WIDTH;
  
      set_pbc_simd(pbc, &pbc_simd);
  
-    /* nbonds is the number of dihedrals times nfa1, here we step UNROLL dihs */
-    for (i = 0; (i < nbonds); i += UNROLL*nfa1)
+    /* nbonds is the number of dihedrals times nfa1, here we step GMX_SIMD_REAL_WIDTH dihs */
+    for (i = 0; (i < nbonds); i += GMX_SIMD_REAL_WIDTH*nfa1)
      {
-        /* Collect atoms quadruplets for UNROLL dihedrals.
+        /* Collect atoms quadruplets for GMX_SIMD_REAL_WIDTH dihedrals.
           * iu indexes into forceatoms, we should not let iu go beyond nbonds.
           */
          iu = i;
-        for (s = 0; s < UNROLL; s++)
+        for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++)
          {
              type  = forceatoms[iu];
              ai[s] = forceatoms[iu+1];
@@ -2062,7 +2044,7 @@ pdihs_noener_simd(int nbonds,
              }
          }
  
-        /* Caclulate UNROLL dihedral angles at once */
+        /* Caclulate GMX_SIMD_REAL_WIDTH dihedral angles at once */
          dih_angle_simd(x, ai, aj, ak, al, &pbc_simd,
                         dr,
                         &phi_S,
@@ -2072,34 +2054,34 @@ pdihs_noener_simd(int nbonds,
                         &nrkj_n2_S,
                         p, q);
  
-        cp_S     = gmx_load_pr(cp);
-        phi0_S   = gmx_load_pr(phi0);
-        mult_S   = gmx_load_pr(mult);
+        cp_S     = gmx_simd_load_r(cp);
+        phi0_S   = gmx_simd_load_r(phi0);
+        mult_S   = gmx_simd_load_r(mult);
  
-        mdphi_S  = gmx_sub_pr(gmx_mul_pr(mult_S, phi_S), phi0_S);
+        mdphi_S  = gmx_simd_sub_r(gmx_simd_mul_r(mult_S, phi_S), phi0_S);
  
-        /* Calculate UNROLL sines at once */
-        gmx_sincos_pr(mdphi_S, &sin_S, &cos_S);
-        mddphi_S = gmx_mul_pr(gmx_mul_pr(cp_S, mult_S), sin_S);
-        sf_i_S   = gmx_mul_pr(mddphi_S, nrkj_m2_S);
-        msf_l_S  = gmx_mul_pr(mddphi_S, nrkj_n2_S);
+        /* Calculate GMX_SIMD_REAL_WIDTH sines at once */
+        gmx_simd_sincos_r(mdphi_S, &sin_S, &cos_S);
+        mddphi_S = gmx_simd_mul_r(gmx_simd_mul_r(cp_S, mult_S), sin_S);
+        sf_i_S   = gmx_simd_mul_r(mddphi_S, nrkj_m2_S);
+        msf_l_S  = gmx_simd_mul_r(mddphi_S, nrkj_n2_S);
  
          /* After this m?_S will contain f[i] */
-        mx_S     = gmx_mul_pr(sf_i_S, mx_S);
-        my_S     = gmx_mul_pr(sf_i_S, my_S);
-        mz_S     = gmx_mul_pr(sf_i_S, mz_S);
+        mx_S     = gmx_simd_mul_r(sf_i_S, mx_S);
+        my_S     = gmx_simd_mul_r(sf_i_S, my_S);
+        mz_S     = gmx_simd_mul_r(sf_i_S, mz_S);
  
          /* After this m?_S will contain -f[l] */
-        nx_S     = gmx_mul_pr(msf_l_S, nx_S);
-        ny_S     = gmx_mul_pr(msf_l_S, ny_S);
-        nz_S     = gmx_mul_pr(msf_l_S, nz_S);
+        nx_S     = gmx_simd_mul_r(msf_l_S, nx_S);
+        ny_S     = gmx_simd_mul_r(msf_l_S, ny_S);
+        nz_S     = gmx_simd_mul_r(msf_l_S, nz_S);
  
-        gmx_store_pr(dr + 0*UNROLL, mx_S);
-        gmx_store_pr(dr + 1*UNROLL, my_S);
-        gmx_store_pr(dr + 2*UNROLL, mz_S);
-        gmx_store_pr(dr + 3*UNROLL, nx_S);
-        gmx_store_pr(dr + 4*UNROLL, ny_S);
-        gmx_store_pr(dr + 5*UNROLL, nz_S);
+        gmx_simd_store_r(dr + 0*GMX_SIMD_REAL_WIDTH, mx_S);
+        gmx_simd_store_r(dr + 1*GMX_SIMD_REAL_WIDTH, my_S);
+        gmx_simd_store_r(dr + 2*GMX_SIMD_REAL_WIDTH, mz_S);
+        gmx_simd_store_r(dr + 3*GMX_SIMD_REAL_WIDTH, nx_S);
+        gmx_simd_store_r(dr + 4*GMX_SIMD_REAL_WIDTH, ny_S);
+        gmx_simd_store_r(dr + 5*GMX_SIMD_REAL_WIDTH, nz_S);
  
          iu = i;
          s  = 0;
@@ -2107,22 +2089,170 @@ pdihs_noener_simd(int nbonds,
          {
              do_dih_fup_noshiftf_precalc(ai[s], aj[s], ak[s], al[s],
                                          p[s], q[s],
-                                        dr[     XX *UNROLL+s],
-                                        dr[     YY *UNROLL+s],
-                                        dr[     ZZ *UNROLL+s],
-                                        dr[(DIM+XX)*UNROLL+s],
-                                        dr[(DIM+YY)*UNROLL+s],
-                                        dr[(DIM+ZZ)*UNROLL+s],
+                                        dr[     XX *GMX_SIMD_REAL_WIDTH+s],
+                                        dr[     YY *GMX_SIMD_REAL_WIDTH+s],
+                                        dr[     ZZ *GMX_SIMD_REAL_WIDTH+s],
+                                        dr[(DIM+XX)*GMX_SIMD_REAL_WIDTH+s],
+                                        dr[(DIM+YY)*GMX_SIMD_REAL_WIDTH+s],
+                                        dr[(DIM+ZZ)*GMX_SIMD_REAL_WIDTH+s],
                                          f);
              s++;
              iu += nfa1;
          }
-        while (s < UNROLL && iu < nbonds);
+        while (s < GMX_SIMD_REAL_WIDTH && iu < nbonds);
      }
-#undef UNROLL
  }
  
-#endif /* SIMD_BONDEDS */
+/* This is mostly a copy of pdihs_noener_simd above, but with using
+ * the RB potential instead of a harmonic potential.
+ * This function can replace rbdihs() when no energy and virial are needed.
+ */
+static void
+rbdihs_noener_simd(int nbonds,
+                   const t_iatom forceatoms[], const t_iparams forceparams[],
+                   const rvec x[], rvec f[],
+                   const t_pbc *pbc, const t_graph gmx_unused *g,
+                   real gmx_unused lambda,
+                   const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+                   int gmx_unused *global_atom_index)
+{
+    const int             nfa1 = 5;
+    int                   i, iu, s, j;
+    int                   type, ai[GMX_SIMD_REAL_WIDTH], aj[GMX_SIMD_REAL_WIDTH], ak[GMX_SIMD_REAL_WIDTH], al[GMX_SIMD_REAL_WIDTH];
+    real                  ddphi;
+    real                  dr_array[3*DIM*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *dr;
+    real                  buf_array[(NR_RBDIHS + 4)*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *buf;
+    real                 *parm, *phi, *p, *q, *sf_i, *msf_l;
+
+    gmx_simd_real_t       phi_S;
+    gmx_simd_real_t       ddphi_S, cosfac_S;
+    gmx_simd_real_t       mx_S, my_S, mz_S;
+    gmx_simd_real_t       nx_S, ny_S, nz_S;
+    gmx_simd_real_t       nrkj_m2_S, nrkj_n2_S;
+    gmx_simd_real_t       parm_S, c_S;
+    gmx_simd_real_t       sin_S, cos_S;
+    gmx_simd_real_t       sf_i_S, msf_l_S;
+    pbc_simd_t            pbc_simd;
+
+    gmx_simd_real_t       pi_S  = gmx_simd_set1_r(M_PI);
+    gmx_simd_real_t       one_S = gmx_simd_set1_r(1.0);
+
+    /* Ensure SIMD register alignment */
+    dr  = gmx_simd_align_r(dr_array);
+    buf = gmx_simd_align_r(buf_array);
+
+    /* Extract aligned pointer for parameters and variables */
+    parm  = buf;
+    p     = buf + (NR_RBDIHS + 0)*GMX_SIMD_REAL_WIDTH;
+    q     = buf + (NR_RBDIHS + 1)*GMX_SIMD_REAL_WIDTH;
+    sf_i  = buf + (NR_RBDIHS + 2)*GMX_SIMD_REAL_WIDTH;
+    msf_l = buf + (NR_RBDIHS + 3)*GMX_SIMD_REAL_WIDTH;
+
+    set_pbc_simd(pbc, &pbc_simd);
+
+    /* nbonds is the number of dihedrals times nfa1, here we step GMX_SIMD_REAL_WIDTH dihs */
+    for (i = 0; (i < nbonds); i += GMX_SIMD_REAL_WIDTH*nfa1)
+    {
+        /* Collect atoms quadruplets for GMX_SIMD_REAL_WIDTH dihedrals.
+         * iu indexes into forceatoms, we should not let iu go beyond nbonds.
+         */
+        iu = i;
+        for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++)
+        {
+            type  = forceatoms[iu];
+            ai[s] = forceatoms[iu+1];
+            aj[s] = forceatoms[iu+2];
+            ak[s] = forceatoms[iu+3];
+            al[s] = forceatoms[iu+4];
+
+            /* We don't need the first parameter, since that's a constant
+             * which only affects the energies, not the forces.
+             */
+            for (j = 1; j < NR_RBDIHS; j++)
+            {
+                parm[j*GMX_SIMD_REAL_WIDTH + s] =
+                    forceparams[type].rbdihs.rbcA[j];
+            }
+
+            /* At the end fill the arrays with identical entries */
+            if (iu + nfa1 < nbonds)
+            {
+                iu += nfa1;
+            }
+        }
+
+        /* Caclulate GMX_SIMD_REAL_WIDTH dihedral angles at once */
+        dih_angle_simd(x, ai, aj, ak, al, &pbc_simd,
+                       dr,
+                       &phi_S,
+                       &mx_S, &my_S, &mz_S,
+                       &nx_S, &ny_S, &nz_S,
+                       &nrkj_m2_S,
+                       &nrkj_n2_S,
+                       p, q);
+
+        /* Change to polymer convention */
+        phi_S = gmx_simd_sub_r(phi_S, pi_S);
+
+        gmx_simd_sincos_r(phi_S, &sin_S, &cos_S);
+
+        ddphi_S   = gmx_simd_setzero_r();
+        c_S       = one_S;
+        cosfac_S  = one_S;
+        for (j = 1; j < NR_RBDIHS; j++)
+        {
+            parm_S   = gmx_simd_load_r(parm + j*GMX_SIMD_REAL_WIDTH);
+            ddphi_S  = gmx_simd_fmadd_r(gmx_simd_mul_r(c_S, parm_S), cosfac_S, ddphi_S);
+            cosfac_S = gmx_simd_mul_r(cosfac_S, cos_S);
+            c_S      = gmx_simd_add_r(c_S, one_S);
+        }
+
+        /* Note that here we do not use the minus sign which is present
+         * in the normal RB code. This is corrected for through (m)sf below.
+         */
+        ddphi_S  = gmx_simd_mul_r(ddphi_S, sin_S);
+
+        sf_i_S   = gmx_simd_mul_r(ddphi_S, nrkj_m2_S);
+        msf_l_S  = gmx_simd_mul_r(ddphi_S, nrkj_n2_S);
+
+        /* After this m?_S will contain f[i] */
+        mx_S     = gmx_simd_mul_r(sf_i_S, mx_S);
+        my_S     = gmx_simd_mul_r(sf_i_S, my_S);
+        mz_S     = gmx_simd_mul_r(sf_i_S, mz_S);
+
+        /* After this m?_S will contain -f[l] */
+        nx_S     = gmx_simd_mul_r(msf_l_S, nx_S);
+        ny_S     = gmx_simd_mul_r(msf_l_S, ny_S);
+        nz_S     = gmx_simd_mul_r(msf_l_S, nz_S);
+
+        gmx_simd_store_r(dr + 0*GMX_SIMD_REAL_WIDTH, mx_S);
+        gmx_simd_store_r(dr + 1*GMX_SIMD_REAL_WIDTH, my_S);
+        gmx_simd_store_r(dr + 2*GMX_SIMD_REAL_WIDTH, mz_S);
+        gmx_simd_store_r(dr + 3*GMX_SIMD_REAL_WIDTH, nx_S);
+        gmx_simd_store_r(dr + 4*GMX_SIMD_REAL_WIDTH, ny_S);
+        gmx_simd_store_r(dr + 5*GMX_SIMD_REAL_WIDTH, nz_S);
+
+        iu = i;
+        s  = 0;
+        do
+        {
+            do_dih_fup_noshiftf_precalc(ai[s], aj[s], ak[s], al[s],
+                                        p[s], q[s],
+                                        dr[     XX *GMX_SIMD_REAL_WIDTH+s],
+                                        dr[     YY *GMX_SIMD_REAL_WIDTH+s],
+                                        dr[     ZZ *GMX_SIMD_REAL_WIDTH+s],
+                                        dr[(DIM+XX)*GMX_SIMD_REAL_WIDTH+s],
+                                        dr[(DIM+YY)*GMX_SIMD_REAL_WIDTH+s],
+                                        dr[(DIM+ZZ)*GMX_SIMD_REAL_WIDTH+s],
+                                        f);
+            s++;
+            iu += nfa1;
+        }
+        while (s < GMX_SIMD_REAL_WIDTH && iu < nbonds);
+    }
+}
+
+#endif /* GMX_SIMD_HAVE_REAL */
  
  
  real idihs(int nbonds,
@@ -2130,8 +2260,8 @@ real idihs(int nbonds,
             const rvec x[], rvec f[], rvec fshift[],
             const t_pbc *pbc, const t_graph *g,
             real lambda, real *dvdlambda,
-           const t_mdatoms *md, t_fcdata *fcd,
-           int *global_atom_index)
+           const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+           int gmx_unused *global_atom_index)
  {
      int  i, type, ai, aj, ak, al;
      int  t1, t2, t3;
@@ -2228,6 +2358,7 @@ static void posres_dx(const rvec x, const rvec pos0A, const rvec pos0B,
                      /* Box relative coordinates are stored for dimensions with pbc */
                      posA *= pbc->box[m][m];
                      posB *= pbc->box[m][m];
+                    assert(npbcdim <= DIM);
                      for (d = m+1; d < npbcdim; d++)
                      {
                          posA += pos0A[d]*pbc->box[d][m];
@@ -2292,6 +2423,7 @@ real fbposres(int nbonds,
          clear_rvec(com_sc);
          for (m = 0; m < npbcdim; m++)
          {
+            assert(npbcdim <= DIM);
              for (d = m; d < npbcdim; d++)
              {
                  com_sc[m] += com[d]*pbc->box[d][m];
@@ -2417,6 +2549,7 @@ real posres(int nbonds,
          clear_rvec(comB_sc);
          for (m = 0; m < npbcdim; m++)
          {
+            assert(npbcdim <= DIM);
              for (d = m; d < npbcdim; d++)
              {
                  comA_sc[m] += comA[d]*pbc->box[d][m];
@@ -2447,7 +2580,7 @@ real posres(int nbonds,
              vtot       += 0.5*kk*dx[m]*dx[m];
              *dvdlambda +=
                  0.5*(pr->posres.fcB[m] - pr->posres.fcA[m])*dx[m]*dx[m]
-                -fm*dpdl[m];
+                + fm*dpdl[m];
  
              /* Here we correct for the pbc_dx which included rdist */
              if (bForceValid)
@@ -2563,8 +2696,8 @@ real angres(int nbonds,
              const rvec x[], rvec f[], rvec fshift[],
              const t_pbc *pbc, const t_graph *g,
              real lambda, real *dvdlambda,
-            const t_mdatoms *md, t_fcdata *fcd,
-            int *global_atom_index)
+            const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+            int gmx_unused *global_atom_index)
  {
      return low_angres(nbonds, forceatoms, forceparams, x, f, fshift, pbc, g,
                        lambda, dvdlambda, FALSE);
@@ -2575,8 +2708,8 @@ real angresz(int nbonds,
               const rvec x[], rvec f[], rvec fshift[],
               const t_pbc *pbc, const t_graph *g,
               real lambda, real *dvdlambda,
-             const t_mdatoms *md, t_fcdata *fcd,
-             int *global_atom_index)
+             const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+             int gmx_unused *global_atom_index)
  {
      return low_angres(nbonds, forceatoms, forceparams, x, f, fshift, pbc, g,
                        lambda, dvdlambda, TRUE);
@@ -2587,8 +2720,8 @@ real dihres(int nbonds,
              const rvec x[], rvec f[], rvec fshift[],
              const t_pbc *pbc, const t_graph *g,
              real lambda, real *dvdlambda,
-            const t_mdatoms *md, t_fcdata *fcd,
-            int *global_atom_index)
+            const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+            int gmx_unused  *global_atom_index)
  {
      real vtot = 0;
      int  ai, aj, ak, al, i, k, type, t1, t2, t3;
@@ -2677,26 +2810,346 @@ real dihres(int nbonds,
  }
  
  
-real unimplemented(int nbonds,
-                   const t_iatom forceatoms[], const t_iparams forceparams[],
-                   const rvec x[], rvec f[], rvec fshift[],
-                   const t_pbc *pbc, const t_graph *g,
-                   real lambda, real *dvdlambda,
-                   const t_mdatoms *md, t_fcdata *fcd,
-                   int *global_atom_index)
+real unimplemented(int gmx_unused nbonds,
+                   const t_iatom gmx_unused forceatoms[], const t_iparams gmx_unused forceparams[],
+                   const rvec gmx_unused x[], rvec gmx_unused f[], rvec gmx_unused fshift[],
+                   const t_pbc gmx_unused *pbc, const t_graph  gmx_unused *g,
+                   real gmx_unused lambda, real gmx_unused *dvdlambda,
+                   const t_mdatoms  gmx_unused *md, t_fcdata gmx_unused *fcd,
+                   int gmx_unused *global_atom_index)
  {
      gmx_impl("*** you are using a not implemented function");
  
      return 0.0; /* To make the compiler happy */
  }
  
+real restrangles(int nbonds,
+                 const t_iatom forceatoms[], const t_iparams forceparams[],
+                 const rvec x[], rvec f[], rvec fshift[],
+                 const t_pbc *pbc, const t_graph *g,
+                 real gmx_unused lambda, real gmx_unused *dvdlambda,
+                 const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+                 int gmx_unused *global_atom_index)
+{
+    int  i, d, ai, aj, ak, type, m;
+    int  t1, t2;
+    rvec r_ij, r_kj;
+    real v, vtot;
+    ivec jt, dt_ij, dt_kj;
+    rvec f_i, f_j, f_k;
+    real prefactor, ratio_ante, ratio_post;
+    rvec delta_ante, delta_post, vec_temp;
+
+    vtot = 0.0;
+    for (i = 0; (i < nbonds); )
+    {
+        type = forceatoms[i++];
+        ai   = forceatoms[i++];
+        aj   = forceatoms[i++];
+        ak   = forceatoms[i++];
+
+        t1 = pbc_rvec_sub(pbc, x[ai], x[aj], vec_temp);
+        pbc_rvec_sub(pbc, x[aj], x[ai], delta_ante);
+        t2 = pbc_rvec_sub(pbc, x[ak], x[aj], delta_post);
+
+
+        /* This function computes factors needed for restricted angle potential.
+         * The restricted angle potential is used in coarse-grained simulations to avoid singularities
+         * when three particles align and the dihedral angle and dihedral potential
+         * cannot be calculated. This potential is calculated using the formula:
+           real restrangles(int nbonds,
+            const t_iatom forceatoms[],const t_iparams forceparams[],
+            const rvec x[],rvec f[],rvec fshift[],
+            const t_pbc *pbc,const t_graph *g,
+            real gmx_unused lambda,real gmx_unused *dvdlambda,
+            const t_mdatoms gmx_unused *md,t_fcdata gmx_unused *fcd,
+            int gmx_unused *global_atom_index)
+           {
+           int  i, d, ai, aj, ak, type, m;
+           int t1, t2;
+           rvec r_ij,r_kj;
+           real v, vtot;
+           ivec jt,dt_ij,dt_kj;
+           rvec f_i, f_j, f_k;
+           real prefactor, ratio_ante, ratio_post;
+           rvec delta_ante, delta_post, vec_temp;
+
+           vtot = 0.0;
+           for(i=0; (i<nbonds); )
+           {
+           type = forceatoms[i++];
+           ai   = forceatoms[i++];
+           aj   = forceatoms[i++];
+           ak   = forceatoms[i++];
+
+         * \f[V_{\rm ReB}(\theta_i) = \frac{1}{2} k_{\theta} \frac{(\cos\theta_i - \cos\theta_0)^2}
+         * {\sin^2\theta_i}\f] ({eq:ReB} and ref \cite{MonicaGoga2013} from the manual).
+         * For more explanations see comments file "restcbt.h". */
+
+        compute_factors_restangles(type, forceparams,  delta_ante, delta_post,
+                                   &prefactor, &ratio_ante, &ratio_post, &v);
+
+        /*   Forces are computed per component */
+        for (d = 0; d < DIM; d++)
+        {
+            f_i[d] = prefactor * (ratio_ante * delta_ante[d] - delta_post[d]);
+            f_j[d] = prefactor * ((ratio_post + 1.0) * delta_post[d] - (ratio_ante + 1.0) * delta_ante[d]);
+            f_k[d] = prefactor * (delta_ante[d] - ratio_post * delta_post[d]);
+        }
+
+        /*   Computation of potential energy   */
+
+        vtot += v;
+
+        /*   Update forces */
+
+        for (m = 0; (m < DIM); m++)
+        {
+            f[ai][m] += f_i[m];
+            f[aj][m] += f_j[m];
+            f[ak][m] += f_k[m];
+        }
+
+        if (g)
+        {
+            copy_ivec(SHIFT_IVEC(g, aj), jt);
+            ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+            ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+            t1 = IVEC2IS(dt_ij);
+            t2 = IVEC2IS(dt_kj);
+        }
+
+        rvec_inc(fshift[t1], f_i);
+        rvec_inc(fshift[CENTRAL], f_j);
+        rvec_inc(fshift[t2], f_k);
+    }
+    return vtot;
+}
+
+
+real restrdihs(int nbonds,
+               const t_iatom forceatoms[], const t_iparams forceparams[],
+               const rvec x[], rvec f[], rvec fshift[],
+               const t_pbc *pbc, const t_graph *g,
+               real gmx_unused lambda, real gmx_unused *dvlambda,
+               const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+               int gmx_unused *global_atom_index)
+{
+    int  i, d, type, ai, aj, ak, al;
+    rvec f_i, f_j, f_k, f_l;
+    rvec dx_jl;
+    ivec jt, dt_ij, dt_kj, dt_lj;
+    int  t1, t2, t3;
+    real v, vtot;
+    rvec delta_ante,  delta_crnt, delta_post, vec_temp;
+    real factor_phi_ai_ante, factor_phi_ai_crnt, factor_phi_ai_post;
+    real factor_phi_aj_ante, factor_phi_aj_crnt, factor_phi_aj_post;
+    real factor_phi_ak_ante, factor_phi_ak_crnt, factor_phi_ak_post;
+    real factor_phi_al_ante, factor_phi_al_crnt, factor_phi_al_post;
+    real prefactor_phi;
+
+
+    vtot = 0.0;
+    for (i = 0; (i < nbonds); )
+    {
+        type = forceatoms[i++];
+        ai   = forceatoms[i++];
+        aj   = forceatoms[i++];
+        ak   = forceatoms[i++];
+        al   = forceatoms[i++];
+
+        t1 = pbc_rvec_sub(pbc, x[ai], x[aj], vec_temp);
+        pbc_rvec_sub(pbc, x[aj], x[ai], delta_ante);
+        t2 = pbc_rvec_sub(pbc, x[ak], x[aj], delta_crnt);
+        t3 = pbc_rvec_sub(pbc, x[ak], x[al], vec_temp);
+        pbc_rvec_sub(pbc, x[al], x[ak], delta_post);
+
+        /* This function computes factors needed for restricted angle potential.
+         * The restricted angle potential is used in coarse-grained simulations to avoid singularities
+         * when three particles align and the dihedral angle and dihedral potential cannot be calculated.
+         * This potential is calculated using the formula:
+         * \f[V_{\rm ReB}(\theta_i) = \frac{1}{2} k_{\theta}
+         * \frac{(\cos\theta_i - \cos\theta_0)^2}{\sin^2\theta_i}\f]
+         * ({eq:ReB} and ref \cite{MonicaGoga2013} from the manual).
+         * For more explanations see comments file "restcbt.h" */
+
+        compute_factors_restrdihs(type, forceparams,
+                                  delta_ante, delta_crnt, delta_post,
+                                  &factor_phi_ai_ante, &factor_phi_ai_crnt, &factor_phi_ai_post,
+                                  &factor_phi_aj_ante, &factor_phi_aj_crnt, &factor_phi_aj_post,
+                                  &factor_phi_ak_ante, &factor_phi_ak_crnt, &factor_phi_ak_post,
+                                  &factor_phi_al_ante, &factor_phi_al_crnt, &factor_phi_al_post,
+                                  &prefactor_phi, &v);
+
+
+        /*      Computation of forces per component */
+        for (d = 0; d < DIM; d++)
+        {
+            f_i[d] = prefactor_phi * (factor_phi_ai_ante * delta_ante[d] + factor_phi_ai_crnt * delta_crnt[d] + factor_phi_ai_post * delta_post[d]);
+            f_j[d] = prefactor_phi * (factor_phi_aj_ante * delta_ante[d] + factor_phi_aj_crnt * delta_crnt[d] + factor_phi_aj_post * delta_post[d]);
+            f_k[d] = prefactor_phi * (factor_phi_ak_ante * delta_ante[d] + factor_phi_ak_crnt * delta_crnt[d] + factor_phi_ak_post * delta_post[d]);
+            f_l[d] = prefactor_phi * (factor_phi_al_ante * delta_ante[d] + factor_phi_al_crnt * delta_crnt[d] + factor_phi_al_post * delta_post[d]);
+        }
+        /*      Computation of the energy */
+
+        vtot += v;
+
+
+
+        /*    Updating the forces */
+
+        rvec_inc(f[ai], f_i);
+        rvec_inc(f[aj], f_j);
+        rvec_inc(f[ak], f_k);
+        rvec_inc(f[al], f_l);
+
+
+        /* Updating the fshift forces for the pressure coupling */
+        if (g)
+        {
+            copy_ivec(SHIFT_IVEC(g, aj), jt);
+            ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+            ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+            ivec_sub(SHIFT_IVEC(g, al), jt, dt_lj);
+            t1 = IVEC2IS(dt_ij);
+            t2 = IVEC2IS(dt_kj);
+            t3 = IVEC2IS(dt_lj);
+        }
+        else if (pbc)
+        {
+            t3 = pbc_rvec_sub(pbc, x[al], x[aj], dx_jl);
+        }
+        else
+        {
+            t3 = CENTRAL;
+        }
+
+        rvec_inc(fshift[t1], f_i);
+        rvec_inc(fshift[CENTRAL], f_j);
+        rvec_inc(fshift[t2], f_k);
+        rvec_inc(fshift[t3], f_l);
+
+    }
+
+    return vtot;
+}
+
+
+real cbtdihs(int nbonds,
+             const t_iatom forceatoms[], const t_iparams forceparams[],
+             const rvec x[], rvec f[], rvec fshift[],
+             const t_pbc *pbc, const t_graph *g,
+             real gmx_unused lambda, real gmx_unused *dvdlambda,
+             const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+             int gmx_unused *global_atom_index)
+{
+    int  type, ai, aj, ak, al, i, d;
+    int  t1, t2, t3;
+    real v, vtot;
+    rvec vec_temp;
+    rvec f_i, f_j, f_k, f_l;
+    ivec jt, dt_ij, dt_kj, dt_lj;
+    rvec dx_jl;
+    rvec delta_ante, delta_crnt, delta_post;
+    rvec f_phi_ai, f_phi_aj, f_phi_ak, f_phi_al;
+    rvec f_theta_ante_ai, f_theta_ante_aj, f_theta_ante_ak;
+    rvec f_theta_post_aj, f_theta_post_ak, f_theta_post_al;
+
+
+
+
+    vtot = 0.0;
+    for (i = 0; (i < nbonds); )
+    {
+        type = forceatoms[i++];
+        ai   = forceatoms[i++];
+        aj   = forceatoms[i++];
+        ak   = forceatoms[i++];
+        al   = forceatoms[i++];
+
+
+        t1 = pbc_rvec_sub(pbc, x[ai], x[aj], vec_temp);
+        pbc_rvec_sub(pbc, x[aj], x[ai], delta_ante);
+        t2 = pbc_rvec_sub(pbc, x[ak], x[aj], vec_temp);
+        pbc_rvec_sub(pbc, x[ak], x[aj], delta_crnt);
+        t3 = pbc_rvec_sub(pbc, x[ak], x[al], vec_temp);
+        pbc_rvec_sub(pbc, x[al], x[ak], delta_post);
+
+        /* \brief Compute factors for CBT potential
+         * The combined bending-torsion potential goes to zero in a very smooth manner, eliminating the numerical
+         * instabilities, when three coarse-grained particles align and the dihedral angle and standard
+         * dihedral potentials cannot be calculated. The CBT potential is calculated using the formula:
+         * \f[V_{\rm CBT}(\theta_{i-1}, \theta_i, \phi_i) = k_{\phi} \sin^3\theta_{i-1} \sin^3\theta_{i}
+         * \sum_{n=0}^4 { a_n \cos^n\phi_i}\f] ({eq:CBT} and ref \cite{MonicaGoga2013} from the manual).
+         * It contains in its expression not only the dihedral angle \f$\phi\f$
+         * but also \f[\theta_{i-1}\f] (theta_ante bellow) and \f[\theta_{i}\f] (theta_post bellow)
+         * --- the adjacent bending angles.
+         * For more explanations see comments file "restcbt.h". */
+
+        compute_factors_cbtdihs(type, forceparams, delta_ante, delta_crnt, delta_post,
+                                f_phi_ai, f_phi_aj, f_phi_ak, f_phi_al,
+                                f_theta_ante_ai, f_theta_ante_aj, f_theta_ante_ak,
+                                f_theta_post_aj, f_theta_post_ak, f_theta_post_al,
+                                &v);
+
+
+        /*      Acumulate the resuts per beads */
+        for (d = 0; d < DIM; d++)
+        {
+            f_i[d] = f_phi_ai[d] + f_theta_ante_ai[d];
+            f_j[d] = f_phi_aj[d] + f_theta_ante_aj[d] + f_theta_post_aj[d];
+            f_k[d] = f_phi_ak[d] + f_theta_ante_ak[d] + f_theta_post_ak[d];
+            f_l[d] = f_phi_al[d] + f_theta_post_al[d];
+        }
+
+        /*      Compute the potential energy */
+
+        vtot += v;
+
+
+        /*  Updating the forces */
+        rvec_inc(f[ai], f_i);
+        rvec_inc(f[aj], f_j);
+        rvec_inc(f[ak], f_k);
+        rvec_inc(f[al], f_l);
+
+
+        /* Updating the fshift forces for the pressure coupling */
+        if (g)
+        {
+            copy_ivec(SHIFT_IVEC(g, aj), jt);
+            ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+            ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+            ivec_sub(SHIFT_IVEC(g, al), jt, dt_lj);
+            t1 = IVEC2IS(dt_ij);
+            t2 = IVEC2IS(dt_kj);
+            t3 = IVEC2IS(dt_lj);
+        }
+        else if (pbc)
+        {
+            t3 = pbc_rvec_sub(pbc, x[al], x[aj], dx_jl);
+        }
+        else
+        {
+            t3 = CENTRAL;
+        }
+
+        rvec_inc(fshift[t1], f_i);
+        rvec_inc(fshift[CENTRAL], f_j);
+        rvec_inc(fshift[t2], f_k);
+        rvec_inc(fshift[t3], f_l);
+    }
+
+    return vtot;
+}
+
  real rbdihs(int nbonds,
              const t_iatom forceatoms[], const t_iparams forceparams[],
              const rvec x[], rvec f[], rvec fshift[],
              const t_pbc *pbc, const t_graph *g,
              real lambda, real *dvdlambda,
-            const t_mdatoms *md, t_fcdata *fcd,
-            int *global_atom_index)
+            const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+            int gmx_unused *global_atom_index)
  {
      const real c0 = 0.0, c1 = 1.0, c2 = 2.0, c3 = 3.0, c4 = 4.0, c5 = 5.0;
      int        type, ai, aj, ak, al, i, j;
@@ -2838,9 +3291,9 @@ real cmap_dihs(int nbonds,
                 const gmx_cmap_t *cmap_grid,
                 const rvec x[], rvec f[], rvec fshift[],
                 const t_pbc *pbc, const t_graph *g,
-               real lambda, real *dvdlambda,
-               const t_mdatoms *md, t_fcdata *fcd,
-               int *global_atom_index)
+               real gmx_unused lambda, real gmx_unused *dvdlambda,
+               const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+               int  gmx_unused *global_atom_index)
  {
      int         i, j, k, n, idx;
      int         ai, aj, ak, al, am;
@@ -3278,8 +3731,8 @@ real g96bonds(int nbonds,
                const rvec x[], rvec f[], rvec fshift[],
                const t_pbc *pbc, const t_graph *g,
                real lambda, real *dvdlambda,
-              const t_mdatoms *md, t_fcdata *fcd,
-              int *global_atom_index)
+              const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+              int gmx_unused *global_atom_index)
  {
      int  i, m, ki, ai, aj, type;
      real dr2, fbond, vbond, fij, vtot;
@@ -3348,8 +3801,8 @@ real g96angles(int nbonds,
                 const rvec x[], rvec f[], rvec fshift[],
                 const t_pbc *pbc, const t_graph *g,
                 real lambda, real *dvdlambda,
-               const t_mdatoms *md, t_fcdata *fcd,
-               int *global_atom_index)
+               const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+               int gmx_unused *global_atom_index)
  {
      int  i, ai, aj, ak, type, m, t1, t2;
      rvec r_ij, r_kj;
@@ -3419,9 +3872,9 @@ real cross_bond_bond(int nbonds,
                       const t_iatom forceatoms[], const t_iparams forceparams[],
                       const rvec x[], rvec f[], rvec fshift[],
                       const t_pbc *pbc, const t_graph *g,
-                     real lambda, real *dvdlambda,
-                     const t_mdatoms *md, t_fcdata *fcd,
-                     int *global_atom_index)
+                     real gmx_unused lambda, real gmx_unused *dvdlambda,
+                     const t_mdatoms gmx_unused *md, t_fcdata gmx_unused  *fcd,
+                     int gmx_unused *global_atom_index)
  {
      /* Potential from Lawrence and Skimmer, Chem. Phys. Lett. 372 (2003)
       * pp. 842-847
@@ -3493,9 +3946,9 @@ real cross_bond_angle(int nbonds,
                        const t_iatom forceatoms[], const t_iparams forceparams[],
                        const rvec x[], rvec f[], rvec fshift[],
                        const t_pbc *pbc, const t_graph *g,
-                      real lambda, real *dvdlambda,
-                      const t_mdatoms *md, t_fcdata *fcd,
-                      int *global_atom_index)
+                      real gmx_unused lambda, real gmx_unused *dvdlambda,
+                      const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+                      int gmx_unused *global_atom_index)
  {
      /* Potential from Lawrence and Skimmer, Chem. Phys. Lett. 372 (2003)
       * pp. 842-847
@@ -3618,8 +4071,8 @@ real tab_bonds(int nbonds,
                 const rvec x[], rvec f[], rvec fshift[],
                 const t_pbc *pbc, const t_graph *g,
                 real lambda, real *dvdlambda,
-               const t_mdatoms *md, t_fcdata *fcd,
-               int *global_atom_index)
+               const t_mdatoms gmx_unused *md, t_fcdata *fcd,
+               int gmx_unused  *global_atom_index)
  {
      int  i, m, ki, ai, aj, type, table;
      real dr, dr2, fbond, vbond, fij, vtot;
@@ -3682,8 +4135,8 @@ real tab_angles(int nbonds,
                  const rvec x[], rvec f[], rvec fshift[],
                  const t_pbc *pbc, const t_graph *g,
                  real lambda, real *dvdlambda,
-                const t_mdatoms *md, t_fcdata *fcd,
-                int *global_atom_index)
+                const t_mdatoms gmx_unused  *md, t_fcdata *fcd,
+                int gmx_unused *global_atom_index)
  {
      int  i, ai, aj, ak, t1, t2, type, table;
      rvec r_ij, r_kj;
@@ -3766,8 +4219,8 @@ real tab_dihs(int nbonds,
                const rvec x[], rvec f[], rvec fshift[],
                const t_pbc *pbc, const t_graph *g,
                real lambda, real *dvdlambda,
-              const t_mdatoms *md, t_fcdata *fcd,
-              int *global_atom_index)
+              const t_mdatoms gmx_unused *md, t_fcdata *fcd,
+              int gmx_unused *global_atom_index)
  {
      int  i, type, ai, aj, ak, al, table;
      int  t1, t2, t3;
@@ -3808,6 +4261,73 @@ real tab_dihs(int nbonds,
      return vtot;
  }
  
+/* Return if this is a potential calculated in bondfree.c,
+ * i.e. an interaction that actually calculates a potential and
+ * works on multiple atoms (not e.g. a connection or a position restraint).
+ */
+static gmx_inline gmx_bool ftype_is_bonded_potential(int ftype)
+{
+    return
+        (interaction_function[ftype].flags & IF_BOND) &&
+        !(ftype == F_CONNBONDS || ftype == F_POSRES || ftype == F_FBPOSRES) &&
+        (ftype < F_GB12 || ftype > F_GB14);
+}
+
+static void divide_bondeds_over_threads(t_idef *idef, int nthreads)
+{
+    int ftype;
+    int nat1;
+    int t;
+    int il_nr_thread;
+
+    idef->nthreads = nthreads;
+
+    if (F_NRE*(nthreads+1) > idef->il_thread_division_nalloc)
+    {
+        idef->il_thread_division_nalloc = F_NRE*(nthreads+1);
+        snew(idef->il_thread_division, idef->il_thread_division_nalloc);
+    }
+
+    for (ftype = 0; ftype < F_NRE; ftype++)
+    {
+        if (ftype_is_bonded_potential(ftype))
+        {
+            nat1 = interaction_function[ftype].nratoms + 1;
+
+            for (t = 0; t <= nthreads; t++)
+            {
+                /* Divide the interactions equally over the threads.
+                 * When the different types of bonded interactions
+                 * are distributed roughly equally over the threads,
+                 * this should lead to well localized output into
+                 * the force buffer on each thread.
+                 * If this is not the case, a more advanced scheme
+                 * (not implemented yet) will do better.
+                 */
+                il_nr_thread = (((idef->il[ftype].nr/nat1)*t)/nthreads)*nat1;
+
+                /* Ensure that distance restraint pairs with the same label
+                 * end up on the same thread.
+                 * This is slighlty tricky code, since the next for iteration
+                 * may have an initial il_nr_thread lower than the final value
+                 * in the previous iteration, but this will anyhow be increased
+                 * to the approriate value again by this while loop.
+                 */
+                while (ftype == F_DISRES &&
+                       il_nr_thread > 0 &&
+                       il_nr_thread < idef->il[ftype].nr &&
+                       idef->iparams[idef->il[ftype].iatoms[il_nr_thread]].disres.label ==
+                       idef->iparams[idef->il[ftype].iatoms[il_nr_thread-nat1]].disres.label)
+                {
+                    il_nr_thread += nat1;
+                }
+
+                idef->il_thread_division[ftype*(nthreads+1)+t] = il_nr_thread;
+            }
+        }
+    }
+}
+
  static unsigned
  calc_bonded_reduction_mask(const t_idef *idef,
                             int shift,
@@ -3820,9 +4340,7 @@ calc_bonded_reduction_mask(const t_idef *idef,
  
      for (ftype = 0; ftype < F_NRE; ftype++)
      {
-        if (interaction_function[ftype].flags & IF_BOND &&
-            !(ftype == F_CONNBONDS || ftype == F_POSRES) &&
-            (ftype<F_GB12 || ftype>F_GB14))
+        if (ftype_is_bonded_potential(ftype))
          {
              nb = idef->il[ftype].nr;
              if (nb > 0)
@@ -3832,8 +4350,8 @@ calc_bonded_reduction_mask(const t_idef *idef,
                  /* Divide this interaction equally over the threads.
                   * This is not stored: should match division in calc_bonds.
                   */
-                nb0 = (((nb/nat1)* t   )/nt)*nat1;
-                nb1 = (((nb/nat1)*(t+1))/nt)*nat1;
+                nb0 = idef->il_thread_division[ftype*(nt+1)+t];
+                nb1 = idef->il_thread_division[ftype*(nt+1)+t+1];
  
                  for (i = nb0; i < nb1; i += nat1)
                  {
@@ -3849,14 +4367,18 @@ calc_bonded_reduction_mask(const t_idef *idef,
      return mask;
  }
  
-void init_bonded_thread_force_reduction(t_forcerec   *fr,
-                                        const t_idef *idef)
+void setup_bonded_threading(t_forcerec   *fr, t_idef *idef)
  {
  #define MAX_BLOCK_BITS 32
      int t;
      int ctot, c, b;
  
-    if (fr->nthreads <= 1)
+    assert(fr->nthreads >= 1);
+
+    /* Divide the bonded interaction over the threads */
+    divide_bondeds_over_threads(idef, fr->nthreads);
+
+    if (fr->nthreads == 1)
      {
          fr->red_nblock = 0;
  
@@ -4080,14 +4602,14 @@ static real calc_one_bond(FILE *fplog, int thread,
                            rvec x[], rvec f[], rvec fshift[],
                            t_forcerec *fr,
                            const t_pbc *pbc, const t_graph *g,
-                          gmx_enerdata_t *enerd, gmx_grppairener_t *grpp,
+                          gmx_grppairener_t *grpp,
                            t_nrnb *nrnb,
                            real *lambda, real *dvdl,
                            const t_mdatoms *md, t_fcdata *fcd,
                            gmx_bool bCalcEnerVir,
                            int *global_atom_index, gmx_bool bPrintSepPot)
  {
-    int      ind, nat1, nbonds, efptFTYPE;
+    int      nat1, nbonds, efptFTYPE;
      real     v = 0;
      t_iatom *iatoms;
      int      nb0, nbn;
@@ -4101,170 +4623,101 @@ static real calc_one_bond(FILE *fplog, int thread,
          efptFTYPE = efptBONDED;
      }
  
-    if (interaction_function[ftype].flags & IF_BOND &&
-        !(ftype == F_CONNBONDS || ftype == F_POSRES))
-    {
-        ind       = interaction_function[ftype].nrnb_ind;
-        nat1      = interaction_function[ftype].nratoms + 1;
-        nbonds    = idef->il[ftype].nr/nat1;
-        iatoms    = idef->il[ftype].iatoms;
+    nat1      = interaction_function[ftype].nratoms + 1;
+    nbonds    = idef->il[ftype].nr/nat1;
+    iatoms    = idef->il[ftype].iatoms;
  
-        nb0 = ((nbonds* thread   )/(fr->nthreads))*nat1;
-        nbn = ((nbonds*(thread+1))/(fr->nthreads))*nat1 - nb0;
+    nb0 = idef->il_thread_division[ftype*(idef->nthreads+1)+thread];
+    nbn = idef->il_thread_division[ftype*(idef->nthreads+1)+thread+1] - nb0;
  
-        if (!IS_LISTED_LJ_C(ftype))
+    if (!IS_LISTED_LJ_C(ftype))
+    {
+        if (ftype == F_CMAP)
          {
-            if (ftype == F_CMAP)
-            {
-                v = cmap_dihs(nbn, iatoms+nb0,
-                              idef->iparams, &idef->cmap_grid,
-                              (const rvec*)x, f, fshift,
-                              pbc, g, lambda[efptFTYPE], &(dvdl[efptFTYPE]),
-                              md, fcd, global_atom_index);
-            }
-#ifdef SIMD_BONDEDS
-            else if (ftype == F_ANGLES &&
-                     !bCalcEnerVir && fr->efep == efepNO)
-            {
-                /* No energies, shift forces, dvdl */
-                angles_noener_simd(nbn, idef->il[ftype].iatoms+nb0,
-                                   idef->iparams,
-                                   (const rvec*)x, f,
-                                   pbc, g, lambda[efptFTYPE], md, fcd,
-                                   global_atom_index);
-                v = 0;
-            }
+            v = cmap_dihs(nbn, iatoms+nb0,
+                          idef->iparams, &idef->cmap_grid,
+                          (const rvec*)x, f, fshift,
+                          pbc, g, lambda[efptFTYPE], &(dvdl[efptFTYPE]),
+                          md, fcd, global_atom_index);
+        }
+#ifdef GMX_SIMD_HAVE_REAL
+        else if (ftype == F_ANGLES &&
+                 !bCalcEnerVir && fr->efep == efepNO)
+        {
+            /* No energies, shift forces, dvdl */
+            angles_noener_simd(nbn, idef->il[ftype].iatoms+nb0,
+                               idef->iparams,
+                               (const rvec*)x, f,
+                               pbc, g, lambda[efptFTYPE], md, fcd,
+                               global_atom_index);
+            v = 0;
+        }
  #endif
-            else if (ftype == F_PDIHS &&
-                     !bCalcEnerVir && fr->efep == efepNO)
-            {
-                /* No energies, shift forces, dvdl */
-#ifndef SIMD_BONDEDS
-                pdihs_noener
+        else if (ftype == F_PDIHS &&
+                 !bCalcEnerVir && fr->efep == efepNO)
+        {
+            /* No energies, shift forces, dvdl */
+#ifdef GMX_SIMD_HAVE_REAL
+            pdihs_noener_simd
  #else
-                pdihs_noener_simd
+            pdihs_noener
  #endif
-                    (nbn, idef->il[ftype].iatoms+nb0,
-                    idef->iparams,
-                    (const rvec*)x, f,
-                    pbc, g, lambda[efptFTYPE], md, fcd,
-                    global_atom_index);
-                v = 0;
-            }
-            else
-            {
-                v = interaction_function[ftype].ifunc(nbn, iatoms+nb0,
-                                                      idef->iparams,
-                                                      (const rvec*)x, f, fshift,
-                                                      pbc, g, lambda[efptFTYPE], &(dvdl[efptFTYPE]),
-                                                      md, fcd, global_atom_index);
-            }
-            if (bPrintSepPot)
-            {
-                fprintf(fplog, "  %-23s #%4d  V %12.5e  dVdl %12.5e\n",
-                        interaction_function[ftype].longname,
-                        nbonds/nat1, v, lambda[efptFTYPE]);
-            }
+                (nbn, idef->il[ftype].iatoms+nb0,
+                idef->iparams,
+                (const rvec*)x, f,
+                pbc, g, lambda[efptFTYPE], md, fcd,
+                global_atom_index);
+            v = 0;
          }
+#ifdef GMX_SIMD_HAVE_REAL
+        else if (ftype == F_RBDIHS &&
+                 !bCalcEnerVir && fr->efep == efepNO)
+        {
+            /* No energies, shift forces, dvdl */
+            rbdihs_noener_simd(nbn, idef->il[ftype].iatoms+nb0,
+                               idef->iparams,
+                               (const rvec*)x, f,
+                               pbc, g, lambda[efptFTYPE], md, fcd,
+                               global_atom_index);
+            v = 0;
+        }
+#endif
          else
          {
-            v = do_nonbonded_listed(ftype, nbn, iatoms+nb0, idef->iparams, (const rvec*)x, f, fshift,
-                                    pbc, g, lambda, dvdl, md, fr, grpp, global_atom_index);
-
-            if (bPrintSepPot)
-            {
-                fprintf(fplog, "  %-5s + %-15s #%4d                  dVdl %12.5e\n",
-                        interaction_function[ftype].longname,
-                        interaction_function[F_LJ14].longname, nbonds/nat1, dvdl[efptVDW]);
-                fprintf(fplog, "  %-5s + %-15s #%4d                  dVdl %12.5e\n",
-                        interaction_function[ftype].longname,
-                        interaction_function[F_COUL14].longname, nbonds/nat1, dvdl[efptCOUL]);
-            }
+            v = interaction_function[ftype].ifunc(nbn, iatoms+nb0,
+                                                  idef->iparams,
+                                                  (const rvec*)x, f, fshift,
+                                                  pbc, g, lambda[efptFTYPE], &(dvdl[efptFTYPE]),
+                                                  md, fcd, global_atom_index);
          }
-        if (ind != -1 && thread == 0)
+        if (bPrintSepPot)
          {
-            inc_nrnb(nrnb, ind, nbonds);
+            fprintf(fplog, "  %-23s #%4d  V %12.5e  dVdl %12.5e\n",
+                    interaction_function[ftype].longname,
+                    nbonds, v, lambda[efptFTYPE]);
          }
      }
-
-    return v;
-}
-
-/* WARNING!  THIS FUNCTION MUST EXACTLY TRACK THE calc
-   function, or horrible things will happen when doing free energy
-   calculations!  In a good coding world, this would not be a
-   different function, but for speed reasons, it needs to be made a
-   separate function.  TODO for 5.0 - figure out a way to reorganize
-   to reduce duplication.
- */
-
-static real calc_one_bond_foreign(FILE *fplog, int ftype, const t_idef *idef,
-                                  rvec x[], rvec f[], t_forcerec *fr,
-                                  const t_pbc *pbc, const t_graph *g,
-                                  gmx_grppairener_t *grpp, t_nrnb *nrnb,
-                                  real *lambda, real *dvdl,
-                                  const t_mdatoms *md, t_fcdata *fcd,
-                                  int *global_atom_index, gmx_bool bPrintSepPot)
-{
-    int      ind, nat1, nbonds, efptFTYPE, nbonds_np;
-    real     v = 0;
-    t_iatom *iatoms;
-
-    if (IS_RESTRAINT_TYPE(ftype))
-    {
-        efptFTYPE = efptRESTRAINT;
-    }
      else
      {
-        efptFTYPE = efptBONDED;
+        v = do_nonbonded_listed(ftype, nbn, iatoms+nb0, idef->iparams, (const rvec*)x, f, fshift,
+                                pbc, g, lambda, dvdl, md, fr, grpp, global_atom_index);
+
+        if (bPrintSepPot)
+        {
+            fprintf(fplog, "  %-5s + %-15s #%4d                  dVdl %12.5e\n",
+                    interaction_function[ftype].longname,
+                    interaction_function[F_LJ14].longname, nbonds, dvdl[efptVDW]);
+            fprintf(fplog, "  %-5s + %-15s #%4d                  dVdl %12.5e\n",
+                    interaction_function[ftype].longname,
+                    interaction_function[F_COUL14].longname, nbonds, dvdl[efptCOUL]);
+        }
      }
  
-    if (ftype < F_GB12 || ftype > F_GB14)
+    if (thread == 0)
      {
-        if (interaction_function[ftype].flags & IF_BOND &&
-            !(ftype == F_CONNBONDS || ftype == F_POSRES || ftype == F_FBPOSRES))
-        {
-            ind       = interaction_function[ftype].nrnb_ind;
-            nat1      = interaction_function[ftype].nratoms+1;
-            nbonds_np = idef->il[ftype].nr_nonperturbed;
-            nbonds    = idef->il[ftype].nr - nbonds_np;
-            iatoms    = idef->il[ftype].iatoms + nbonds_np;
-            if (nbonds > 0)
-            {
-                if (!IS_LISTED_LJ_C(ftype))
-                {
-                    if (ftype == F_CMAP)
-                    {
-                        v = cmap_dihs(nbonds, iatoms,
-                                      idef->iparams, &idef->cmap_grid,
-                                      (const rvec*)x, f, fr->fshift,
-                                      pbc, g, lambda[efptFTYPE], &(dvdl[efptFTYPE]), md, fcd,
-                                      global_atom_index);
-                    }
-                    else
-                    {
-                        v =     interaction_function[ftype].ifunc(nbonds, iatoms,
-                                                                  idef->iparams,
-                                                                  (const rvec*)x, f, fr->fshift,
-                                                                  pbc, g, lambda[efptFTYPE], &dvdl[efptFTYPE],
-                                                                  md, fcd, global_atom_index);
-                    }
-                }
-                else
-                {
-                    v = do_nonbonded_listed(ftype, nbonds, iatoms,
-                                            idef->iparams,
-                                            (const rvec*)x, f, fr->fshift,
-                                            pbc, g, lambda, dvdl,
-                                            md, fr, grpp, global_atom_index);
-                }
-                if (ind != -1)
-                {
-                    inc_nrnb(nrnb, ind, nbonds/nat1);
-                }
-            }
-        }
+        inc_nrnb(nrnb, interaction_function[ftype].nrnb_ind, nbonds);
      }
+
      return v;
  }
  
@@ -4277,9 +4730,9 @@ void calc_bonds(FILE *fplog, const gmx_multisim_t *ms,
                  real *lambda,
                  const t_mdatoms *md,
                  t_fcdata *fcd, int *global_atom_index,
-                t_atomtypes *atype, gmx_genborn_t *born,
+                t_atomtypes gmx_unused *atype, gmx_genborn_t gmx_unused *born,
                  int force_flags,
-                gmx_bool bPrintSepPot, gmx_large_int_t step)
+                gmx_bool bPrintSepPot, gmx_int64_t step)
  {
      gmx_bool      bCalcEnerVir;
      int           i;
@@ -4289,6 +4742,8 @@ void calc_bonds(FILE *fplog, const gmx_multisim_t *ms,
      char          buf[22];
      int           thread;
  
+    assert(fr->nthreads == idef->nthreads);
+
      bCalcEnerVir = (force_flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY));
  
      for (i = 0; i < efptNR; i++)
@@ -4305,7 +4760,7 @@ void calc_bonds(FILE *fplog, const gmx_multisim_t *ms,
      }
      if (bPrintSepPot)
      {
-        fprintf(fplog, "Step %s: bonded V and dVdl for this node\n",
+        fprintf(fplog, "Step %s: bonded V and dVdl for this rank\n",
                  gmx_step_str(step, buf));
      }
  
@@ -4327,22 +4782,27 @@ void calc_bonds(FILE *fplog, const gmx_multisim_t *ms,
      }
      if (idef->il[F_DISRES].nr)
      {
-        calc_disres_R_6(ms, idef->il[F_DISRES].nr,
+        calc_disres_R_6(idef->il[F_DISRES].nr,
                          idef->il[F_DISRES].iatoms,
                          idef->iparams, (const rvec*)x, pbc_null,
                          fcd, hist);
+#ifdef GMX_MPI
+        if (fcd->disres.nsystems > 1)
+        {
+            gmx_sum_sim(2*fcd->disres.nres, fcd->disres.Rt_6, ms);
+        }
+#endif
      }
  
  #pragma omp parallel for num_threads(fr->nthreads) schedule(static)
      for (thread = 0; thread < fr->nthreads; thread++)
      {
-        int                ftype, nbonds, ind, nat1;
+        int                ftype;
          real              *epot, v;
          /* thread stuff */
          rvec              *ft, *fshift;
          real              *dvdlt;
          gmx_grppairener_t *grpp;
-        int                nb0, nbn;
  
          if (thread == 0)
          {
@@ -4366,17 +4826,14 @@ void calc_bonds(FILE *fplog, const gmx_multisim_t *ms,
          /* Loop over all bonded force types to calculate the bonded forces */
          for (ftype = 0; (ftype < F_NRE); ftype++)
          {
-            if (idef->il[ftype].nr > 0 &&
-                (interaction_function[ftype].flags & IF_BOND) &&
-                (ftype < F_GB12 || ftype > F_GB14) &&
-                !(ftype == F_CONNBONDS || ftype == F_POSRES))
+            if (idef->il[ftype].nr > 0 && ftype_is_bonded_potential(ftype))
              {
                  v = calc_one_bond(fplog, thread, ftype, idef, x,
-                                  ft, fshift, fr, pbc_null, g, enerd, grpp,
+                                  ft, fshift, fr, pbc_null, g, grpp,
                                    nrnb, lambda, dvdlt,
                                    md, fcd, bCalcEnerVir,
                                    global_atom_index, bPrintSepPot);
-                epot[ftype]        += v;
+                epot[ftype] += v;
              }
          }
      }
@@ -4416,12 +4873,12 @@ void calc_bonds_lambda(FILE *fplog,
                         t_fcdata *fcd,
                         int *global_atom_index)
  {
-    int           i, ftype, nbonds_np, nbonds, ind, nat;
-    real          v, dr, dr2;
+    int           i, ftype, nr_nonperturbed, nr;
+    real          v;
      real          dvdl_dum[efptNR];
-    rvec         *f, *fshift_orig;
+    rvec         *f, *fshift;
      const  t_pbc *pbc_null;
-    t_iatom      *iatom_fe;
+    t_idef        idef_fe;
  
      if (fr->bMolPBC)
      {
@@ -4432,21 +4889,43 @@ void calc_bonds_lambda(FILE *fplog,
          pbc_null = NULL;
      }
  
+    /* Copy the whole idef, so we can modify the contents locally */
+    idef_fe          = *idef;
+    idef_fe.nthreads = 1;
+    snew(idef_fe.il_thread_division, F_NRE*(idef_fe.nthreads+1));
+
+    /* We already have the forces, so we use temp buffers here */
      snew(f, fr->natoms_force);
-    /* We want to preserve the fshift array in forcerec */
-    fshift_orig = fr->fshift;
-    snew(fr->fshift, SHIFTS);
+    snew(fshift, SHIFTS);
  
-    /* Loop over all bonded force types to calculate the bonded forces */
+    /* Loop over all bonded force types to calculate the bonded energies */
      for (ftype = 0; (ftype < F_NRE); ftype++)
      {
-        v = calc_one_bond_foreign(fplog, ftype, idef, x,
-                                  f, fr, pbc_null, g, grpp, nrnb, lambda, dvdl_dum,
-                                  md, fcd, global_atom_index, FALSE);
-        epot[ftype] += v;
+        if (ftype_is_bonded_potential(ftype))
+        {
+            /* Set the work range of thread 0 to the perturbed bondeds only */
+            nr_nonperturbed                       = idef->il[ftype].nr_nonperturbed;
+            nr                                    = idef->il[ftype].nr;
+            idef_fe.il_thread_division[ftype*2+0] = nr_nonperturbed;
+            idef_fe.il_thread_division[ftype*2+1] = nr;
+
+            /* This is only to get the flop count correct */
+            idef_fe.il[ftype].nr = nr - nr_nonperturbed;
+
+            if (nr - nr_nonperturbed > 0)
+            {
+                v = calc_one_bond(fplog, 0, ftype, &idef_fe,
+                                  x, f, fshift, fr, pbc_null, g,
+                                  grpp, nrnb, lambda, dvdl_dum,
+                                  md, fcd, TRUE,
+                                  global_atom_index, FALSE);
+                epot[ftype] += v;
+            }
+        }
      }
  
-    sfree(fr->fshift);
-    fr->fshift = fshift_orig;
+    sfree(fshift);
      sfree(f);
+
+    sfree(idef_fe.il_thread_division);
  }