SIMD acceleration for RB dihedrals

[alexxy/gromacs.git] / src / gromacs / gmxlib / bondfree.c
diff --git a/src/gromacs/gmxlib/bondfree.c b/src/gromacs/gmxlib/bondfree.c

index b3bc55dddfb18ecc27a26f4e03bcdd0d43825b53..0eb8d55791b983c0b5909786b7dffdb1c5ef462a 100644 (file)
--- a/src/gromacs/gmxlib/bondfree.c
+++ b/src/gromacs/gmxlib/bondfree.c
@@ -1,49 +1,51 @@
-/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
- *
- *
- *                This source code is part of
- *
- *                 G   R   O   M   A   C   S
- *
- *          GROningen MAchine for Chemical Simulations
+/*
+ * This file is part of the GROMACS molecular simulation package.
   *
- *                        VERSION 3.2.0
- * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2004, The GROMACS development team,
- * check out http://www.gromacs.org for more information.
-
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
   * of the License, or (at your option) any later version.
   *
- * If you want to redistribute modifications, please consider that
- * scientific software is very special. Version control is crucial -
- * bugs must be traceable. We will be happy to consider code for
- * inclusion in the official distribution, but derived work must not
- * be called official GROMACS. Details are found in the README & COPYING
- * files - if they are missing, get the official version at www.gromacs.org.
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
   *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the papers on the package - you can find them in the top README file.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
   *
- * For more info, check our website at http://www.gromacs.org
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
   *
- * And Hey:
- * GROningen Mixture of Alchemy and Childrens' Stories
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
   */
  #ifdef HAVE_CONFIG_H
  #include <config.h>
  #endif
  
  #include <math.h>
+#include <assert.h>
  #include "physics.h"
  #include "vec.h"
-#include "maths.h"
+#include "gromacs/math/utilities.h"
  #include "txtdump.h"
  #include "bondf.h"
-#include "smalloc.h"
+#include "gromacs/utility/smalloc.h"
  #include "pbc.h"
  #include "ns.h"
  #include "macros.h"
@@ -55,11 +57,11 @@
  #include "orires.h"
  #include "force.h"
  #include "nonbonded.h"
+#include "restcbt.h"
  
-#if !defined GMX_DOUBLE && defined GMX_X86_SSE2
-#include "gmx_x86_simd_single.h"
-#define SSE_PROPER_DIHEDRALS
-#endif
+#include "gromacs/simd/simd.h"
+#include "gromacs/simd/simd_math.h"
+#include "gromacs/simd/vector_operations.h"
  
  /* Find a better place for this? */
  const int cmap_coeff_matrix[] = {
@@ -112,6 +114,83 @@ static int pbc_rvec_sub(const t_pbc *pbc, const rvec xi, const rvec xj, rvec dx)
      }
  }
  
+#ifdef GMX_SIMD_HAVE_REAL
+
+/* SIMD PBC data structure, containing 1/boxdiag and the box vectors */
+typedef struct {
+    gmx_simd_real_t inv_bzz;
+    gmx_simd_real_t inv_byy;
+    gmx_simd_real_t inv_bxx;
+    gmx_simd_real_t bzx;
+    gmx_simd_real_t bzy;
+    gmx_simd_real_t bzz;
+    gmx_simd_real_t byx;
+    gmx_simd_real_t byy;
+    gmx_simd_real_t bxx;
+} pbc_simd_t;
+
+/* Set the SIMD pbc data from a normal t_pbc struct */
+static void set_pbc_simd(const t_pbc *pbc, pbc_simd_t *pbc_simd)
+{
+    rvec inv_bdiag;
+    int  d;
+
+    /* Setting inv_bdiag to 0 effectively turns off PBC */
+    clear_rvec(inv_bdiag);
+    if (pbc != NULL)
+    {
+        for (d = 0; d < pbc->ndim_ePBC; d++)
+        {
+            inv_bdiag[d] = 1.0/pbc->box[d][d];
+        }
+    }
+
+    pbc_simd->inv_bzz = gmx_simd_set1_r(inv_bdiag[ZZ]);
+    pbc_simd->inv_byy = gmx_simd_set1_r(inv_bdiag[YY]);
+    pbc_simd->inv_bxx = gmx_simd_set1_r(inv_bdiag[XX]);
+
+    if (pbc != NULL)
+    {
+        pbc_simd->bzx = gmx_simd_set1_r(pbc->box[ZZ][XX]);
+        pbc_simd->bzy = gmx_simd_set1_r(pbc->box[ZZ][YY]);
+        pbc_simd->bzz = gmx_simd_set1_r(pbc->box[ZZ][ZZ]);
+        pbc_simd->byx = gmx_simd_set1_r(pbc->box[YY][XX]);
+        pbc_simd->byy = gmx_simd_set1_r(pbc->box[YY][YY]);
+        pbc_simd->bxx = gmx_simd_set1_r(pbc->box[XX][XX]);
+    }
+    else
+    {
+        pbc_simd->bzx = gmx_simd_setzero_r();
+        pbc_simd->bzy = gmx_simd_setzero_r();
+        pbc_simd->bzz = gmx_simd_setzero_r();
+        pbc_simd->byx = gmx_simd_setzero_r();
+        pbc_simd->byy = gmx_simd_setzero_r();
+        pbc_simd->bxx = gmx_simd_setzero_r();
+    }
+}
+
+/* Correct distance vector *dx,*dy,*dz for PBC using SIMD */
+static gmx_inline void
+pbc_dx_simd(gmx_simd_real_t *dx, gmx_simd_real_t *dy, gmx_simd_real_t *dz,
+            const pbc_simd_t *pbc)
+{
+    gmx_simd_real_t sh;
+
+    sh  = gmx_simd_round_r(gmx_simd_mul_r(*dz, pbc->inv_bzz));
+    *dx = gmx_simd_fnmadd_r(sh, pbc->bzx, *dx);
+    *dy = gmx_simd_fnmadd_r(sh, pbc->bzy, *dy);
+    *dz = gmx_simd_fnmadd_r(sh, pbc->bzz, *dz);
+
+    sh  = gmx_simd_round_r(gmx_simd_mul_r(*dy, pbc->inv_byy));
+    *dx = gmx_simd_fnmadd_r(sh, pbc->byx, *dx);
+    *dy = gmx_simd_fnmadd_r(sh, pbc->byy, *dy);
+
+    sh  = gmx_simd_round_r(gmx_simd_mul_r(*dx, pbc->inv_bxx));
+    *dx = gmx_simd_fnmadd_r(sh, pbc->bxx, *dx);
+}
+
+#endif /* GMX_SIMD_HAVE_REAL */
+
  /*
   * Morse potential bond by Frank Everdij
   *
@@ -130,8 +209,8 @@ real morse_bonds(int nbonds,
                   const rvec x[], rvec f[], rvec fshift[],
                   const t_pbc *pbc, const t_graph *g,
                   real lambda, real *dvdlambda,
-                 const t_mdatoms *md, t_fcdata *fcd,
-                 int *global_atom_index)
+                 const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+                 int gmx_unused *global_atom_index)
  {
      const real one = 1.0;
      const real two = 2.0;
@@ -203,9 +282,9 @@ real cubic_bonds(int nbonds,
                   const t_iatom forceatoms[], const t_iparams forceparams[],
                   const rvec x[], rvec f[], rvec fshift[],
                   const t_pbc *pbc, const t_graph *g,
-                 real lambda, real *dvdlambda,
-                 const t_mdatoms *md, t_fcdata *fcd,
-                 int *global_atom_index)
+                 real gmx_unused lambda, real gmx_unused *dvdlambda,
+                 const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+                 int gmx_unused *global_atom_index)
  {
      const real three = 3.0;
      const real two   = 2.0;
@@ -265,8 +344,8 @@ real FENE_bonds(int nbonds,
                  const t_iatom forceatoms[], const t_iparams forceparams[],
                  const rvec x[], rvec f[], rvec fshift[],
                  const t_pbc *pbc, const t_graph *g,
-                real lambda, real *dvdlambda,
-                const t_mdatoms *md, t_fcdata *fcd,
+                real gmx_unused lambda, real gmx_unused *dvdlambda,
+                const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
                  int *global_atom_index)
  {
      const real half = 0.5;
@@ -362,8 +441,8 @@ real bonds(int nbonds,
             const rvec x[], rvec f[], rvec fshift[],
             const t_pbc *pbc, const t_graph *g,
             real lambda, real *dvdlambda,
-           const t_mdatoms *md, t_fcdata *fcd,
-           int *global_atom_index)
+           const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+           int gmx_unused *global_atom_index)
  {
      int  i, m, ki, ai, aj, type;
      real dr, dr2, fbond, vbond, fij, vtot;
@@ -424,8 +503,8 @@ real restraint_bonds(int nbonds,
                       const rvec x[], rvec f[], rvec fshift[],
                       const t_pbc *pbc, const t_graph *g,
                       real lambda, real *dvdlambda,
-                     const t_mdatoms *md, t_fcdata *fcd,
-                     int *global_atom_index)
+                     const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+                     int gmx_unused *global_atom_index)
  {
      int  i, m, ki, ai, aj, type;
      real dr, dr2, fbond, vbond, fij, vtot;
@@ -526,8 +605,8 @@ real polarize(int nbonds,
                const rvec x[], rvec f[], rvec fshift[],
                const t_pbc *pbc, const t_graph *g,
                real lambda, real *dvdlambda,
-              const t_mdatoms *md, t_fcdata *fcd,
-              int *global_atom_index)
+              const t_mdatoms *md, t_fcdata gmx_unused *fcd,
+              int gmx_unused *global_atom_index)
  {
      int  i, m, ki, ai, aj, type;
      real dr, dr2, fbond, vbond, fij, vtot, ksh;
@@ -582,8 +661,8 @@ real anharm_polarize(int nbonds,
                       const rvec x[], rvec f[], rvec fshift[],
                       const t_pbc *pbc, const t_graph *g,
                       real lambda, real *dvdlambda,
-                     const t_mdatoms *md, t_fcdata *fcd,
-                     int *global_atom_index)
+                     const t_mdatoms *md, t_fcdata gmx_unused *fcd,
+                     int gmx_unused *global_atom_index)
  {
      int  i, m, ki, ai, aj, type;
      real dr, dr2, fbond, vbond, fij, vtot, ksh, khyp, drcut, ddr, ddr3;
@@ -644,17 +723,18 @@ real anharm_polarize(int nbonds,
  
  real water_pol(int nbonds,
                 const t_iatom forceatoms[], const t_iparams forceparams[],
-               const rvec x[], rvec f[], rvec fshift[],
-               const t_pbc *pbc, const t_graph *g,
-               real lambda, real *dvdlambda,
-               const t_mdatoms *md, t_fcdata *fcd,
-               int *global_atom_index)
+               const rvec x[], rvec f[], rvec gmx_unused fshift[],
+               const t_pbc gmx_unused *pbc, const t_graph gmx_unused *g,
+               real gmx_unused lambda, real gmx_unused *dvdlambda,
+               const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+               int gmx_unused *global_atom_index)
  {
      /* This routine implements anisotropic polarizibility for water, through
       * a shell connected to a dummy with spring constant that differ in the
       * three spatial dimensions in the molecular frame.
       */
-    int  i, m, aO, aH1, aH2, aD, aS, type, type0;
+    int  i, m, aO, aH1, aH2, aD, aS, type, type0, ki;
+    ivec dt;
      rvec dOH1, dOH2, dHH, dOD, dDS, nW, kk, dx, kdx, proj;
  #ifdef DEBUG
      rvec df;
@@ -697,11 +777,11 @@ real water_pol(int nbonds,
              aS   = forceatoms[i+5];
  
              /* Compute vectors describing the water frame */
-            rvec_sub(x[aH1], x[aO], dOH1);
-            rvec_sub(x[aH2], x[aO], dOH2);
-            rvec_sub(x[aH2], x[aH1], dHH);
-            rvec_sub(x[aD], x[aO], dOD);
-            rvec_sub(x[aS], x[aD], dDS);
+            pbc_rvec_sub(pbc, x[aH1], x[aO], dOH1);
+            pbc_rvec_sub(pbc, x[aH2], x[aO], dOH2);
+            pbc_rvec_sub(pbc, x[aH2], x[aH1], dHH);
+            pbc_rvec_sub(pbc, x[aD], x[aO], dOD);
+            ki = pbc_rvec_sub(pbc, x[aS], x[aD], dDS);
              cprod(dOH1, dOH2, nW);
  
              /* Compute inverse length of normal vector
@@ -756,6 +836,13 @@ real water_pol(int nbonds,
              kdx[YY] = kk[YY]*dx[YY];
              kdx[ZZ] = kk[ZZ]*dx[ZZ];
              vtot   += iprod(dx, kdx);
+
+            if (g)
+            {
+                ivec_sub(SHIFT_IVEC(g, aS), SHIFT_IVEC(g, aD), dt);
+                ki = IVEC2IS(dt);
+            }
+
              for (m = 0; (m < DIM); m++)
              {
                  /* This is a tensor operation but written out for speed */
@@ -766,8 +853,10 @@ real water_pol(int nbonds,
  #ifdef DEBUG
                  df[m] = fij;
  #endif
-                f[aS][m] += fij;
-                f[aD][m] -= fij;
+                f[aS][m]           += fij;
+                f[aD][m]           -= fij;
+                fshift[ki][m]      += fij;
+                fshift[CENTRAL][m] -= fij;
              }
  #ifdef DEBUG
              if (debug)
@@ -819,10 +908,10 @@ static real do_1_thole(const rvec xi, const rvec xj, rvec fi, rvec fj,
  real thole_pol(int nbonds,
                 const t_iatom forceatoms[], const t_iparams forceparams[],
                 const rvec x[], rvec f[], rvec fshift[],
-               const t_pbc *pbc, const t_graph *g,
-               real lambda, real *dvdlambda,
-               const t_mdatoms *md, t_fcdata *fcd,
-               int *global_atom_index)
+               const t_pbc *pbc, const t_graph gmx_unused *g,
+               real gmx_unused lambda, real gmx_unused *dvdlambda,
+               const t_mdatoms *md, t_fcdata gmx_unused *fcd,
+               int gmx_unused *global_atom_index)
  {
      /* Interaction between two pairs of particles with opposite charge */
      int  i, type, a1, da1, a2, da2;
@@ -874,8 +963,8 @@ real angles(int nbonds,
              const rvec x[], rvec f[], rvec fshift[],
              const t_pbc *pbc, const t_graph *g,
              real lambda, real *dvdlambda,
-            const t_mdatoms *md, t_fcdata *fcd,
-            int *global_atom_index)
+            const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+            int gmx_unused *global_atom_index)
  {
      int  i, ai, aj, ak, t1, t2, type;
      rvec r_ij, r_kj;
@@ -956,13 +1045,180 @@ real angles(int nbonds,
      return vtot;
  }
  
+#ifdef GMX_SIMD_HAVE_REAL
+
+/* As angles, but using SIMD to calculate many dihedrals at once.
+ * This routines does not calculate energies and shift forces.
+ */
+static gmx_inline void
+angles_noener_simd(int nbonds,
+                   const t_iatom forceatoms[], const t_iparams forceparams[],
+                   const rvec x[], rvec f[],
+                   const t_pbc *pbc, const t_graph gmx_unused *g,
+                   real gmx_unused lambda,
+                   const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+                   int gmx_unused *global_atom_index)
+{
+    const int            nfa1 = 4;
+    int                  i, iu, s, m;
+    int                  type, ai[GMX_SIMD_REAL_WIDTH], aj[GMX_SIMD_REAL_WIDTH];
+    int                  ak[GMX_SIMD_REAL_WIDTH];
+    real                 coeff_array[2*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *coeff;
+    real                 dr_array[2*DIM*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *dr;
+    real                 f_buf_array[6*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *f_buf;
+    gmx_simd_real_t      k_S, theta0_S;
+    gmx_simd_real_t      rijx_S, rijy_S, rijz_S;
+    gmx_simd_real_t      rkjx_S, rkjy_S, rkjz_S;
+    gmx_simd_real_t      one_S;
+    gmx_simd_real_t      min_one_plus_eps_S;
+    gmx_simd_real_t      rij_rkj_S;
+    gmx_simd_real_t      nrij2_S, nrij_1_S;
+    gmx_simd_real_t      nrkj2_S, nrkj_1_S;
+    gmx_simd_real_t      cos_S, invsin_S;
+    gmx_simd_real_t      theta_S;
+    gmx_simd_real_t      st_S, sth_S;
+    gmx_simd_real_t      cik_S, cii_S, ckk_S;
+    gmx_simd_real_t      f_ix_S, f_iy_S, f_iz_S;
+    gmx_simd_real_t      f_kx_S, f_ky_S, f_kz_S;
+    pbc_simd_t           pbc_simd;
+
+    /* Ensure register memory alignment */
+    coeff = gmx_simd_align_r(coeff_array);
+    dr    = gmx_simd_align_r(dr_array);
+    f_buf = gmx_simd_align_r(f_buf_array);
+
+    set_pbc_simd(pbc, &pbc_simd);
+
+    one_S = gmx_simd_set1_r(1.0);
+
+    /* The smallest number > -1 */
+    min_one_plus_eps_S = gmx_simd_set1_r(-1.0 + 2*GMX_REAL_EPS);
+
+    /* nbonds is the number of angles times nfa1, here we step GMX_SIMD_REAL_WIDTH angles */
+    for (i = 0; (i < nbonds); i += GMX_SIMD_REAL_WIDTH*nfa1)
+    {
+        /* Collect atoms for GMX_SIMD_REAL_WIDTH angles.
+         * iu indexes into forceatoms, we should not let iu go beyond nbonds.
+         */
+        iu = i;
+        for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++)
+        {
+            type  = forceatoms[iu];
+            ai[s] = forceatoms[iu+1];
+            aj[s] = forceatoms[iu+2];
+            ak[s] = forceatoms[iu+3];
+
+            coeff[s]                     = forceparams[type].harmonic.krA;
+            coeff[GMX_SIMD_REAL_WIDTH+s] = forceparams[type].harmonic.rA*DEG2RAD;
+
+            /* If you can't use pbc_dx_simd below for PBC, e.g. because
+             * you can't round in SIMD, use pbc_rvec_sub here.
+             */
+            /* Store the non PBC corrected distances packed and aligned */
+            for (m = 0; m < DIM; m++)
+            {
+                dr[s +      m *GMX_SIMD_REAL_WIDTH] = x[ai[s]][m] - x[aj[s]][m];
+                dr[s + (DIM+m)*GMX_SIMD_REAL_WIDTH] = x[ak[s]][m] - x[aj[s]][m];
+            }
+
+            /* At the end fill the arrays with identical entries */
+            if (iu + nfa1 < nbonds)
+            {
+                iu += nfa1;
+            }
+        }
+
+        k_S       = gmx_simd_load_r(coeff);
+        theta0_S  = gmx_simd_load_r(coeff+GMX_SIMD_REAL_WIDTH);
+
+        rijx_S    = gmx_simd_load_r(dr + 0*GMX_SIMD_REAL_WIDTH);
+        rijy_S    = gmx_simd_load_r(dr + 1*GMX_SIMD_REAL_WIDTH);
+        rijz_S    = gmx_simd_load_r(dr + 2*GMX_SIMD_REAL_WIDTH);
+        rkjx_S    = gmx_simd_load_r(dr + 3*GMX_SIMD_REAL_WIDTH);
+        rkjy_S    = gmx_simd_load_r(dr + 4*GMX_SIMD_REAL_WIDTH);
+        rkjz_S    = gmx_simd_load_r(dr + 5*GMX_SIMD_REAL_WIDTH);
+
+        pbc_dx_simd(&rijx_S, &rijy_S, &rijz_S, &pbc_simd);
+        pbc_dx_simd(&rkjx_S, &rkjy_S, &rkjz_S, &pbc_simd);
+
+        rij_rkj_S = gmx_simd_iprod_r(rijx_S, rijy_S, rijz_S,
+                                     rkjx_S, rkjy_S, rkjz_S);
+
+        nrij2_S   = gmx_simd_norm2_r(rijx_S, rijy_S, rijz_S);
+        nrkj2_S   = gmx_simd_norm2_r(rkjx_S, rkjy_S, rkjz_S);
+
+        nrij_1_S  = gmx_simd_invsqrt_r(nrij2_S);
+        nrkj_1_S  = gmx_simd_invsqrt_r(nrkj2_S);
+
+        cos_S     = gmx_simd_mul_r(rij_rkj_S, gmx_simd_mul_r(nrij_1_S, nrkj_1_S));
+
+        /* To allow for 180 degrees, we take the max of cos and -1 + 1bit,
+         * so we can safely get the 1/sin from 1/sqrt(1 - cos^2).
+         * This also ensures that rounding errors would cause the argument
+         * of gmx_simd_acos_r to be < -1.
+         * Note that we do not take precautions for cos(0)=1, so the outer
+         * atoms in an angle should not be on top of each other.
+         */
+        cos_S     = gmx_simd_max_r(cos_S, min_one_plus_eps_S);
+
+        theta_S   = gmx_simd_acos_r(cos_S);
+
+        invsin_S  = gmx_simd_invsqrt_r(gmx_simd_sub_r(one_S, gmx_simd_mul_r(cos_S, cos_S)));
+
+        st_S      = gmx_simd_mul_r(gmx_simd_mul_r(k_S, gmx_simd_sub_r(theta0_S, theta_S)),
+                                   invsin_S);
+        sth_S     = gmx_simd_mul_r(st_S, cos_S);
+
+        cik_S     = gmx_simd_mul_r(st_S,  gmx_simd_mul_r(nrij_1_S, nrkj_1_S));
+        cii_S     = gmx_simd_mul_r(sth_S, gmx_simd_mul_r(nrij_1_S, nrij_1_S));
+        ckk_S     = gmx_simd_mul_r(sth_S, gmx_simd_mul_r(nrkj_1_S, nrkj_1_S));
+
+        f_ix_S    = gmx_simd_mul_r(cii_S, rijx_S);
+        f_ix_S    = gmx_simd_fnmadd_r(cik_S, rkjx_S, f_ix_S);
+        f_iy_S    = gmx_simd_mul_r(cii_S, rijy_S);
+        f_iy_S    = gmx_simd_fnmadd_r(cik_S, rkjy_S, f_iy_S);
+        f_iz_S    = gmx_simd_mul_r(cii_S, rijz_S);
+        f_iz_S    = gmx_simd_fnmadd_r(cik_S, rkjz_S, f_iz_S);
+        f_kx_S    = gmx_simd_mul_r(ckk_S, rkjx_S);
+        f_kx_S    = gmx_simd_fnmadd_r(cik_S, rijx_S, f_kx_S);
+        f_ky_S    = gmx_simd_mul_r(ckk_S, rkjy_S);
+        f_ky_S    = gmx_simd_fnmadd_r(cik_S, rijy_S, f_ky_S);
+        f_kz_S    = gmx_simd_mul_r(ckk_S, rkjz_S);
+        f_kz_S    = gmx_simd_fnmadd_r(cik_S, rijz_S, f_kz_S);
+
+        gmx_simd_store_r(f_buf + 0*GMX_SIMD_REAL_WIDTH, f_ix_S);
+        gmx_simd_store_r(f_buf + 1*GMX_SIMD_REAL_WIDTH, f_iy_S);
+        gmx_simd_store_r(f_buf + 2*GMX_SIMD_REAL_WIDTH, f_iz_S);
+        gmx_simd_store_r(f_buf + 3*GMX_SIMD_REAL_WIDTH, f_kx_S);
+        gmx_simd_store_r(f_buf + 4*GMX_SIMD_REAL_WIDTH, f_ky_S);
+        gmx_simd_store_r(f_buf + 5*GMX_SIMD_REAL_WIDTH, f_kz_S);
+
+        iu = i;
+        s  = 0;
+        do
+        {
+            for (m = 0; m < DIM; m++)
+            {
+                f[ai[s]][m] += f_buf[s + m*GMX_SIMD_REAL_WIDTH];
+                f[aj[s]][m] -= f_buf[s + m*GMX_SIMD_REAL_WIDTH] + f_buf[s + (DIM+m)*GMX_SIMD_REAL_WIDTH];
+                f[ak[s]][m] += f_buf[s + (DIM+m)*GMX_SIMD_REAL_WIDTH];
+            }
+            s++;
+            iu += nfa1;
+        }
+        while (s < GMX_SIMD_REAL_WIDTH && iu < nbonds);
+    }
+}
+
+#endif /* GMX_SIMD_HAVE_REAL */
+
  real linear_angles(int nbonds,
                     const t_iatom forceatoms[], const t_iparams forceparams[],
                     const rvec x[], rvec f[], rvec fshift[],
                     const t_pbc *pbc, const t_graph *g,
                     real lambda, real *dvdlambda,
-                   const t_mdatoms *md, t_fcdata *fcd,
-                   int *global_atom_index)
+                   const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+                   int gmx_unused *global_atom_index)
  {
      int  i, m, ai, aj, ak, t1, t2, type;
      rvec f_i, f_j, f_k;
@@ -1031,8 +1287,8 @@ real urey_bradley(int nbonds,
                    const rvec x[], rvec f[], rvec fshift[],
                    const t_pbc *pbc, const t_graph *g,
                    real lambda, real *dvdlambda,
-                  const t_mdatoms *md, t_fcdata *fcd,
-                  int *global_atom_index)
+                  const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+                  int gmx_unused *global_atom_index)
  {
      int  i, m, ai, aj, ak, t1, t2, type, ki;
      rvec r_ij, r_kj, r_ik;
@@ -1145,9 +1401,9 @@ real quartic_angles(int nbonds,
                      const t_iatom forceatoms[], const t_iparams forceparams[],
                      const rvec x[], rvec f[], rvec fshift[],
                      const t_pbc *pbc, const t_graph *g,
-                    real lambda, real *dvdlambda,
-                    const t_mdatoms *md, t_fcdata *fcd,
-                    int *global_atom_index)
+                    real gmx_unused lambda, real gmx_unused *dvdlambda,
+                    const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+                    int gmx_unused *global_atom_index)
  {
      int  i, j, ai, aj, ak, t1, t2, type;
      rvec r_ij, r_kj;
@@ -1254,161 +1510,136 @@ real dih_angle(const rvec xi, const rvec xj, const rvec xk, const rvec xl,
  }
  
  
-#ifdef SSE_PROPER_DIHEDRALS
-
-/* x86 SIMD inner-product of 4 float vectors */
-#define GMX_MM_IPROD_PS(ax, ay, az, bx, by, bz)                 \
-    _mm_add_ps(_mm_add_ps(_mm_mul_ps(ax, bx), _mm_mul_ps(ay, by)), _mm_mul_ps(az, bz))
-
-/* x86 SIMD norm^2 of 4 float vectors */
-#define GMX_MM_NORM2_PS(ax, ay, az) GMX_MM_IPROD_PS(ax, ay, az, ax, ay, az)
-
-/* x86 SIMD cross-product of 4 float vectors */
-#define GMX_MM_CPROD_PS(ax, ay, az, bx, by, bz, cx, cy, cz)        \
-    {                                                          \
-        cx = _mm_sub_ps(_mm_mul_ps(ay, bz), _mm_mul_ps(az, by));  \
-        cy = _mm_sub_ps(_mm_mul_ps(az, bx), _mm_mul_ps(ax, bz));  \
-        cz = _mm_sub_ps(_mm_mul_ps(ax, by), _mm_mul_ps(ay, bx));  \
-    }
-
-/* load 4 rvec's into 3 x86 SIMD float registers */
-#define load_rvec4(r0, r1, r2, r3, rx_SSE, ry_SSE, rz_SSE)          \
-    {                                                             \
-        __m128 tmp;                                               \
-        rx_SSE = _mm_load_ps(r0);                                 \
-        ry_SSE = _mm_load_ps(r1);                                 \
-        rz_SSE = _mm_load_ps(r2);                                 \
-        tmp    = _mm_load_ps(r3);                                 \
-        _MM_TRANSPOSE4_PS(rx_SSE, ry_SSE, rz_SSE, tmp);              \
-    }
-
-#define store_rvec4(rx_SSE, ry_SSE, rz_SSE, r0, r1, r2, r3)         \
-    {                                                             \
-        __m128 tmp = _mm_setzero_ps();                              \
-        _MM_TRANSPOSE4_PS(rx_SSE, ry_SSE, rz_SSE, tmp);              \
-        _mm_store_ps(r0, rx_SSE);                                  \
-        _mm_store_ps(r1, ry_SSE);                                  \
-        _mm_store_ps(r2, rz_SSE);                                  \
-        _mm_store_ps(r3, tmp   );                                  \
-    }
-
-/* An rvec in a structure which can be allocated 16-byte aligned */
-typedef struct {
-    rvec  v;
-    float f;
-} rvec_sse_t;
+#ifdef GMX_SIMD_HAVE_REAL
  
-/* As dih_angle above, but calculates 4 dihedral angles at once using SSE,
+/* As dih_angle above, but calculates 4 dihedral angles at once using SIMD,
   * also calculates the pre-factor required for the dihedral force update.
- * Note that bv and buf should be 16-byte aligned.
+ * Note that bv and buf should be register aligned.
   */
-static void
-dih_angle_sse(const rvec *x,
-              int ai[4], int aj[4], int ak[4], int al[4],
-              const t_pbc *pbc,
-              int t1[4], int t2[4], int t3[4],
-              rvec_sse_t *bv,
-              real *buf)
+static gmx_inline void
+dih_angle_simd(const rvec *x,
+               const int *ai, const int *aj, const int *ak, const int *al,
+               const pbc_simd_t *pbc,
+               real *dr,
+               gmx_simd_real_t *phi_S,
+               gmx_simd_real_t *mx_S, gmx_simd_real_t *my_S, gmx_simd_real_t *mz_S,
+               gmx_simd_real_t *nx_S, gmx_simd_real_t *ny_S, gmx_simd_real_t *nz_S,
+               gmx_simd_real_t *nrkj_m2_S,
+               gmx_simd_real_t *nrkj_n2_S,
+               real *p,
+               real *q)
  {
-    int    s;
-    __m128 rijx_SSE, rijy_SSE, rijz_SSE;
-    __m128 rkjx_SSE, rkjy_SSE, rkjz_SSE;
-    __m128 rklx_SSE, rkly_SSE, rklz_SSE;
-    __m128 mx_SSE, my_SSE, mz_SSE;
-    __m128 nx_SSE, ny_SSE, nz_SSE;
-    __m128 cx_SSE, cy_SSE, cz_SSE;
-    __m128 cn_SSE;
-    __m128 s_SSE;
-    __m128 phi_SSE;
-    __m128 ipr_SSE;
-    int    signs;
-    __m128 iprm_SSE, iprn_SSE;
-    __m128 nrkj2_SSE, nrkj_1_SSE, nrkj_2_SSE, nrkj_SSE;
-    __m128 nrkj_m2_SSE, nrkj_n2_SSE;
-    __m128 p_SSE, q_SSE;
-    __m128 fmin_SSE = _mm_set1_ps(GMX_FLOAT_MIN);
-
-    for (s = 0; s < 4; s++)
+    int             s, m;
+    gmx_simd_real_t rijx_S, rijy_S, rijz_S;
+    gmx_simd_real_t rkjx_S, rkjy_S, rkjz_S;
+    gmx_simd_real_t rklx_S, rkly_S, rklz_S;
+    gmx_simd_real_t cx_S, cy_S, cz_S;
+    gmx_simd_real_t cn_S;
+    gmx_simd_real_t s_S;
+    gmx_simd_real_t ipr_S;
+    gmx_simd_real_t iprm_S, iprn_S;
+    gmx_simd_real_t nrkj2_S, nrkj_1_S, nrkj_2_S, nrkj_S;
+    gmx_simd_real_t toler_S;
+    gmx_simd_real_t p_S, q_S;
+    gmx_simd_real_t nrkj2_min_S;
+    gmx_simd_real_t real_eps_S;
+
+    /* Used to avoid division by zero.
+     * We take into acount that we multiply the result by real_eps_S.
+     */
+    nrkj2_min_S = gmx_simd_set1_r(GMX_REAL_MIN/(2*GMX_REAL_EPS));
+
+    /* The value of the last significant bit (GMX_REAL_EPS is half of that) */
+    real_eps_S  = gmx_simd_set1_r(2*GMX_REAL_EPS);
+
+    for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++)
      {
-        t1[s] = pbc_rvec_sub(pbc, x[ai[s]], x[aj[s]], bv[0+s].v);
-        t2[s] = pbc_rvec_sub(pbc, x[ak[s]], x[aj[s]], bv[4+s].v);
-        t3[s] = pbc_rvec_sub(pbc, x[ak[s]], x[al[s]], bv[8+s].v);
+        /* If you can't use pbc_dx_simd below for PBC, e.g. because
+         * you can't round in SIMD, use pbc_rvec_sub here.
+         */
+        for (m = 0; m < DIM; m++)
+        {
+            dr[s + (0*DIM + m)*GMX_SIMD_REAL_WIDTH] = x[ai[s]][m] - x[aj[s]][m];
+            dr[s + (1*DIM + m)*GMX_SIMD_REAL_WIDTH] = x[ak[s]][m] - x[aj[s]][m];
+            dr[s + (2*DIM + m)*GMX_SIMD_REAL_WIDTH] = x[ak[s]][m] - x[al[s]][m];
+        }
      }
  
-    load_rvec4(bv[0].v, bv[1].v, bv[2].v, bv[3].v, rijx_SSE, rijy_SSE, rijz_SSE);
-    load_rvec4(bv[4].v, bv[5].v, bv[6].v, bv[7].v, rkjx_SSE, rkjy_SSE, rkjz_SSE);
-    load_rvec4(bv[8].v, bv[9].v, bv[10].v, bv[11].v, rklx_SSE, rkly_SSE, rklz_SSE);
+    rijx_S = gmx_simd_load_r(dr + 0*GMX_SIMD_REAL_WIDTH);
+    rijy_S = gmx_simd_load_r(dr + 1*GMX_SIMD_REAL_WIDTH);
+    rijz_S = gmx_simd_load_r(dr + 2*GMX_SIMD_REAL_WIDTH);
+    rkjx_S = gmx_simd_load_r(dr + 3*GMX_SIMD_REAL_WIDTH);
+    rkjy_S = gmx_simd_load_r(dr + 4*GMX_SIMD_REAL_WIDTH);
+    rkjz_S = gmx_simd_load_r(dr + 5*GMX_SIMD_REAL_WIDTH);
+    rklx_S = gmx_simd_load_r(dr + 6*GMX_SIMD_REAL_WIDTH);
+    rkly_S = gmx_simd_load_r(dr + 7*GMX_SIMD_REAL_WIDTH);
+    rklz_S = gmx_simd_load_r(dr + 8*GMX_SIMD_REAL_WIDTH);
  
-    GMX_MM_CPROD_PS(rijx_SSE, rijy_SSE, rijz_SSE,
-                    rkjx_SSE, rkjy_SSE, rkjz_SSE,
-                    mx_SSE, my_SSE, mz_SSE);
+    pbc_dx_simd(&rijx_S, &rijy_S, &rijz_S, pbc);
+    pbc_dx_simd(&rkjx_S, &rkjy_S, &rkjz_S, pbc);
+    pbc_dx_simd(&rklx_S, &rkly_S, &rklz_S, pbc);
  
-    GMX_MM_CPROD_PS(rkjx_SSE, rkjy_SSE, rkjz_SSE,
-                    rklx_SSE, rkly_SSE, rklz_SSE,
-                    nx_SSE, ny_SSE, nz_SSE);
+    gmx_simd_cprod_r(rijx_S, rijy_S, rijz_S,
+                     rkjx_S, rkjy_S, rkjz_S,
+                     mx_S, my_S, mz_S);
  
-    GMX_MM_CPROD_PS(mx_SSE, my_SSE, mz_SSE,
-                    nx_SSE, ny_SSE, nz_SSE,
-                    cx_SSE, cy_SSE, cz_SSE);
+    gmx_simd_cprod_r(rkjx_S, rkjy_S, rkjz_S,
+                     rklx_S, rkly_S, rklz_S,
+                     nx_S, ny_S, nz_S);
  
-    cn_SSE = gmx_mm_sqrt_ps(GMX_MM_NORM2_PS(cx_SSE, cy_SSE, cz_SSE));
+    gmx_simd_cprod_r(*mx_S, *my_S, *mz_S,
+                     *nx_S, *ny_S, *nz_S,
+                     &cx_S, &cy_S, &cz_S);
  
-    s_SSE = GMX_MM_IPROD_PS(mx_SSE, my_SSE, mz_SSE, nx_SSE, ny_SSE, nz_SSE);
+    cn_S       = gmx_simd_sqrt_r(gmx_simd_norm2_r(cx_S, cy_S, cz_S));
  
-    phi_SSE = gmx_mm_atan2_ps(cn_SSE, s_SSE);
-    _mm_store_ps(buf+16, phi_SSE);
+    s_S        = gmx_simd_iprod_r(*mx_S, *my_S, *mz_S, *nx_S, *ny_S, *nz_S);
  
-    ipr_SSE = GMX_MM_IPROD_PS(rijx_SSE, rijy_SSE, rijz_SSE,
-                              nx_SSE, ny_SSE, nz_SSE);
+    /* Determine the dihedral angle, the sign might need correction */
+    *phi_S     = gmx_simd_atan2_r(cn_S, s_S);
  
-    signs = _mm_movemask_ps(ipr_SSE);
+    ipr_S      = gmx_simd_iprod_r(rijx_S, rijy_S, rijz_S,
+                                  *nx_S, *ny_S, *nz_S);
  
-    for (s = 0; s < 4; s++)
-    {
-        if (signs & (1<<s))
-        {
-            buf[16+s] = -buf[16+s];
-        }
-    }
-
-    iprm_SSE    = GMX_MM_NORM2_PS(mx_SSE, my_SSE, mz_SSE);
-    iprn_SSE    = GMX_MM_NORM2_PS(nx_SSE, ny_SSE, nz_SSE);
+    iprm_S     = gmx_simd_norm2_r(*mx_S, *my_S, *mz_S);
+    iprn_S     = gmx_simd_norm2_r(*nx_S, *ny_S, *nz_S);
  
-    /* store_rvec4 messes with the input, don't use it after this! */
-    store_rvec4(mx_SSE, my_SSE, mz_SSE, bv[0].v, bv[1].v, bv[2].v, bv[3].v);
-    store_rvec4(nx_SSE, ny_SSE, nz_SSE, bv[4].v, bv[5].v, bv[6].v, bv[7].v);
-
-    nrkj2_SSE   = GMX_MM_NORM2_PS(rkjx_SSE, rkjy_SSE, rkjz_SSE);
+    nrkj2_S    = gmx_simd_norm2_r(rkjx_S, rkjy_S, rkjz_S);
  
      /* Avoid division by zero. When zero, the result is multiplied by 0
       * anyhow, so the 3 max below do not affect the final result.
       */
-    nrkj2_SSE   = _mm_max_ps(nrkj2_SSE, fmin_SSE);
-    nrkj_1_SSE  = gmx_mm_invsqrt_ps(nrkj2_SSE);
-    nrkj_2_SSE  = _mm_mul_ps(nrkj_1_SSE, nrkj_1_SSE);
-    nrkj_SSE    = _mm_mul_ps(nrkj2_SSE, nrkj_1_SSE);
-
-    iprm_SSE    = _mm_max_ps(iprm_SSE, fmin_SSE);
-    iprn_SSE    = _mm_max_ps(iprn_SSE, fmin_SSE);
-    nrkj_m2_SSE = _mm_mul_ps(nrkj_SSE, gmx_mm_inv_ps(iprm_SSE));
-    nrkj_n2_SSE = _mm_mul_ps(nrkj_SSE, gmx_mm_inv_ps(iprn_SSE));
+    nrkj2_S    = gmx_simd_max_r(nrkj2_S, nrkj2_min_S);
+    nrkj_1_S   = gmx_simd_invsqrt_r(nrkj2_S);
+    nrkj_2_S   = gmx_simd_mul_r(nrkj_1_S, nrkj_1_S);
+    nrkj_S     = gmx_simd_mul_r(nrkj2_S, nrkj_1_S);
  
-    _mm_store_ps(buf+0, nrkj_m2_SSE);
-    _mm_store_ps(buf+4, nrkj_n2_SSE);
+    toler_S    = gmx_simd_mul_r(nrkj2_S, real_eps_S);
  
-    p_SSE       = GMX_MM_IPROD_PS(rijx_SSE, rijy_SSE, rijz_SSE,
-                                  rkjx_SSE, rkjy_SSE, rkjz_SSE);
-    p_SSE       = _mm_mul_ps(p_SSE, nrkj_2_SSE);
-
-    q_SSE       = GMX_MM_IPROD_PS(rklx_SSE, rkly_SSE, rklz_SSE,
-                                  rkjx_SSE, rkjy_SSE, rkjz_SSE);
-    q_SSE       = _mm_mul_ps(q_SSE, nrkj_2_SSE);
-
-    _mm_store_ps(buf+8, p_SSE);
-    _mm_store_ps(buf+12, q_SSE);
+    /* Here the plain-C code uses a conditional, but we can't do that in SIMD.
+     * So we take a max with the tolerance instead. Since we multiply with
+     * m or n later, the max does not affect the results.
+     */
+    iprm_S     = gmx_simd_max_r(iprm_S, toler_S);
+    iprn_S     = gmx_simd_max_r(iprn_S, toler_S);
+    *nrkj_m2_S = gmx_simd_mul_r(nrkj_S, gmx_simd_inv_r(iprm_S));
+    *nrkj_n2_S = gmx_simd_mul_r(nrkj_S, gmx_simd_inv_r(iprn_S));
+
+    /* Set sign of phi_S with the sign of ipr_S; phi_S is currently positive */
+    *phi_S     = gmx_simd_xor_sign_r(*phi_S, ipr_S);
+    p_S        = gmx_simd_iprod_r(rijx_S, rijy_S, rijz_S,
+                                  rkjx_S, rkjy_S, rkjz_S);
+    p_S        = gmx_simd_mul_r(p_S, nrkj_2_S);
+
+    q_S        = gmx_simd_iprod_r(rklx_S, rkly_S, rklz_S,
+                                  rkjx_S, rkjy_S, rkjz_S);
+    q_S        = gmx_simd_mul_r(q_S, nrkj_2_S);
+
+    gmx_simd_store_r(p, p_S);
+    gmx_simd_store_r(q, q_S);
  }
  
-#endif /* SSE_PROPER_DIHEDRALS */
+#endif /* GMX_SIMD_HAVE_REAL */
  
  
  void do_dih_fup(int i, int j, int k, int l, real ddphi,
@@ -1520,21 +1751,22 @@ do_dih_fup_noshiftf(int i, int j, int k, int l, real ddphi,
  }
  
  /* As do_dih_fup_noshiftf above, but with pre-calculated pre-factors */
-static void
-do_dih_fup_noshiftf_precalc(int i, int j, int k, int l, real ddphi,
-                            real nrkj_m2, real nrkj_n2,
+static gmx_inline void
+do_dih_fup_noshiftf_precalc(int i, int j, int k, int l,
                              real p, real q,
-                            rvec m, rvec n, rvec f[])
+                            real f_i_x, real f_i_y, real f_i_z,
+                            real mf_l_x, real mf_l_y, real mf_l_z,
+                            rvec f[])
  {
      rvec f_i, f_j, f_k, f_l;
-    rvec uvec, vvec, svec, dx_jl;
-    real a, b, toler;
-    ivec jt, dt_ij, dt_kj, dt_lj;
-
-    a = -ddphi*nrkj_m2;
-    svmul(a, m, f_i);
-    b =  ddphi*nrkj_n2;
-    svmul(b, n, f_l);
+    rvec uvec, vvec, svec;
+
+    f_i[XX] = f_i_x;
+    f_i[YY] = f_i_y;
+    f_i[ZZ] = f_i_z;
+    f_l[XX] = -mf_l_x;
+    f_l[YY] = -mf_l_y;
+    f_l[ZZ] = -mf_l_z;
      svmul(p, f_i, uvec);
      svmul(q, f_l, vvec);
      rvec_sub(uvec, vvec, svec);
@@ -1634,8 +1866,8 @@ real pdihs(int nbonds,
             const rvec x[], rvec f[], rvec fshift[],
             const t_pbc *pbc, const t_graph *g,
             real lambda, real *dvdlambda,
-           const t_mdatoms *md, t_fcdata *fcd,
-           int *global_atom_index)
+           const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+           int gmx_unused *global_atom_index)
  {
      int  i, type, ai, aj, ak, al;
      int  t1, t2, t3;
@@ -1693,10 +1925,10 @@ static void
  pdihs_noener(int nbonds,
               const t_iatom forceatoms[], const t_iparams forceparams[],
               const rvec x[], rvec f[],
-             const t_pbc *pbc, const t_graph *g,
+             const t_pbc gmx_unused *pbc, const t_graph gmx_unused *g,
               real lambda,
-             const t_mdatoms *md, t_fcdata *fcd,
-             int *global_atom_index)
+             const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+             int gmx_unused *global_atom_index)
  {
      int  i, type, ai, aj, ak, al;
      int  t1, t2, t3;
@@ -1742,98 +1974,285 @@ pdihs_noener(int nbonds,
  }
  
  
-#ifdef SSE_PROPER_DIHEDRALS
+#ifdef GMX_SIMD_HAVE_REAL
  
-/* As pdihs_noner above, but using SSE to calculate 4 dihedrals at once */
+/* As pdihs_noner above, but using SIMD to calculate many dihedrals at once */
  static void
-pdihs_noener_sse(int nbonds,
-                 const t_iatom forceatoms[], const t_iparams forceparams[],
-                 const rvec x[], rvec f[],
-                 const t_pbc *pbc, const t_graph *g,
-                 real lambda,
-                 const t_mdatoms *md, t_fcdata *fcd,
-                 int *global_atom_index)
+pdihs_noener_simd(int nbonds,
+                  const t_iatom forceatoms[], const t_iparams forceparams[],
+                  const rvec x[], rvec f[],
+                  const t_pbc *pbc, const t_graph gmx_unused *g,
+                  real gmx_unused lambda,
+                  const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+                  int gmx_unused *global_atom_index)
  {
-    int        i, i4, s;
-    int        type, ai[4], aj[4], ak[4], al[4];
-    int        t1[4], t2[4], t3[4];
-    int        mult[4];
-    real       cp[4], mdphi[4];
-    real       ddphi;
-    rvec_sse_t rs_array[13], *rs;
-    real       buf_array[24], *buf;
-    __m128     mdphi_SSE, sin_SSE, cos_SSE;
-
-    /* Ensure 16-byte alignment */
-    rs  = (rvec_sse_t *)(((size_t)(rs_array +1)) & (~((size_t)15)));
-    buf =      (float *)(((size_t)(buf_array+3)) & (~((size_t)15)));
-
-    for (i = 0; (i < nbonds); i += 20)
+    const int             nfa1 = 5;
+    int                   i, iu, s;
+    int                   type, ai[GMX_SIMD_REAL_WIDTH], aj[GMX_SIMD_REAL_WIDTH], ak[GMX_SIMD_REAL_WIDTH], al[GMX_SIMD_REAL_WIDTH];
+    real                  ddphi;
+    real                  dr_array[3*DIM*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *dr;
+    real                  buf_array[7*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *buf;
+    real                 *cp, *phi0, *mult, *phi, *p, *q, *sf_i, *msf_l;
+    gmx_simd_real_t       phi0_S, phi_S;
+    gmx_simd_real_t       mx_S, my_S, mz_S;
+    gmx_simd_real_t       nx_S, ny_S, nz_S;
+    gmx_simd_real_t       nrkj_m2_S, nrkj_n2_S;
+    gmx_simd_real_t       cp_S, mdphi_S, mult_S;
+    gmx_simd_real_t       sin_S, cos_S;
+    gmx_simd_real_t       mddphi_S;
+    gmx_simd_real_t       sf_i_S, msf_l_S;
+    pbc_simd_t            pbc_simd;
+
+    /* Ensure SIMD register alignment */
+    dr  = gmx_simd_align_r(dr_array);
+    buf = gmx_simd_align_r(buf_array);
+
+    /* Extract aligned pointer for parameters and variables */
+    cp    = buf + 0*GMX_SIMD_REAL_WIDTH;
+    phi0  = buf + 1*GMX_SIMD_REAL_WIDTH;
+    mult  = buf + 2*GMX_SIMD_REAL_WIDTH;
+    p     = buf + 3*GMX_SIMD_REAL_WIDTH;
+    q     = buf + 4*GMX_SIMD_REAL_WIDTH;
+    sf_i  = buf + 5*GMX_SIMD_REAL_WIDTH;
+    msf_l = buf + 6*GMX_SIMD_REAL_WIDTH;
+
+    set_pbc_simd(pbc, &pbc_simd);
+
+    /* nbonds is the number of dihedrals times nfa1, here we step GMX_SIMD_REAL_WIDTH dihs */
+    for (i = 0; (i < nbonds); i += GMX_SIMD_REAL_WIDTH*nfa1)
      {
-        /* Collect atoms quadruplets for 4 dihedrals */
-        i4 = i;
-        for (s = 0; s < 4; s++)
-        {
-            ai[s] = forceatoms[i4+1];
-            aj[s] = forceatoms[i4+2];
-            ak[s] = forceatoms[i4+3];
-            al[s] = forceatoms[i4+4];
+        /* Collect atoms quadruplets for GMX_SIMD_REAL_WIDTH dihedrals.
+         * iu indexes into forceatoms, we should not let iu go beyond nbonds.
+         */
+        iu = i;
+        for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++)
+        {
+            type  = forceatoms[iu];
+            ai[s] = forceatoms[iu+1];
+            aj[s] = forceatoms[iu+2];
+            ak[s] = forceatoms[iu+3];
+            al[s] = forceatoms[iu+4];
+
+            cp[s]   = forceparams[type].pdihs.cpA;
+            phi0[s] = forceparams[type].pdihs.phiA*DEG2RAD;
+            mult[s] = forceparams[type].pdihs.mult;
+
              /* At the end fill the arrays with identical entries */
-            if (i4 + 5 < nbonds)
+            if (iu + nfa1 < nbonds)
              {
-                i4 += 5;
+                iu += nfa1;
              }
          }
  
-        /* Caclulate 4 dihedral angles at once */
-        dih_angle_sse(x, ai, aj, ak, al, pbc, t1, t2, t3, rs, buf);
+        /* Caclulate GMX_SIMD_REAL_WIDTH dihedral angles at once */
+        dih_angle_simd(x, ai, aj, ak, al, &pbc_simd,
+                       dr,
+                       &phi_S,
+                       &mx_S, &my_S, &mz_S,
+                       &nx_S, &ny_S, &nz_S,
+                       &nrkj_m2_S,
+                       &nrkj_n2_S,
+                       p, q);
+
+        cp_S     = gmx_simd_load_r(cp);
+        phi0_S   = gmx_simd_load_r(phi0);
+        mult_S   = gmx_simd_load_r(mult);
+
+        mdphi_S  = gmx_simd_sub_r(gmx_simd_mul_r(mult_S, phi_S), phi0_S);
+
+        /* Calculate GMX_SIMD_REAL_WIDTH sines at once */
+        gmx_simd_sincos_r(mdphi_S, &sin_S, &cos_S);
+        mddphi_S = gmx_simd_mul_r(gmx_simd_mul_r(cp_S, mult_S), sin_S);
+        sf_i_S   = gmx_simd_mul_r(mddphi_S, nrkj_m2_S);
+        msf_l_S  = gmx_simd_mul_r(mddphi_S, nrkj_n2_S);
+
+        /* After this m?_S will contain f[i] */
+        mx_S     = gmx_simd_mul_r(sf_i_S, mx_S);
+        my_S     = gmx_simd_mul_r(sf_i_S, my_S);
+        mz_S     = gmx_simd_mul_r(sf_i_S, mz_S);
+
+        /* After this m?_S will contain -f[l] */
+        nx_S     = gmx_simd_mul_r(msf_l_S, nx_S);
+        ny_S     = gmx_simd_mul_r(msf_l_S, ny_S);
+        nz_S     = gmx_simd_mul_r(msf_l_S, nz_S);
+
+        gmx_simd_store_r(dr + 0*GMX_SIMD_REAL_WIDTH, mx_S);
+        gmx_simd_store_r(dr + 1*GMX_SIMD_REAL_WIDTH, my_S);
+        gmx_simd_store_r(dr + 2*GMX_SIMD_REAL_WIDTH, mz_S);
+        gmx_simd_store_r(dr + 3*GMX_SIMD_REAL_WIDTH, nx_S);
+        gmx_simd_store_r(dr + 4*GMX_SIMD_REAL_WIDTH, ny_S);
+        gmx_simd_store_r(dr + 5*GMX_SIMD_REAL_WIDTH, nz_S);
+
+        iu = i;
+        s  = 0;
+        do
+        {
+            do_dih_fup_noshiftf_precalc(ai[s], aj[s], ak[s], al[s],
+                                        p[s], q[s],
+                                        dr[     XX *GMX_SIMD_REAL_WIDTH+s],
+                                        dr[     YY *GMX_SIMD_REAL_WIDTH+s],
+                                        dr[     ZZ *GMX_SIMD_REAL_WIDTH+s],
+                                        dr[(DIM+XX)*GMX_SIMD_REAL_WIDTH+s],
+                                        dr[(DIM+YY)*GMX_SIMD_REAL_WIDTH+s],
+                                        dr[(DIM+ZZ)*GMX_SIMD_REAL_WIDTH+s],
+                                        f);
+            s++;
+            iu += nfa1;
+        }
+        while (s < GMX_SIMD_REAL_WIDTH && iu < nbonds);
+    }
+}
  
-        i4 = i;
-        for (s = 0; s < 4; s++)
+/* This is mostly a copy of pdihs_noener_simd above, but with using
+ * the RB potential instead of a harmonic potential.
+ * This function can replace rbdihs() when no energy and virial are needed.
+ */
+static void
+rbdihs_noener_simd(int nbonds,
+                   const t_iatom forceatoms[], const t_iparams forceparams[],
+                   const rvec x[], rvec f[],
+                   const t_pbc *pbc, const t_graph gmx_unused *g,
+                   real gmx_unused lambda,
+                   const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+                   int gmx_unused *global_atom_index)
+{
+    const int             nfa1 = 5;
+    int                   i, iu, s, j;
+    int                   type, ai[GMX_SIMD_REAL_WIDTH], aj[GMX_SIMD_REAL_WIDTH], ak[GMX_SIMD_REAL_WIDTH], al[GMX_SIMD_REAL_WIDTH];
+    real                  ddphi;
+    real                  dr_array[3*DIM*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *dr;
+    real                  buf_array[(NR_RBDIHS + 4)*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *buf;
+    real                 *parm, *phi, *p, *q, *sf_i, *msf_l;
+
+    gmx_simd_real_t       phi_S;
+    gmx_simd_real_t       ddphi_S, cosfac_S;
+    gmx_simd_real_t       mx_S, my_S, mz_S;
+    gmx_simd_real_t       nx_S, ny_S, nz_S;
+    gmx_simd_real_t       nrkj_m2_S, nrkj_n2_S;
+    gmx_simd_real_t       parm_S, c_S;
+    gmx_simd_real_t       sin_S, cos_S;
+    gmx_simd_real_t       sf_i_S, msf_l_S;
+    pbc_simd_t            pbc_simd;
+
+    gmx_simd_real_t       pi_S  = gmx_simd_set1_r(M_PI);
+    gmx_simd_real_t       one_S = gmx_simd_set1_r(1.0);
+
+    /* Ensure SIMD register alignment */
+    dr  = gmx_simd_align_r(dr_array);
+    buf = gmx_simd_align_r(buf_array);
+
+    /* Extract aligned pointer for parameters and variables */
+    parm  = buf;
+    p     = buf + (NR_RBDIHS + 0)*GMX_SIMD_REAL_WIDTH;
+    q     = buf + (NR_RBDIHS + 1)*GMX_SIMD_REAL_WIDTH;
+    sf_i  = buf + (NR_RBDIHS + 2)*GMX_SIMD_REAL_WIDTH;
+    msf_l = buf + (NR_RBDIHS + 3)*GMX_SIMD_REAL_WIDTH;
+
+    set_pbc_simd(pbc, &pbc_simd);
+
+    /* nbonds is the number of dihedrals times nfa1, here we step GMX_SIMD_REAL_WIDTH dihs */
+    for (i = 0; (i < nbonds); i += GMX_SIMD_REAL_WIDTH*nfa1)
+    {
+        /* Collect atoms quadruplets for GMX_SIMD_REAL_WIDTH dihedrals.
+         * iu indexes into forceatoms, we should not let iu go beyond nbonds.
+         */
+        iu = i;
+        for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++)
          {
-            if (i4 < nbonds)
+            type  = forceatoms[iu];
+            ai[s] = forceatoms[iu+1];
+            aj[s] = forceatoms[iu+2];
+            ak[s] = forceatoms[iu+3];
+            al[s] = forceatoms[iu+4];
+
+            /* We don't need the first parameter, since that's a constant
+             * which only affects the energies, not the forces.
+             */
+            for (j = 1; j < NR_RBDIHS; j++)
              {
-                /* Calculate the coefficient and angle deviation */
-                type = forceatoms[i4];
-                dopdihs_mdphi(forceparams[type].pdihs.cpA,
-                              forceparams[type].pdihs.cpB,
-                              forceparams[type].pdihs.phiA,
-                              forceparams[type].pdihs.phiB,
-                              forceparams[type].pdihs.mult,
-                              buf[16+s], lambda, &cp[s], &buf[16+s]);
-                mult[s] = forceparams[type].pdihs.mult;
+                parm[j*GMX_SIMD_REAL_WIDTH + s] =
+                    forceparams[type].rbdihs.rbcA[j];
              }
-            else
+
+            /* At the end fill the arrays with identical entries */
+            if (iu + nfa1 < nbonds)
              {
-                buf[16+s] = 0;
+                iu += nfa1;
              }
-            i4 += 5;
          }
  
-        /* Calculate 4 sines at once */
-        mdphi_SSE = _mm_load_ps(buf+16);
-        gmx_mm_sincos_ps(mdphi_SSE, &sin_SSE, &cos_SSE);
-        _mm_store_ps(buf+16, sin_SSE);
+        /* Caclulate GMX_SIMD_REAL_WIDTH dihedral angles at once */
+        dih_angle_simd(x, ai, aj, ak, al, &pbc_simd,
+                       dr,
+                       &phi_S,
+                       &mx_S, &my_S, &mz_S,
+                       &nx_S, &ny_S, &nz_S,
+                       &nrkj_m2_S,
+                       &nrkj_n2_S,
+                       p, q);
  
-        i4 = i;
+        /* Change to polymer convention */
+        phi_S = gmx_simd_sub_r(phi_S, pi_S);
+
+        gmx_simd_sincos_r(phi_S, &sin_S, &cos_S);
+
+        ddphi_S   = gmx_simd_setzero_r();
+        c_S       = one_S;
+        cosfac_S  = one_S;
+        for (j = 1; j < NR_RBDIHS; j++)
+        {
+            parm_S   = gmx_simd_load_r(parm + j*GMX_SIMD_REAL_WIDTH);
+            ddphi_S  = gmx_simd_fmadd_r(gmx_simd_mul_r(c_S, parm_S), cosfac_S, ddphi_S);
+            cosfac_S = gmx_simd_mul_r(cosfac_S, cos_S);
+            c_S      = gmx_simd_add_r(c_S, one_S);
+        }
+
+        /* Note that here we do not use the minus sign which is present
+         * in the normal RB code. This is corrected for through (m)sf below.
+         */
+        ddphi_S  = gmx_simd_mul_r(ddphi_S, sin_S);
+
+        sf_i_S   = gmx_simd_mul_r(ddphi_S, nrkj_m2_S);
+        msf_l_S  = gmx_simd_mul_r(ddphi_S, nrkj_n2_S);
+
+        /* After this m?_S will contain f[i] */
+        mx_S     = gmx_simd_mul_r(sf_i_S, mx_S);
+        my_S     = gmx_simd_mul_r(sf_i_S, my_S);
+        mz_S     = gmx_simd_mul_r(sf_i_S, mz_S);
+
+        /* After this m?_S will contain -f[l] */
+        nx_S     = gmx_simd_mul_r(msf_l_S, nx_S);
+        ny_S     = gmx_simd_mul_r(msf_l_S, ny_S);
+        nz_S     = gmx_simd_mul_r(msf_l_S, nz_S);
+
+        gmx_simd_store_r(dr + 0*GMX_SIMD_REAL_WIDTH, mx_S);
+        gmx_simd_store_r(dr + 1*GMX_SIMD_REAL_WIDTH, my_S);
+        gmx_simd_store_r(dr + 2*GMX_SIMD_REAL_WIDTH, mz_S);
+        gmx_simd_store_r(dr + 3*GMX_SIMD_REAL_WIDTH, nx_S);
+        gmx_simd_store_r(dr + 4*GMX_SIMD_REAL_WIDTH, ny_S);
+        gmx_simd_store_r(dr + 5*GMX_SIMD_REAL_WIDTH, nz_S);
+
+        iu = i;
          s  = 0;
          do
          {
-            ddphi = -cp[s]*mult[s]*buf[16+s];
-
-            do_dih_fup_noshiftf_precalc(ai[s], aj[s], ak[s], al[s], ddphi,
-                                        buf[ 0+s], buf[ 4+s],
-                                        buf[ 8+s], buf[12+s],
-                                        rs[0+s].v, rs[4+s].v,
+            do_dih_fup_noshiftf_precalc(ai[s], aj[s], ak[s], al[s],
+                                        p[s], q[s],
+                                        dr[     XX *GMX_SIMD_REAL_WIDTH+s],
+                                        dr[     YY *GMX_SIMD_REAL_WIDTH+s],
+                                        dr[     ZZ *GMX_SIMD_REAL_WIDTH+s],
+                                        dr[(DIM+XX)*GMX_SIMD_REAL_WIDTH+s],
+                                        dr[(DIM+YY)*GMX_SIMD_REAL_WIDTH+s],
+                                        dr[(DIM+ZZ)*GMX_SIMD_REAL_WIDTH+s],
                                          f);
              s++;
-            i4 += 5;
+            iu += nfa1;
          }
-        while (s < 4 && i4 < nbonds);
+        while (s < GMX_SIMD_REAL_WIDTH && iu < nbonds);
      }
  }
  
-#endif /* SSE_PROPER_DIHEDRALS */
+#endif /* GMX_SIMD_HAVE_REAL */
  
  
  real idihs(int nbonds,
@@ -1841,8 +2260,8 @@ real idihs(int nbonds,
             const rvec x[], rvec f[], rvec fshift[],
             const t_pbc *pbc, const t_graph *g,
             real lambda, real *dvdlambda,
-           const t_mdatoms *md, t_fcdata *fcd,
-           int *global_atom_index)
+           const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+           int gmx_unused *global_atom_index)
  {
      int  i, type, ai, aj, ak, al;
      int  t1, t2, t3;
@@ -1939,6 +2358,7 @@ static void posres_dx(const rvec x, const rvec pos0A, const rvec pos0B,
                      /* Box relative coordinates are stored for dimensions with pbc */
                      posA *= pbc->box[m][m];
                      posB *= pbc->box[m][m];
+                    assert(npbcdim <= DIM);
                      for (d = m+1; d < npbcdim; d++)
                      {
                          posA += pos0A[d]*pbc->box[d][m];
@@ -2003,6 +2423,7 @@ real fbposres(int nbonds,
          clear_rvec(com_sc);
          for (m = 0; m < npbcdim; m++)
          {
+            assert(npbcdim <= DIM);
              for (d = m; d < npbcdim; d++)
              {
                  com_sc[m] += com[d]*pbc->box[d][m];
@@ -2128,6 +2549,7 @@ real posres(int nbonds,
          clear_rvec(comB_sc);
          for (m = 0; m < npbcdim; m++)
          {
+            assert(npbcdim <= DIM);
              for (d = m; d < npbcdim; d++)
              {
                  comA_sc[m] += comA[d]*pbc->box[d][m];
@@ -2158,7 +2580,7 @@ real posres(int nbonds,
              vtot       += 0.5*kk*dx[m]*dx[m];
              *dvdlambda +=
                  0.5*(pr->posres.fcB[m] - pr->posres.fcA[m])*dx[m]*dx[m]
-                -fm*dpdl[m];
+                + fm*dpdl[m];
  
              /* Here we correct for the pbc_dx which included rdist */
              if (bForceValid)
@@ -2274,8 +2696,8 @@ real angres(int nbonds,
              const rvec x[], rvec f[], rvec fshift[],
              const t_pbc *pbc, const t_graph *g,
              real lambda, real *dvdlambda,
-            const t_mdatoms *md, t_fcdata *fcd,
-            int *global_atom_index)
+            const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+            int gmx_unused *global_atom_index)
  {
      return low_angres(nbonds, forceatoms, forceparams, x, f, fshift, pbc, g,
                        lambda, dvdlambda, FALSE);
@@ -2286,8 +2708,8 @@ real angresz(int nbonds,
               const rvec x[], rvec f[], rvec fshift[],
               const t_pbc *pbc, const t_graph *g,
               real lambda, real *dvdlambda,
-             const t_mdatoms *md, t_fcdata *fcd,
-             int *global_atom_index)
+             const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+             int gmx_unused *global_atom_index)
  {
      return low_angres(nbonds, forceatoms, forceparams, x, f, fshift, pbc, g,
                        lambda, dvdlambda, TRUE);
@@ -2298,8 +2720,8 @@ real dihres(int nbonds,
              const rvec x[], rvec f[], rvec fshift[],
              const t_pbc *pbc, const t_graph *g,
              real lambda, real *dvdlambda,
-            const t_mdatoms *md, t_fcdata *fcd,
-            int *global_atom_index)
+            const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+            int gmx_unused  *global_atom_index)
  {
      real vtot = 0;
      int  ai, aj, ak, al, i, k, type, t1, t2, t3;
@@ -2388,26 +2810,346 @@ real dihres(int nbonds,
  }
  
  
-real unimplemented(int nbonds,
-                   const t_iatom forceatoms[], const t_iparams forceparams[],
-                   const rvec x[], rvec f[], rvec fshift[],
-                   const t_pbc *pbc, const t_graph *g,
-                   real lambda, real *dvdlambda,
-                   const t_mdatoms *md, t_fcdata *fcd,
-                   int *global_atom_index)
+real unimplemented(int gmx_unused nbonds,
+                   const t_iatom gmx_unused forceatoms[], const t_iparams gmx_unused forceparams[],
+                   const rvec gmx_unused x[], rvec gmx_unused f[], rvec gmx_unused fshift[],
+                   const t_pbc gmx_unused *pbc, const t_graph  gmx_unused *g,
+                   real gmx_unused lambda, real gmx_unused *dvdlambda,
+                   const t_mdatoms  gmx_unused *md, t_fcdata gmx_unused *fcd,
+                   int gmx_unused *global_atom_index)
  {
      gmx_impl("*** you are using a not implemented function");
  
      return 0.0; /* To make the compiler happy */
  }
  
+real restrangles(int nbonds,
+                 const t_iatom forceatoms[], const t_iparams forceparams[],
+                 const rvec x[], rvec f[], rvec fshift[],
+                 const t_pbc *pbc, const t_graph *g,
+                 real gmx_unused lambda, real gmx_unused *dvdlambda,
+                 const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+                 int gmx_unused *global_atom_index)
+{
+    int  i, d, ai, aj, ak, type, m;
+    int  t1, t2;
+    rvec r_ij, r_kj;
+    real v, vtot;
+    ivec jt, dt_ij, dt_kj;
+    rvec f_i, f_j, f_k;
+    real prefactor, ratio_ante, ratio_post;
+    rvec delta_ante, delta_post, vec_temp;
+
+    vtot = 0.0;
+    for (i = 0; (i < nbonds); )
+    {
+        type = forceatoms[i++];
+        ai   = forceatoms[i++];
+        aj   = forceatoms[i++];
+        ak   = forceatoms[i++];
+
+        t1 = pbc_rvec_sub(pbc, x[ai], x[aj], vec_temp);
+        pbc_rvec_sub(pbc, x[aj], x[ai], delta_ante);
+        t2 = pbc_rvec_sub(pbc, x[ak], x[aj], delta_post);
+
+
+        /* This function computes factors needed for restricted angle potential.
+         * The restricted angle potential is used in coarse-grained simulations to avoid singularities
+         * when three particles align and the dihedral angle and dihedral potential
+         * cannot be calculated. This potential is calculated using the formula:
+           real restrangles(int nbonds,
+            const t_iatom forceatoms[],const t_iparams forceparams[],
+            const rvec x[],rvec f[],rvec fshift[],
+            const t_pbc *pbc,const t_graph *g,
+            real gmx_unused lambda,real gmx_unused *dvdlambda,
+            const t_mdatoms gmx_unused *md,t_fcdata gmx_unused *fcd,
+            int gmx_unused *global_atom_index)
+           {
+           int  i, d, ai, aj, ak, type, m;
+           int t1, t2;
+           rvec r_ij,r_kj;
+           real v, vtot;
+           ivec jt,dt_ij,dt_kj;
+           rvec f_i, f_j, f_k;
+           real prefactor, ratio_ante, ratio_post;
+           rvec delta_ante, delta_post, vec_temp;
+
+           vtot = 0.0;
+           for(i=0; (i<nbonds); )
+           {
+           type = forceatoms[i++];
+           ai   = forceatoms[i++];
+           aj   = forceatoms[i++];
+           ak   = forceatoms[i++];
+
+         * \f[V_{\rm ReB}(\theta_i) = \frac{1}{2} k_{\theta} \frac{(\cos\theta_i - \cos\theta_0)^2}
+         * {\sin^2\theta_i}\f] ({eq:ReB} and ref \cite{MonicaGoga2013} from the manual).
+         * For more explanations see comments file "restcbt.h". */
+
+        compute_factors_restangles(type, forceparams,  delta_ante, delta_post,
+                                   &prefactor, &ratio_ante, &ratio_post, &v);
+
+        /*   Forces are computed per component */
+        for (d = 0; d < DIM; d++)
+        {
+            f_i[d] = prefactor * (ratio_ante * delta_ante[d] - delta_post[d]);
+            f_j[d] = prefactor * ((ratio_post + 1.0) * delta_post[d] - (ratio_ante + 1.0) * delta_ante[d]);
+            f_k[d] = prefactor * (delta_ante[d] - ratio_post * delta_post[d]);
+        }
+
+        /*   Computation of potential energy   */
+
+        vtot += v;
+
+        /*   Update forces */
+
+        for (m = 0; (m < DIM); m++)
+        {
+            f[ai][m] += f_i[m];
+            f[aj][m] += f_j[m];
+            f[ak][m] += f_k[m];
+        }
+
+        if (g)
+        {
+            copy_ivec(SHIFT_IVEC(g, aj), jt);
+            ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+            ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+            t1 = IVEC2IS(dt_ij);
+            t2 = IVEC2IS(dt_kj);
+        }
+
+        rvec_inc(fshift[t1], f_i);
+        rvec_inc(fshift[CENTRAL], f_j);
+        rvec_inc(fshift[t2], f_k);
+    }
+    return vtot;
+}
+
+
+real restrdihs(int nbonds,
+               const t_iatom forceatoms[], const t_iparams forceparams[],
+               const rvec x[], rvec f[], rvec fshift[],
+               const t_pbc *pbc, const t_graph *g,
+               real gmx_unused lambda, real gmx_unused *dvlambda,
+               const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+               int gmx_unused *global_atom_index)
+{
+    int  i, d, type, ai, aj, ak, al;
+    rvec f_i, f_j, f_k, f_l;
+    rvec dx_jl;
+    ivec jt, dt_ij, dt_kj, dt_lj;
+    int  t1, t2, t3;
+    real v, vtot;
+    rvec delta_ante,  delta_crnt, delta_post, vec_temp;
+    real factor_phi_ai_ante, factor_phi_ai_crnt, factor_phi_ai_post;
+    real factor_phi_aj_ante, factor_phi_aj_crnt, factor_phi_aj_post;
+    real factor_phi_ak_ante, factor_phi_ak_crnt, factor_phi_ak_post;
+    real factor_phi_al_ante, factor_phi_al_crnt, factor_phi_al_post;
+    real prefactor_phi;
+
+
+    vtot = 0.0;
+    for (i = 0; (i < nbonds); )
+    {
+        type = forceatoms[i++];
+        ai   = forceatoms[i++];
+        aj   = forceatoms[i++];
+        ak   = forceatoms[i++];
+        al   = forceatoms[i++];
+
+        t1 = pbc_rvec_sub(pbc, x[ai], x[aj], vec_temp);
+        pbc_rvec_sub(pbc, x[aj], x[ai], delta_ante);
+        t2 = pbc_rvec_sub(pbc, x[ak], x[aj], delta_crnt);
+        t3 = pbc_rvec_sub(pbc, x[ak], x[al], vec_temp);
+        pbc_rvec_sub(pbc, x[al], x[ak], delta_post);
+
+        /* This function computes factors needed for restricted angle potential.
+         * The restricted angle potential is used in coarse-grained simulations to avoid singularities
+         * when three particles align and the dihedral angle and dihedral potential cannot be calculated.
+         * This potential is calculated using the formula:
+         * \f[V_{\rm ReB}(\theta_i) = \frac{1}{2} k_{\theta}
+         * \frac{(\cos\theta_i - \cos\theta_0)^2}{\sin^2\theta_i}\f]
+         * ({eq:ReB} and ref \cite{MonicaGoga2013} from the manual).
+         * For more explanations see comments file "restcbt.h" */
+
+        compute_factors_restrdihs(type, forceparams,
+                                  delta_ante, delta_crnt, delta_post,
+                                  &factor_phi_ai_ante, &factor_phi_ai_crnt, &factor_phi_ai_post,
+                                  &factor_phi_aj_ante, &factor_phi_aj_crnt, &factor_phi_aj_post,
+                                  &factor_phi_ak_ante, &factor_phi_ak_crnt, &factor_phi_ak_post,
+                                  &factor_phi_al_ante, &factor_phi_al_crnt, &factor_phi_al_post,
+                                  &prefactor_phi, &v);
+
+
+        /*      Computation of forces per component */
+        for (d = 0; d < DIM; d++)
+        {
+            f_i[d] = prefactor_phi * (factor_phi_ai_ante * delta_ante[d] + factor_phi_ai_crnt * delta_crnt[d] + factor_phi_ai_post * delta_post[d]);
+            f_j[d] = prefactor_phi * (factor_phi_aj_ante * delta_ante[d] + factor_phi_aj_crnt * delta_crnt[d] + factor_phi_aj_post * delta_post[d]);
+            f_k[d] = prefactor_phi * (factor_phi_ak_ante * delta_ante[d] + factor_phi_ak_crnt * delta_crnt[d] + factor_phi_ak_post * delta_post[d]);
+            f_l[d] = prefactor_phi * (factor_phi_al_ante * delta_ante[d] + factor_phi_al_crnt * delta_crnt[d] + factor_phi_al_post * delta_post[d]);
+        }
+        /*      Computation of the energy */
+
+        vtot += v;
+
+
+
+        /*    Updating the forces */
+
+        rvec_inc(f[ai], f_i);
+        rvec_inc(f[aj], f_j);
+        rvec_inc(f[ak], f_k);
+        rvec_inc(f[al], f_l);
+
+
+        /* Updating the fshift forces for the pressure coupling */
+        if (g)
+        {
+            copy_ivec(SHIFT_IVEC(g, aj), jt);
+            ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+            ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+            ivec_sub(SHIFT_IVEC(g, al), jt, dt_lj);
+            t1 = IVEC2IS(dt_ij);
+            t2 = IVEC2IS(dt_kj);
+            t3 = IVEC2IS(dt_lj);
+        }
+        else if (pbc)
+        {
+            t3 = pbc_rvec_sub(pbc, x[al], x[aj], dx_jl);
+        }
+        else
+        {
+            t3 = CENTRAL;
+        }
+
+        rvec_inc(fshift[t1], f_i);
+        rvec_inc(fshift[CENTRAL], f_j);
+        rvec_inc(fshift[t2], f_k);
+        rvec_inc(fshift[t3], f_l);
+
+    }
+
+    return vtot;
+}
+
+
+real cbtdihs(int nbonds,
+             const t_iatom forceatoms[], const t_iparams forceparams[],
+             const rvec x[], rvec f[], rvec fshift[],
+             const t_pbc *pbc, const t_graph *g,
+             real gmx_unused lambda, real gmx_unused *dvdlambda,
+             const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+             int gmx_unused *global_atom_index)
+{
+    int  type, ai, aj, ak, al, i, d;
+    int  t1, t2, t3;
+    real v, vtot;
+    rvec vec_temp;
+    rvec f_i, f_j, f_k, f_l;
+    ivec jt, dt_ij, dt_kj, dt_lj;
+    rvec dx_jl;
+    rvec delta_ante, delta_crnt, delta_post;
+    rvec f_phi_ai, f_phi_aj, f_phi_ak, f_phi_al;
+    rvec f_theta_ante_ai, f_theta_ante_aj, f_theta_ante_ak;
+    rvec f_theta_post_aj, f_theta_post_ak, f_theta_post_al;
+
+
+
+
+    vtot = 0.0;
+    for (i = 0; (i < nbonds); )
+    {
+        type = forceatoms[i++];
+        ai   = forceatoms[i++];
+        aj   = forceatoms[i++];
+        ak   = forceatoms[i++];
+        al   = forceatoms[i++];
+
+
+        t1 = pbc_rvec_sub(pbc, x[ai], x[aj], vec_temp);
+        pbc_rvec_sub(pbc, x[aj], x[ai], delta_ante);
+        t2 = pbc_rvec_sub(pbc, x[ak], x[aj], vec_temp);
+        pbc_rvec_sub(pbc, x[ak], x[aj], delta_crnt);
+        t3 = pbc_rvec_sub(pbc, x[ak], x[al], vec_temp);
+        pbc_rvec_sub(pbc, x[al], x[ak], delta_post);
+
+        /* \brief Compute factors for CBT potential
+         * The combined bending-torsion potential goes to zero in a very smooth manner, eliminating the numerical
+         * instabilities, when three coarse-grained particles align and the dihedral angle and standard
+         * dihedral potentials cannot be calculated. The CBT potential is calculated using the formula:
+         * \f[V_{\rm CBT}(\theta_{i-1}, \theta_i, \phi_i) = k_{\phi} \sin^3\theta_{i-1} \sin^3\theta_{i}
+         * \sum_{n=0}^4 { a_n \cos^n\phi_i}\f] ({eq:CBT} and ref \cite{MonicaGoga2013} from the manual).
+         * It contains in its expression not only the dihedral angle \f$\phi\f$
+         * but also \f[\theta_{i-1}\f] (theta_ante bellow) and \f[\theta_{i}\f] (theta_post bellow)
+         * --- the adjacent bending angles.
+         * For more explanations see comments file "restcbt.h". */
+
+        compute_factors_cbtdihs(type, forceparams, delta_ante, delta_crnt, delta_post,
+                                f_phi_ai, f_phi_aj, f_phi_ak, f_phi_al,
+                                f_theta_ante_ai, f_theta_ante_aj, f_theta_ante_ak,
+                                f_theta_post_aj, f_theta_post_ak, f_theta_post_al,
+                                &v);
+
+
+        /*      Acumulate the resuts per beads */
+        for (d = 0; d < DIM; d++)
+        {
+            f_i[d] = f_phi_ai[d] + f_theta_ante_ai[d];
+            f_j[d] = f_phi_aj[d] + f_theta_ante_aj[d] + f_theta_post_aj[d];
+            f_k[d] = f_phi_ak[d] + f_theta_ante_ak[d] + f_theta_post_ak[d];
+            f_l[d] = f_phi_al[d] + f_theta_post_al[d];
+        }
+
+        /*      Compute the potential energy */
+
+        vtot += v;
+
+
+        /*  Updating the forces */
+        rvec_inc(f[ai], f_i);
+        rvec_inc(f[aj], f_j);
+        rvec_inc(f[ak], f_k);
+        rvec_inc(f[al], f_l);
+
+
+        /* Updating the fshift forces for the pressure coupling */
+        if (g)
+        {
+            copy_ivec(SHIFT_IVEC(g, aj), jt);
+            ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+            ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+            ivec_sub(SHIFT_IVEC(g, al), jt, dt_lj);
+            t1 = IVEC2IS(dt_ij);
+            t2 = IVEC2IS(dt_kj);
+            t3 = IVEC2IS(dt_lj);
+        }
+        else if (pbc)
+        {
+            t3 = pbc_rvec_sub(pbc, x[al], x[aj], dx_jl);
+        }
+        else
+        {
+            t3 = CENTRAL;
+        }
+
+        rvec_inc(fshift[t1], f_i);
+        rvec_inc(fshift[CENTRAL], f_j);
+        rvec_inc(fshift[t2], f_k);
+        rvec_inc(fshift[t3], f_l);
+    }
+
+    return vtot;
+}
+
  real rbdihs(int nbonds,
              const t_iatom forceatoms[], const t_iparams forceparams[],
              const rvec x[], rvec f[], rvec fshift[],
              const t_pbc *pbc, const t_graph *g,
              real lambda, real *dvdlambda,
-            const t_mdatoms *md, t_fcdata *fcd,
-            int *global_atom_index)
+            const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+            int gmx_unused *global_atom_index)
  {
      const real c0 = 0.0, c1 = 1.0, c2 = 2.0, c3 = 3.0, c4 = 4.0, c5 = 5.0;
      int        type, ai, aj, ak, al, i, j;
@@ -2549,9 +3291,9 @@ real cmap_dihs(int nbonds,
                 const gmx_cmap_t *cmap_grid,
                 const rvec x[], rvec f[], rvec fshift[],
                 const t_pbc *pbc, const t_graph *g,
-               real lambda, real *dvdlambda,
-               const t_mdatoms *md, t_fcdata *fcd,
-               int *global_atom_index)
+               real gmx_unused lambda, real gmx_unused *dvdlambda,
+               const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+               int  gmx_unused *global_atom_index)
  {
      int         i, j, k, n, idx;
      int         ai, aj, ak, al, am;
@@ -2989,8 +3731,8 @@ real g96bonds(int nbonds,
                const rvec x[], rvec f[], rvec fshift[],
                const t_pbc *pbc, const t_graph *g,
                real lambda, real *dvdlambda,
-              const t_mdatoms *md, t_fcdata *fcd,
-              int *global_atom_index)
+              const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+              int gmx_unused *global_atom_index)
  {
      int  i, m, ki, ai, aj, type;
      real dr2, fbond, vbond, fij, vtot;
@@ -3059,8 +3801,8 @@ real g96angles(int nbonds,
                 const rvec x[], rvec f[], rvec fshift[],
                 const t_pbc *pbc, const t_graph *g,
                 real lambda, real *dvdlambda,
-               const t_mdatoms *md, t_fcdata *fcd,
-               int *global_atom_index)
+               const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+               int gmx_unused *global_atom_index)
  {
      int  i, ai, aj, ak, type, m, t1, t2;
      rvec r_ij, r_kj;
@@ -3130,9 +3872,9 @@ real cross_bond_bond(int nbonds,
                       const t_iatom forceatoms[], const t_iparams forceparams[],
                       const rvec x[], rvec f[], rvec fshift[],
                       const t_pbc *pbc, const t_graph *g,
-                     real lambda, real *dvdlambda,
-                     const t_mdatoms *md, t_fcdata *fcd,
-                     int *global_atom_index)
+                     real gmx_unused lambda, real gmx_unused *dvdlambda,
+                     const t_mdatoms gmx_unused *md, t_fcdata gmx_unused  *fcd,
+                     int gmx_unused *global_atom_index)
  {
      /* Potential from Lawrence and Skimmer, Chem. Phys. Lett. 372 (2003)
       * pp. 842-847
@@ -3204,9 +3946,9 @@ real cross_bond_angle(int nbonds,
                        const t_iatom forceatoms[], const t_iparams forceparams[],
                        const rvec x[], rvec f[], rvec fshift[],
                        const t_pbc *pbc, const t_graph *g,
-                      real lambda, real *dvdlambda,
-                      const t_mdatoms *md, t_fcdata *fcd,
-                      int *global_atom_index)
+                      real gmx_unused lambda, real gmx_unused *dvdlambda,
+                      const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+                      int gmx_unused *global_atom_index)
  {
      /* Potential from Lawrence and Skimmer, Chem. Phys. Lett. 372 (2003)
       * pp. 842-847
@@ -3329,8 +4071,8 @@ real tab_bonds(int nbonds,
                 const rvec x[], rvec f[], rvec fshift[],
                 const t_pbc *pbc, const t_graph *g,
                 real lambda, real *dvdlambda,
-               const t_mdatoms *md, t_fcdata *fcd,
-               int *global_atom_index)
+               const t_mdatoms gmx_unused *md, t_fcdata *fcd,
+               int gmx_unused  *global_atom_index)
  {
      int  i, m, ki, ai, aj, type, table;
      real dr, dr2, fbond, vbond, fij, vtot;
@@ -3393,8 +4135,8 @@ real tab_angles(int nbonds,
                  const rvec x[], rvec f[], rvec fshift[],
                  const t_pbc *pbc, const t_graph *g,
                  real lambda, real *dvdlambda,
-                const t_mdatoms *md, t_fcdata *fcd,
-                int *global_atom_index)
+                const t_mdatoms gmx_unused  *md, t_fcdata *fcd,
+                int gmx_unused *global_atom_index)
  {
      int  i, ai, aj, ak, t1, t2, type, table;
      rvec r_ij, r_kj;
@@ -3477,8 +4219,8 @@ real tab_dihs(int nbonds,
                const rvec x[], rvec f[], rvec fshift[],
                const t_pbc *pbc, const t_graph *g,
                real lambda, real *dvdlambda,
-              const t_mdatoms *md, t_fcdata *fcd,
-              int *global_atom_index)
+              const t_mdatoms gmx_unused *md, t_fcdata *fcd,
+              int gmx_unused *global_atom_index)
  {
      int  i, type, ai, aj, ak, al, table;
      int  t1, t2, t3;
@@ -3519,6 +4261,73 @@ real tab_dihs(int nbonds,
      return vtot;
  }
  
+/* Return if this is a potential calculated in bondfree.c,
+ * i.e. an interaction that actually calculates a potential and
+ * works on multiple atoms (not e.g. a connection or a position restraint).
+ */
+static gmx_inline gmx_bool ftype_is_bonded_potential(int ftype)
+{
+    return
+        (interaction_function[ftype].flags & IF_BOND) &&
+        !(ftype == F_CONNBONDS || ftype == F_POSRES || ftype == F_FBPOSRES) &&
+        (ftype < F_GB12 || ftype > F_GB14);
+}
+
+static void divide_bondeds_over_threads(t_idef *idef, int nthreads)
+{
+    int ftype;
+    int nat1;
+    int t;
+    int il_nr_thread;
+
+    idef->nthreads = nthreads;
+
+    if (F_NRE*(nthreads+1) > idef->il_thread_division_nalloc)
+    {
+        idef->il_thread_division_nalloc = F_NRE*(nthreads+1);
+        snew(idef->il_thread_division, idef->il_thread_division_nalloc);
+    }
+
+    for (ftype = 0; ftype < F_NRE; ftype++)
+    {
+        if (ftype_is_bonded_potential(ftype))
+        {
+            nat1 = interaction_function[ftype].nratoms + 1;
+
+            for (t = 0; t <= nthreads; t++)
+            {
+                /* Divide the interactions equally over the threads.
+                 * When the different types of bonded interactions
+                 * are distributed roughly equally over the threads,
+                 * this should lead to well localized output into
+                 * the force buffer on each thread.
+                 * If this is not the case, a more advanced scheme
+                 * (not implemented yet) will do better.
+                 */
+                il_nr_thread = (((idef->il[ftype].nr/nat1)*t)/nthreads)*nat1;
+
+                /* Ensure that distance restraint pairs with the same label
+                 * end up on the same thread.
+                 * This is slighlty tricky code, since the next for iteration
+                 * may have an initial il_nr_thread lower than the final value
+                 * in the previous iteration, but this will anyhow be increased
+                 * to the approriate value again by this while loop.
+                 */
+                while (ftype == F_DISRES &&
+                       il_nr_thread > 0 &&
+                       il_nr_thread < idef->il[ftype].nr &&
+                       idef->iparams[idef->il[ftype].iatoms[il_nr_thread]].disres.label ==
+                       idef->iparams[idef->il[ftype].iatoms[il_nr_thread-nat1]].disres.label)
+                {
+                    il_nr_thread += nat1;
+                }
+
+                idef->il_thread_division[ftype*(nthreads+1)+t] = il_nr_thread;
+            }
+        }
+    }
+}
+
  static unsigned
  calc_bonded_reduction_mask(const t_idef *idef,
                             int shift,
@@ -3531,9 +4340,7 @@ calc_bonded_reduction_mask(const t_idef *idef,
  
      for (ftype = 0; ftype < F_NRE; ftype++)
      {
-        if (interaction_function[ftype].flags & IF_BOND &&
-            !(ftype == F_CONNBONDS || ftype == F_POSRES) &&
-            (ftype<F_GB12 || ftype>F_GB14))
+        if (ftype_is_bonded_potential(ftype))
          {
              nb = idef->il[ftype].nr;
              if (nb > 0)
@@ -3543,8 +4350,8 @@ calc_bonded_reduction_mask(const t_idef *idef,
                  /* Divide this interaction equally over the threads.
                   * This is not stored: should match division in calc_bonds.
                   */
-                nb0 = (((nb/nat1)* t   )/nt)*nat1;
-                nb1 = (((nb/nat1)*(t+1))/nt)*nat1;
+                nb0 = idef->il_thread_division[ftype*(nt+1)+t];
+                nb1 = idef->il_thread_division[ftype*(nt+1)+t+1];
  
                  for (i = nb0; i < nb1; i += nat1)
                  {
@@ -3560,14 +4367,18 @@ calc_bonded_reduction_mask(const t_idef *idef,
      return mask;
  }
  
-void init_bonded_thread_force_reduction(t_forcerec   *fr,
-                                        const t_idef *idef)
+void setup_bonded_threading(t_forcerec   *fr, t_idef *idef)
  {
  #define MAX_BLOCK_BITS 32
      int t;
      int ctot, c, b;
  
-    if (fr->nthreads <= 1)
+    assert(fr->nthreads >= 1);
+
+    /* Divide the bonded interaction over the threads */
+    divide_bondeds_over_threads(idef, fr->nthreads);
+
+    if (fr->nthreads == 1)
      {
          fr->red_nblock = 0;
  
@@ -3791,14 +4602,14 @@ static real calc_one_bond(FILE *fplog, int thread,
                            rvec x[], rvec f[], rvec fshift[],
                            t_forcerec *fr,
                            const t_pbc *pbc, const t_graph *g,
-                          gmx_enerdata_t *enerd, gmx_grppairener_t *grpp,
+                          gmx_grppairener_t *grpp,
                            t_nrnb *nrnb,
                            real *lambda, real *dvdl,
                            const t_mdatoms *md, t_fcdata *fcd,
                            gmx_bool bCalcEnerVir,
                            int *global_atom_index, gmx_bool bPrintSepPot)
  {
-    int      ind, nat1, nbonds, efptFTYPE;
+    int      nat1, nbonds, efptFTYPE;
      real     v = 0;
      t_iatom *iatoms;
      int      nb0, nbn;
@@ -3812,160 +4623,101 @@ static real calc_one_bond(FILE *fplog, int thread,
          efptFTYPE = efptBONDED;
      }
  
-    if (interaction_function[ftype].flags & IF_BOND &&
-        !(ftype == F_CONNBONDS || ftype == F_POSRES))
-    {
-        ind       = interaction_function[ftype].nrnb_ind;
-        nat1      = interaction_function[ftype].nratoms + 1;
-        nbonds    = idef->il[ftype].nr/nat1;
-        iatoms    = idef->il[ftype].iatoms;
+    nat1      = interaction_function[ftype].nratoms + 1;
+    nbonds    = idef->il[ftype].nr/nat1;
+    iatoms    = idef->il[ftype].iatoms;
  
-        nb0 = ((nbonds* thread   )/(fr->nthreads))*nat1;
-        nbn = ((nbonds*(thread+1))/(fr->nthreads))*nat1 - nb0;
+    nb0 = idef->il_thread_division[ftype*(idef->nthreads+1)+thread];
+    nbn = idef->il_thread_division[ftype*(idef->nthreads+1)+thread+1] - nb0;
  
-        if (!IS_LISTED_LJ_C(ftype))
+    if (!IS_LISTED_LJ_C(ftype))
+    {
+        if (ftype == F_CMAP)
          {
-            if (ftype == F_CMAP)
-            {
-                v = cmap_dihs(nbn, iatoms+nb0,
-                              idef->iparams, &idef->cmap_grid,
-                              (const rvec*)x, f, fshift,
-                              pbc, g, lambda[efptFTYPE], &(dvdl[efptFTYPE]),
-                              md, fcd, global_atom_index);
-            }
-            else if (ftype == F_PDIHS &&
-                     !bCalcEnerVir && fr->efep == efepNO)
-            {
-                /* No energies, shift forces, dvdl */
-#ifndef SSE_PROPER_DIHEDRALS
-                pdihs_noener
+            v = cmap_dihs(nbn, iatoms+nb0,
+                          idef->iparams, &idef->cmap_grid,
+                          (const rvec*)x, f, fshift,
+                          pbc, g, lambda[efptFTYPE], &(dvdl[efptFTYPE]),
+                          md, fcd, global_atom_index);
+        }
+#ifdef GMX_SIMD_HAVE_REAL
+        else if (ftype == F_ANGLES &&
+                 !bCalcEnerVir && fr->efep == efepNO)
+        {
+            /* No energies, shift forces, dvdl */
+            angles_noener_simd(nbn, idef->il[ftype].iatoms+nb0,
+                               idef->iparams,
+                               (const rvec*)x, f,
+                               pbc, g, lambda[efptFTYPE], md, fcd,
+                               global_atom_index);
+            v = 0;
+        }
+#endif
+        else if (ftype == F_PDIHS &&
+                 !bCalcEnerVir && fr->efep == efepNO)
+        {
+            /* No energies, shift forces, dvdl */
+#ifdef GMX_SIMD_HAVE_REAL
+            pdihs_noener_simd
  #else
-                pdihs_noener_sse
+            pdihs_noener
  #endif
-                    (nbn, idef->il[ftype].iatoms+nb0,
-                    idef->iparams,
-                    (const rvec*)x, f,
-                    pbc, g, lambda[efptFTYPE], md, fcd,
-                    global_atom_index);
-                v = 0;
-            }
-            else
-            {
-                v = interaction_function[ftype].ifunc(nbn, iatoms+nb0,
-                                                      idef->iparams,
-                                                      (const rvec*)x, f, fshift,
-                                                      pbc, g, lambda[efptFTYPE], &(dvdl[efptFTYPE]),
-                                                      md, fcd, global_atom_index);
-            }
-            if (bPrintSepPot)
-            {
-                fprintf(fplog, "  %-23s #%4d  V %12.5e  dVdl %12.5e\n",
-                        interaction_function[ftype].longname,
-                        nbonds/nat1, v, lambda[efptFTYPE]);
-            }
+                (nbn, idef->il[ftype].iatoms+nb0,
+                idef->iparams,
+                (const rvec*)x, f,
+                pbc, g, lambda[efptFTYPE], md, fcd,
+                global_atom_index);
+            v = 0;
          }
+#ifdef GMX_SIMD_HAVE_REAL
+        else if (ftype == F_RBDIHS &&
+                 !bCalcEnerVir && fr->efep == efepNO)
+        {
+            /* No energies, shift forces, dvdl */
+            rbdihs_noener_simd(nbn, idef->il[ftype].iatoms+nb0,
+                               idef->iparams,
+                               (const rvec*)x, f,
+                               pbc, g, lambda[efptFTYPE], md, fcd,
+                               global_atom_index);
+            v = 0;
+        }
+#endif
          else
          {
-            v = do_nonbonded_listed(ftype, nbn, iatoms+nb0, idef->iparams, (const rvec*)x, f, fshift,
-                                    pbc, g, lambda, dvdl, md, fr, grpp, global_atom_index);
-
-            enerd->dvdl_nonlin[efptCOUL] += dvdl[efptCOUL];
-            enerd->dvdl_nonlin[efptVDW]  += dvdl[efptVDW];
-
-            if (bPrintSepPot)
-            {
-                fprintf(fplog, "  %-5s + %-15s #%4d                  dVdl %12.5e\n",
-                        interaction_function[ftype].longname,
-                        interaction_function[F_LJ14].longname, nbonds/nat1, dvdl[efptVDW]);
-                fprintf(fplog, "  %-5s + %-15s #%4d                  dVdl %12.5e\n",
-                        interaction_function[ftype].longname,
-                        interaction_function[F_COUL14].longname, nbonds/nat1, dvdl[efptCOUL]);
-            }
+            v = interaction_function[ftype].ifunc(nbn, iatoms+nb0,
+                                                  idef->iparams,
+                                                  (const rvec*)x, f, fshift,
+                                                  pbc, g, lambda[efptFTYPE], &(dvdl[efptFTYPE]),
+                                                  md, fcd, global_atom_index);
          }
-        if (ind != -1 && thread == 0)
+        if (bPrintSepPot)
          {
-            inc_nrnb(nrnb, ind, nbonds);
+            fprintf(fplog, "  %-23s #%4d  V %12.5e  dVdl %12.5e\n",
+                    interaction_function[ftype].longname,
+                    nbonds, v, lambda[efptFTYPE]);
          }
      }
-
-    return v;
-}
-
-/* WARNING!  THIS FUNCTION MUST EXACTLY TRACK THE calc
-   function, or horrible things will happen when doing free energy
-   calculations!  In a good coding world, this would not be a
-   different function, but for speed reasons, it needs to be made a
-   separate function.  TODO for 5.0 - figure out a way to reorganize
-   to reduce duplication.
- */
-
-static real calc_one_bond_foreign(FILE *fplog, int ftype, const t_idef *idef,
-                                  rvec x[], rvec f[], t_forcerec *fr,
-                                  const t_pbc *pbc, const t_graph *g,
-                                  gmx_grppairener_t *grpp, t_nrnb *nrnb,
-                                  real *lambda, real *dvdl,
-                                  const t_mdatoms *md, t_fcdata *fcd,
-                                  int *global_atom_index, gmx_bool bPrintSepPot)
-{
-    int      ind, nat1, nbonds, efptFTYPE, nbonds_np;
-    real     v = 0;
-    t_iatom *iatoms;
-
-    if (IS_RESTRAINT_TYPE(ftype))
-    {
-        efptFTYPE = efptRESTRAINT;
-    }
      else
      {
-        efptFTYPE = efptBONDED;
+        v = do_nonbonded_listed(ftype, nbn, iatoms+nb0, idef->iparams, (const rvec*)x, f, fshift,
+                                pbc, g, lambda, dvdl, md, fr, grpp, global_atom_index);
+
+        if (bPrintSepPot)
+        {
+            fprintf(fplog, "  %-5s + %-15s #%4d                  dVdl %12.5e\n",
+                    interaction_function[ftype].longname,
+                    interaction_function[F_LJ14].longname, nbonds, dvdl[efptVDW]);
+            fprintf(fplog, "  %-5s + %-15s #%4d                  dVdl %12.5e\n",
+                    interaction_function[ftype].longname,
+                    interaction_function[F_COUL14].longname, nbonds, dvdl[efptCOUL]);
+        }
      }
  
-    if (ftype < F_GB12 || ftype > F_GB14)
+    if (thread == 0)
      {
-        if (interaction_function[ftype].flags & IF_BOND &&
-            !(ftype == F_CONNBONDS || ftype == F_POSRES || ftype == F_FBPOSRES))
-        {
-            ind       = interaction_function[ftype].nrnb_ind;
-            nat1      = interaction_function[ftype].nratoms+1;
-            nbonds_np = idef->il[ftype].nr_nonperturbed;
-            nbonds    = idef->il[ftype].nr - nbonds_np;
-            iatoms    = idef->il[ftype].iatoms + nbonds_np;
-            if (nbonds > 0)
-            {
-                if (!IS_LISTED_LJ_C(ftype))
-                {
-                    if (ftype == F_CMAP)
-                    {
-                        v = cmap_dihs(nbonds, iatoms,
-                                      idef->iparams, &idef->cmap_grid,
-                                      (const rvec*)x, f, fr->fshift,
-                                      pbc, g, lambda[efptFTYPE], &(dvdl[efptFTYPE]), md, fcd,
-                                      global_atom_index);
-                    }
-                    else
-                    {
-                        v =     interaction_function[ftype].ifunc(nbonds, iatoms,
-                                                                  idef->iparams,
-                                                                  (const rvec*)x, f, fr->fshift,
-                                                                  pbc, g, lambda[efptFTYPE], &dvdl[efptFTYPE],
-                                                                  md, fcd, global_atom_index);
-                    }
-                }
-                else
-                {
-                    v = do_nonbonded_listed(ftype, nbonds, iatoms,
-                                            idef->iparams,
-                                            (const rvec*)x, f, fr->fshift,
-                                            pbc, g, lambda, dvdl,
-                                            md, fr, grpp, global_atom_index);
-                }
-                if (ind != -1)
-                {
-                    inc_nrnb(nrnb, ind, nbonds/nat1);
-                }
-            }
-        }
+        inc_nrnb(nrnb, interaction_function[ftype].nrnb_ind, nbonds);
      }
+
      return v;
  }
  
@@ -3978,9 +4730,9 @@ void calc_bonds(FILE *fplog, const gmx_multisim_t *ms,
                  real *lambda,
                  const t_mdatoms *md,
                  t_fcdata *fcd, int *global_atom_index,
-                t_atomtypes *atype, gmx_genborn_t *born,
+                t_atomtypes gmx_unused *atype, gmx_genborn_t gmx_unused *born,
                  int force_flags,
-                gmx_bool bPrintSepPot, gmx_large_int_t step)
+                gmx_bool bPrintSepPot, gmx_int64_t step)
  {
      gmx_bool      bCalcEnerVir;
      int           i;
@@ -3990,6 +4742,8 @@ void calc_bonds(FILE *fplog, const gmx_multisim_t *ms,
      char          buf[22];
      int           thread;
  
+    assert(fr->nthreads == idef->nthreads);
+
      bCalcEnerVir = (force_flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY));
  
      for (i = 0; i < efptNR; i++)
@@ -4006,7 +4760,7 @@ void calc_bonds(FILE *fplog, const gmx_multisim_t *ms,
      }
      if (bPrintSepPot)
      {
-        fprintf(fplog, "Step %s: bonded V and dVdl for this node\n",
+        fprintf(fplog, "Step %s: bonded V and dVdl for this rank\n",
                  gmx_step_str(step, buf));
      }
  
@@ -4028,22 +4782,27 @@ void calc_bonds(FILE *fplog, const gmx_multisim_t *ms,
      }
      if (idef->il[F_DISRES].nr)
      {
-        calc_disres_R_6(ms, idef->il[F_DISRES].nr,
+        calc_disres_R_6(idef->il[F_DISRES].nr,
                          idef->il[F_DISRES].iatoms,
                          idef->iparams, (const rvec*)x, pbc_null,
                          fcd, hist);
+#ifdef GMX_MPI
+        if (fcd->disres.nsystems > 1)
+        {
+            gmx_sum_sim(2*fcd->disres.nres, fcd->disres.Rt_6, ms);
+        }
+#endif
      }
  
  #pragma omp parallel for num_threads(fr->nthreads) schedule(static)
      for (thread = 0; thread < fr->nthreads; thread++)
      {
-        int                ftype, nbonds, ind, nat1;
+        int                ftype;
          real              *epot, v;
          /* thread stuff */
          rvec              *ft, *fshift;
          real              *dvdlt;
          gmx_grppairener_t *grpp;
-        int                nb0, nbn;
  
          if (thread == 0)
          {
@@ -4067,17 +4826,14 @@ void calc_bonds(FILE *fplog, const gmx_multisim_t *ms,
          /* Loop over all bonded force types to calculate the bonded forces */
          for (ftype = 0; (ftype < F_NRE); ftype++)
          {
-            if (idef->il[ftype].nr > 0 &&
-                (interaction_function[ftype].flags & IF_BOND) &&
-                (ftype < F_GB12 || ftype > F_GB14) &&
-                !(ftype == F_CONNBONDS || ftype == F_POSRES))
+            if (idef->il[ftype].nr > 0 && ftype_is_bonded_potential(ftype))
              {
                  v = calc_one_bond(fplog, thread, ftype, idef, x,
-                                  ft, fshift, fr, pbc_null, g, enerd, grpp,
+                                  ft, fshift, fr, pbc_null, g, grpp,
                                    nrnb, lambda, dvdlt,
                                    md, fcd, bCalcEnerVir,
                                    global_atom_index, bPrintSepPot);
-                epot[ftype]        += v;
+                epot[ftype] += v;
              }
          }
      }
@@ -4117,12 +4873,12 @@ void calc_bonds_lambda(FILE *fplog,
                         t_fcdata *fcd,
                         int *global_atom_index)
  {
-    int           i, ftype, nbonds_np, nbonds, ind, nat;
-    real          v, dr, dr2;
+    int           i, ftype, nr_nonperturbed, nr;
+    real          v;
      real          dvdl_dum[efptNR];
-    rvec         *f, *fshift_orig;
+    rvec         *f, *fshift;
      const  t_pbc *pbc_null;
-    t_iatom      *iatom_fe;
+    t_idef        idef_fe;
  
      if (fr->bMolPBC)
      {
@@ -4133,21 +4889,43 @@ void calc_bonds_lambda(FILE *fplog,
          pbc_null = NULL;
      }
  
+    /* Copy the whole idef, so we can modify the contents locally */
+    idef_fe          = *idef;
+    idef_fe.nthreads = 1;
+    snew(idef_fe.il_thread_division, F_NRE*(idef_fe.nthreads+1));
+
+    /* We already have the forces, so we use temp buffers here */
      snew(f, fr->natoms_force);
-    /* We want to preserve the fshift array in forcerec */
-    fshift_orig = fr->fshift;
-    snew(fr->fshift, SHIFTS);
+    snew(fshift, SHIFTS);
  
-    /* Loop over all bonded force types to calculate the bonded forces */
+    /* Loop over all bonded force types to calculate the bonded energies */
      for (ftype = 0; (ftype < F_NRE); ftype++)
      {
-        v = calc_one_bond_foreign(fplog, ftype, idef, x,
-                                  f, fr, pbc_null, g, grpp, nrnb, lambda, dvdl_dum,
-                                  md, fcd, global_atom_index, FALSE);
-        epot[ftype] += v;
+        if (ftype_is_bonded_potential(ftype))
+        {
+            /* Set the work range of thread 0 to the perturbed bondeds only */
+            nr_nonperturbed                       = idef->il[ftype].nr_nonperturbed;
+            nr                                    = idef->il[ftype].nr;
+            idef_fe.il_thread_division[ftype*2+0] = nr_nonperturbed;
+            idef_fe.il_thread_division[ftype*2+1] = nr;
+
+            /* This is only to get the flop count correct */
+            idef_fe.il[ftype].nr = nr - nr_nonperturbed;
+
+            if (nr - nr_nonperturbed > 0)
+            {
+                v = calc_one_bond(fplog, 0, ftype, &idef_fe,
+                                  x, f, fshift, fr, pbc_null, g,
+                                  grpp, nrnb, lambda, dvdl_dum,
+                                  md, fcd, TRUE,
+                                  global_atom_index, FALSE);
+                epot[ftype] += v;
+            }
+        }
      }
  
-    sfree(fr->fshift);
-    fr->fshift = fshift_orig;
+    sfree(fshift);
      sfree(f);
+
+    sfree(idef_fe.il_thread_division);
  }